public class TikaExtractor
extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
| Modifier and Type | Class and Description |
|---|---|
protected static interface |
TikaExtractor.DestinationStorage |
protected static class |
TikaExtractor.FileDestinationStorage |
protected static class |
TikaExtractor.MemoryDestinationStorage |
protected static class |
TikaExtractor.SpecPacker |
| Modifier and Type | Field and Description |
|---|---|
static java.lang.String |
_rcsid |
protected static java.lang.String[] |
activitiesList |
protected static java.lang.String |
ACTIVITY_EXTRACT |
protected static java.net.URI |
checkURI
Check URI
|
protected static java.net.URI |
contentURI
Content URI
|
protected static long |
inMemoryMaximumFile
We handle up to 64K in memory; after that we go to disk.
|
protected static java.net.URI |
metaURI
Metadata URI
|
protected static long |
sessionExpirationInterval |
| Constructor and Description |
|---|
TikaExtractor() |
| Modifier and Type | Method and Description |
|---|---|
int |
addOrReplaceDocumentWithException(java.lang.String documentURI,
org.apache.manifoldcf.core.interfaces.VersionContext pipelineDescription,
org.apache.manifoldcf.agents.interfaces.RepositoryDocument document,
java.lang.String authorityNameString,
org.apache.manifoldcf.agents.interfaces.IOutputAddActivity activities)
Add (or replace) a document in the output data store using the connector.
|
java.lang.String |
check()
Test the connection.
|
boolean |
checkDocumentIndexable(org.apache.manifoldcf.core.interfaces.VersionContext pipelineDescription,
java.io.File localFile,
org.apache.manifoldcf.agents.interfaces.IOutputCheckActivity checkActivity)
Pre-determine whether a document (passed here as a File object) is
acceptable or not.
|
boolean |
checkLengthIndexable(org.apache.manifoldcf.core.interfaces.VersionContext pipelineDescription,
long length,
org.apache.manifoldcf.agents.interfaces.IOutputCheckActivity checkActivity)
Pre-determine whether a document's length is acceptable.
|
boolean |
checkMimeTypeIndexable(org.apache.manifoldcf.core.interfaces.VersionContext pipelineDescription,
java.lang.String mimeType,
org.apache.manifoldcf.agents.interfaces.IOutputCheckActivity checkActivity)
Detect if a mime type is acceptable or not.
|
void |
connect(org.apache.manifoldcf.core.interfaces.ConfigParams configParameters)
Connect.
|
void |
disconnect()
Close the connection.
|
protected void |
expireSession()
Expire the current session
|
protected static void |
fillInFieldMappingSpecificationMap(java.util.Map<java.lang.String,java.lang.Object> paramMap,
org.apache.manifoldcf.core.interfaces.Specification os) |
protected static void |
fillInServerTab(java.util.Map<java.lang.String,java.lang.Object> velocityContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters) |
java.lang.String[] |
getActivitiesList()
Return a list of activities that this connector generates.
|
java.lang.String |
getFormCheckJavascriptMethodName(int connectionSequenceNumber)
Obtain the name of the form check javascript method to call.
|
java.lang.String |
getFormPresaveCheckJavascriptMethodName(int connectionSequenceNumber)
Obtain the name of the form presave check javascript method to call.
|
org.apache.manifoldcf.core.interfaces.VersionContext |
getPipelineDescription(org.apache.manifoldcf.core.interfaces.Specification os)
Get an output version string, given an output specification.
|
protected void |
getSession()
Set up a session
|
protected static int |
handleIOException(java.io.IOException e) |
protected static int |
handleTikaServerError(java.lang.String description) |
protected static int |
handleTikaServerException(java.io.IOException e) |
protected static int |
handleTikaServerException(org.json.simple.parser.ParseException e) |
protected static int |
handleTikaServerRejects(java.lang.String reason) |
boolean |
isConnected()
This method is called to assess whether this connector instance should
actually be counted as being connected.
|
void |
outputConfigurationBody(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.lang.String tabName)
Output the configuration body section.
|
void |
outputConfigurationHeader(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.util.List<java.lang.String> tabsArray)
Output the configuration header section.
|
void |
outputSpecificationBody(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification os,
int connectionSequenceNumber,
int actualSequenceNumber,
java.lang.String tabName)
Output the specification body section.
|
void |
outputSpecificationHeader(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification os,
int connectionSequenceNumber,
java.util.List<java.lang.String> tabsArray)
Output the specification header section.
|
void |
poll()
This method is periodically called for all connectors that are connected but not
in active use.
|
java.lang.String |
processConfigurationPost(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
Process a configuration post.
|
java.lang.String |
processSpecificationPost(org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification os,
int connectionSequenceNumber)
Process a specification post.
|
void |
viewConfiguration(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
View configuration.
|
void |
viewSpecification(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification os,
int connectionSequenceNumber)
View specification.
|
checkDateIndexable, checkURLIndexable, requestInfo
clearThreadContext, deinstall, getConfiguration, install, outputConfigurationBody, outputConfigurationHeader, outputConfigurationHeader, pack, packFixedList, packList, packList, processConfigurationPost, setThreadContext, unpack, unpackFixedList, unpackList, viewConfiguration
public static final java.lang.String _rcsid
protected static final java.lang.String ACTIVITY_EXTRACT
protected static final java.lang.String[] activitiesList
protected static final long sessionExpirationInterval
protected static final long inMemoryMaximumFile
protected static final java.net.URI metaURI
protected static final java.net.URI contentURI
protected static final java.net.URI checkURI
public void connect(org.apache.manifoldcf.core.interfaces.ConfigParams configParameters)
connect in interface org.apache.manifoldcf.core.interfaces.IConnectorconnect in class org.apache.manifoldcf.core.connector.BaseConnectorconfigParameters - is the set of configuration parameters, which
in this case describe the root directory.public void disconnect()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
disconnect in interface org.apache.manifoldcf.core.interfaces.IConnectordisconnect in class org.apache.manifoldcf.core.connector.BaseConnectororg.apache.manifoldcf.core.interfaces.ManifoldCFExceptionpublic void poll()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
poll in interface org.apache.manifoldcf.core.interfaces.IConnectorpoll in class org.apache.manifoldcf.core.connector.BaseConnectororg.apache.manifoldcf.core.interfaces.ManifoldCFExceptionpublic boolean isConnected()
isConnected in interface org.apache.manifoldcf.core.interfaces.IConnectorisConnected in class org.apache.manifoldcf.core.connector.BaseConnectorprotected void getSession()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionprotected void expireSession()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionpublic java.lang.String check()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
check in interface org.apache.manifoldcf.core.interfaces.IConnectorcheck in class org.apache.manifoldcf.core.connector.BaseConnectororg.apache.manifoldcf.core.interfaces.ManifoldCFExceptionpublic java.lang.String[] getActivitiesList()
getActivitiesList in interface org.apache.manifoldcf.agents.interfaces.ITransformationConnectorgetActivitiesList in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnectorpublic void outputConfigurationHeader(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.util.List<java.lang.String> tabsArray)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputConfigurationHeader in interface org.apache.manifoldcf.core.interfaces.IConnectoroutputConfigurationHeader in class org.apache.manifoldcf.core.connector.BaseConnectorthreadContext - is the local thread context.out - is the output to which any HTML should be sent.parameters - are the configuration parameters, as they currently exist, for this connection being configured.tabsArray - is an array of tab names. Add to this array any tab names that are specific to the connector.org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionjava.io.IOExceptionpublic void outputConfigurationBody(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.lang.String tabName)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputConfigurationBody in interface org.apache.manifoldcf.core.interfaces.IConnectoroutputConfigurationBody in class org.apache.manifoldcf.core.connector.BaseConnectorthreadContext - is the local thread context.out - is the output to which any HTML should be sent.parameters - are the configuration parameters, as they currently exist, for this connection being configured.tabName - is the current tab name.org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionjava.io.IOExceptionpublic java.lang.String processConfigurationPost(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
processConfigurationPost in interface org.apache.manifoldcf.core.interfaces.IConnectorprocessConfigurationPost in class org.apache.manifoldcf.core.connector.BaseConnectorthreadContext - is the local thread context.variableContext - is the set of variables available from the post, including binary file post information.parameters - are the configuration parameters, as they currently exist, for this connection being configured.org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionpublic void viewConfiguration(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
viewConfiguration in interface org.apache.manifoldcf.core.interfaces.IConnectorviewConfiguration in class org.apache.manifoldcf.core.connector.BaseConnectorthreadContext - is the local thread context.out - is the output to which any HTML should be sent.parameters - are the configuration parameters, as they currently exist, for this connection being configured.org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionjava.io.IOExceptionprotected static void fillInServerTab(java.util.Map<java.lang.String,java.lang.Object> velocityContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionpublic org.apache.manifoldcf.core.interfaces.VersionContext getPipelineDescription(org.apache.manifoldcf.core.interfaces.Specification os)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
getPipelineDescription in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnectorgetPipelineDescription in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnectoros - is the current output specification for the job that is doing the
crawling.org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionorg.apache.manifoldcf.agents.interfaces.ServiceInterruptionpublic boolean checkMimeTypeIndexable(org.apache.manifoldcf.core.interfaces.VersionContext pipelineDescription,
java.lang.String mimeType,
org.apache.manifoldcf.agents.interfaces.IOutputCheckActivity checkActivity)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
checkMimeTypeIndexable in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnectorcheckMimeTypeIndexable in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnectorpipelineDescription - is the document's pipeline version string, for this connection.mimeType - is the mime type of the document.checkActivity - is an object including the activities that can be performed by
this method.org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionorg.apache.manifoldcf.agents.interfaces.ServiceInterruptionpublic boolean checkDocumentIndexable(org.apache.manifoldcf.core.interfaces.VersionContext pipelineDescription,
java.io.File localFile,
org.apache.manifoldcf.agents.interfaces.IOutputCheckActivity checkActivity)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
checkDocumentIndexable in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnectorcheckDocumentIndexable in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnectorpipelineDescription - is the document's pipeline version string, for this connection.localFile - is the local file to check.checkActivity - is an object including the activities that can be done by this
method.org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionorg.apache.manifoldcf.agents.interfaces.ServiceInterruptionpublic boolean checkLengthIndexable(org.apache.manifoldcf.core.interfaces.VersionContext pipelineDescription,
long length,
org.apache.manifoldcf.agents.interfaces.IOutputCheckActivity checkActivity)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
checkLengthIndexable in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnectorcheckLengthIndexable in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnectorpipelineDescription - is the document's pipeline version string, for this connection.length - is the length of the document.checkActivity - is an object including the activities that can be done by this
method.org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionorg.apache.manifoldcf.agents.interfaces.ServiceInterruptionpublic int addOrReplaceDocumentWithException(java.lang.String documentURI,
org.apache.manifoldcf.core.interfaces.VersionContext pipelineDescription,
org.apache.manifoldcf.agents.interfaces.RepositoryDocument document,
java.lang.String authorityNameString,
org.apache.manifoldcf.agents.interfaces.IOutputAddActivity activities)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption,
java.io.IOException
addOrReplaceDocumentWithException in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnectoraddOrReplaceDocumentWithException in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnectordocumentURI - is the URI of the document. The URI is presumed to be the unique
identifier which the output data store will use to process and
serve the document. This URI is constructed by the repository
connector which fetches the document, and is thus universal across
all output connectors.
pipelineDescription - is the description string that was constructed for this document
by the getPipelineDescription() method.
document - is the document data to be processed (handed to the output data
store).authorityNameString - is the name of the authority responsible for authorizing any
access tokens passed in with the repository document. May be null.activities - is the handle to an object that the implementer of a pipeline
connector may use to perform operations, such as logging
processing activity, or sending a modified document to the next
stage in the pipeline.java.io.IOException - only if there's a stream error reading the document data.org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionorg.apache.manifoldcf.agents.interfaces.ServiceInterruptionpublic java.lang.String getFormCheckJavascriptMethodName(int connectionSequenceNumber)
getFormCheckJavascriptMethodName in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnectorgetFormCheckJavascriptMethodName in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnectorconnectionSequenceNumber - is the unique number of this connection within the job.public java.lang.String getFormPresaveCheckJavascriptMethodName(int connectionSequenceNumber)
getFormPresaveCheckJavascriptMethodName in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnectorgetFormPresaveCheckJavascriptMethodName in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnectorconnectionSequenceNumber - is the unique number of this connection within the job.public void outputSpecificationHeader(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification os,
int connectionSequenceNumber,
java.util.List<java.lang.String> tabsArray)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputSpecificationHeader in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnector
outputSpecificationHeader in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
out - is the output to which any HTML should be sent.
locale - is the preferred locale of the output.
os - is the current pipeline specification for this connection.
connectionSequenceNumber - is the unique number of this connection within the job.
tabsArray - is an array of tab names. Add to this array any tab names that are
specific to the connector.org.apache.manifoldcf.core.interfaces.ManifoldCFExceptionjava.io.IOExceptionpublic void outputSpecificationBody(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification os,
int connectionSequenceNumber,
int actualSequenceNumber,
java.lang.String tabName)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputSpecificationBody in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnector
outputSpecificationBody in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
out - is the output to which any HTML should be sent.
locale - is the preferred locale of the output.
os - is the current pipeline specification for this job.
connectionSequenceNumber - is the unique number of this connection within the job.
actualSequenceNumber - is the connection within the job that has currently been selected.
tabName - is the current tab name.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
java.io.IOException
public java.lang.String processSpecificationPost(org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification os,
int connectionSequenceNumber)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
processSpecificationPost in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnector
processSpecificationPost in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
variableContext - contains the post data, including binary file-upload information.
locale - is the preferred locale of the output.
os - is the current pipeline specification for this job.
connectionSequenceNumber - is the unique number of this connection within the job.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public void viewSpecification(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification os,
int connectionSequenceNumber)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
viewSpecification in interface org.apache.manifoldcf.agents.interfaces.IPipelineConnector
viewSpecification in class org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
out - is the output to which any HTML should be sent.
locale - is the preferred locale of the output.
connectionSequenceNumber - is the unique number of this connection within the job.
os - is the current pipeline specification for this job.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
java.io.IOException
protected static void fillInFieldMappingSpecificationMap(java.util.Map<java.lang.String,java.lang.Object> paramMap,
org.apache.manifoldcf.core.interfaces.Specification os)
protected static int handleTikaServerRejects(java.lang.String reason)
throws java.io.IOException,
org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
java.io.IOExceptionorg.apache.manifoldcf.core.interfaces.ManifoldCFExceptionorg.apache.manifoldcf.agents.interfaces.ServiceInterruptionprotected static int handleTikaServerError(java.lang.String description)
throws java.io.IOException,
org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
java.io.IOExceptionorg.apache.manifoldcf.core.interfaces.ManifoldCFExceptionorg.apache.manifoldcf.agents.interfaces.ServiceInterruptionprotected static int handleTikaServerException(java.io.IOException e)
throws java.io.IOException,
org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
java.io.IOExceptionorg.apache.manifoldcf.core.interfaces.ManifoldCFExceptionorg.apache.manifoldcf.agents.interfaces.ServiceInterruptionprotected static int handleTikaServerException(org.json.simple.parser.ParseException e)
throws java.io.IOException,
org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
java.io.IOExceptionorg.apache.manifoldcf.core.interfaces.ManifoldCFExceptionorg.apache.manifoldcf.agents.interfaces.ServiceInterruptionprotected static int handleIOException(java.io.IOException e)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFException