public class HDFSRepositoryConnector
extends org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
| Modifier and Type | Class and Description |
|---|---|
protected static class |
HDFSRepositoryConnector.BackgroundStreamThread |
protected static class |
HDFSRepositoryConnector.CheckConnectionThread |
protected class |
HDFSRepositoryConnector.GetChildrenThread |
protected static class |
HDFSRepositoryConnector.GetObjectThread |
protected static class |
HDFSRepositoryConnector.GetSessionThread |
| Modifier and Type | Field and Description |
|---|---|
static java.lang.String |
_rcsid |
protected static java.lang.String[] |
activitiesList |
protected static java.lang.String |
ACTIVITY_READ |
protected long |
lastSessionFetch |
protected java.lang.String |
nameNodeHost |
protected java.lang.String |
nameNodePort |
protected java.lang.String |
nameNodeProtocol |
protected static java.lang.String |
RELATIONSHIP_CHILD |
protected HDFSSession |
session |
protected static long |
timeToRelease |
protected java.lang.String |
user |
currentContext, params
GLOBAL_DENY_TOKEN, JOBMODE_CONTINUOUS, JOBMODE_ONCEONLY, MODEL_ADD, MODEL_ADD_CHANGE, MODEL_ADD_CHANGE_DELETE, MODEL_ALL, MODEL_CHAINED_ADD, MODEL_CHAINED_ADD_CHANGE, MODEL_CHAINED_ADD_CHANGE_DELETE, MODEL_PARTIAL
| Constructor and Description |
|---|
HDFSRepositoryConnector() |
| Modifier and Type | Method and Description |
|---|---|
java.lang.String |
addSeedDocuments(org.apache.manifoldcf.crawler.interfaces.ISeedingActivity activities,
org.apache.manifoldcf.core.interfaces.Specification spec,
java.lang.String lastSeedVersion,
long seedTime,
int jobMode)
Queue "seed" documents.
|
java.lang.String |
check()
Test the connection.
|
protected void |
checkConnection() |
protected static boolean |
checkInclude(java.lang.String nameNode,
org.apache.hadoop.fs.FileStatus fileStatus,
java.lang.String fileName,
org.apache.manifoldcf.core.interfaces.Specification documentSpecification)
Check if a file or directory should be included, given a document specification.
|
protected static boolean |
checkIngest(java.lang.String nameNode,
org.apache.hadoop.fs.FileStatus fileStatus,
org.apache.manifoldcf.core.interfaces.Specification documentSpecification)
Check if a file should be ingested, given a document specification.
|
protected static boolean |
checkMatch(java.lang.String sourceMatch,
int sourceIndex,
java.lang.String match)
Check a match between two strings with wildcards.
|
protected void |
closeSession() |
void |
connect(org.apache.manifoldcf.core.interfaces.ConfigParams configParams) |
protected static java.lang.String |
convertToWGETURI(java.lang.String path)
Convert a path to an HDFS wget URI.
|
void |
disconnect() |
protected static java.lang.String |
findConvertPath(java.lang.String nameNode,
org.apache.manifoldcf.core.interfaces.Specification spec,
org.apache.hadoop.fs.Path theFile)
This method finds the part of the path that should be converted to a URI.
|
java.lang.String[] |
getActivitiesList()
List the activities we might report on.
|
java.lang.String[] |
getBinNames(java.lang.String documentIdentifier)
For any given document, list the bins that it is a member of.
|
protected org.apache.hadoop.fs.FileStatus[] |
getChildren(org.apache.hadoop.fs.Path path) |
int |
getConnectorModel()
Tell the world what model this connector uses for getDocumentIdentifiers().
|
int |
getMaxDocumentRequest()
Get the maximum number of documents to amalgamate together into one
batch, for this connector.
|
protected org.apache.hadoop.fs.FileStatus |
getObject(org.apache.hadoop.fs.Path path) |
java.lang.String[] |
getRelationshipTypes()
Return the list of relationship types that this connector recognizes.
|
protected HDFSSession |
getSession()
Set up a session.
|
boolean |
isConnected()
This method is called to assess whether this connector instance should
actually be counted as being connected.
|
protected static java.lang.String |
mapExtensionToMimeType(java.lang.String fileName)
Map an extension to a MIME type.
|
protected static int |
matchSubPath(java.lang.String subPath,
java.lang.String fullPath)
Match a sub-path.
|
void |
outputConfigurationBody(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.lang.String tabName)
Output the configuration body section.
|
void |
outputConfigurationHeader(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.util.List<java.lang.String> tabsArray)
Output the configuration header section.
|
void |
outputSpecificationBody(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber,
int actualSequenceNumber,
java.lang.String tabName)
Output the specification body section.
|
void |
outputSpecificationHeader(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber,
java.util.List<java.lang.String> tabsArray)
Output the specification header section.
|
void |
poll() |
protected static boolean |
processCheck(boolean caseSensitive,
java.lang.String sourceMatch,
int sourceIndex,
java.lang.String match,
int matchIndex)
Recursive worker method for checkMatch.
|
java.lang.String |
processConfigurationPost(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
Process a configuration post.
|
void |
processDocuments(java.lang.String[] documentIdentifiers,
org.apache.manifoldcf.crawler.interfaces.IExistingVersions statuses,
org.apache.manifoldcf.core.interfaces.Specification spec,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
int jobMode,
boolean usesDefaultAuthority)
Process a set of documents.
|
java.lang.String |
processSpecificationPost(org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber)
Process a specification post.
|
void |
viewConfiguration(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
View configuration.
|
void |
viewSpecification(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber)
View specification.
|
getFormCheckJavascriptMethodName, getFormPresaveCheckJavascriptMethodName, requestInfo
clearThreadContext, deinstall, getConfiguration, install, outputConfigurationBody, outputConfigurationHeader, outputConfigurationHeader, pack, packFixedList, packList, packList, processConfigurationPost, setThreadContext, unpack, unpackFixedList, unpackList, viewConfiguration
public static final java.lang.String _rcsid
protected static final java.lang.String ACTIVITY_READ
protected static final java.lang.String RELATIONSHIP_CHILD
protected static final java.lang.String[] activitiesList
protected java.lang.String nameNodeProtocol
protected java.lang.String nameNodeHost
protected java.lang.String nameNodePort
protected java.lang.String user
protected HDFSSession session
protected long lastSessionFetch
protected static final long timeToRelease
public int getConnectorModel()
getConnectorModel in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
getConnectorModel in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
public java.lang.String[] getRelationshipTypes()
getRelationshipTypes in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
getRelationshipTypes in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
public java.lang.String[] getActivitiesList()
getActivitiesList in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
getActivitiesList in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
public java.lang.String[] getBinNames(java.lang.String documentIdentifier)
getBinNames in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
getBinNames in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
public int getMaxDocumentRequest()
getMaxDocumentRequest in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
getMaxDocumentRequest in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
public void connect(org.apache.manifoldcf.core.interfaces.ConfigParams configParams)
connect in interface org.apache.manifoldcf.core.interfaces.IConnector
connect in class org.apache.manifoldcf.core.connector.BaseConnector
public void disconnect()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
disconnect in interface org.apache.manifoldcf.core.interfaces.IConnector
disconnect in class org.apache.manifoldcf.core.connector.BaseConnector
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
protected HDFSSession getSession() throws org.apache.manifoldcf.core.interfaces.ManifoldCFException, org.apache.manifoldcf.agents.interfaces.ServiceInterruption
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: org.apache.manifoldcf.agents.interfaces.ServiceInterruption
public java.lang.String check()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
check in interface org.apache.manifoldcf.core.interfaces.IConnector
check in class org.apache.manifoldcf.core.connector.BaseConnector
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
public boolean isConnected()
isConnected in interface org.apache.manifoldcf.core.interfaces.IConnector
isConnected in class org.apache.manifoldcf.core.connector.BaseConnector
public void poll()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
poll in interface org.apache.manifoldcf.core.interfaces.IConnector
poll in class org.apache.manifoldcf.core.connector.BaseConnector
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
protected void closeSession()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
public java.lang.String addSeedDocuments(org.apache.manifoldcf.crawler.interfaces.ISeedingActivity activities,
org.apache.manifoldcf.core.interfaces.Specification spec,
java.lang.String lastSeedVersion,
long seedTime,
int jobMode)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
addSeedDocuments in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
addSeedDocuments in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
activities - is the interface this method should use to perform whatever framework actions are desired.
spec - is a document specification (that comes from the job).
seedTime - is the end of the time range of documents to consider, exclusive.
lastSeedVersion - is the last seeding version string for this job, or null if the job has no previous seeding version string.
jobMode - is an integer describing how the job is being run, whether continuous or once-only.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: org.apache.manifoldcf.agents.interfaces.ServiceInterruption
public void processDocuments(java.lang.String[] documentIdentifiers,
org.apache.manifoldcf.crawler.interfaces.IExistingVersions statuses,
org.apache.manifoldcf.core.interfaces.Specification spec,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
int jobMode,
boolean usesDefaultAuthority)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
processDocuments in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
processDocuments in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
documentIdentifiers - is the set of document identifiers to process.
statuses - are the currently-stored document versions for each document in the set of document identifiers passed in above.
activities - is the interface this method should use to queue up new document references and ingest documents.
jobMode - is an integer describing how the job is being run, whether continuous or once-only.
usesDefaultAuthority - will be true only if the authority in use for these documents is the default one.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: org.apache.manifoldcf.agents.interfaces.ServiceInterruption
public void outputConfigurationHeader(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.util.List<java.lang.String> tabsArray)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputConfigurationHeader in interface org.apache.manifoldcf.core.interfaces.IConnector
outputConfigurationHeader in class org.apache.manifoldcf.core.connector.BaseConnector
threadContext - is the local thread context.
out - is the output to which any HTML should be sent.
parameters - are the configuration parameters, as they currently exist, for this connection being configured.
tabsArray - is an array of tab names. Add to this array any tab names that are specific to the connector.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: java.io.IOException
public void outputConfigurationBody(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.lang.String tabName)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputConfigurationBody in interface org.apache.manifoldcf.core.interfaces.IConnector
outputConfigurationBody in class org.apache.manifoldcf.core.connector.BaseConnector
threadContext - is the local thread context.
out - is the output to which any HTML should be sent.
parameters - are the configuration parameters, as they currently exist, for this connection being configured.
tabName - is the current tab name.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: java.io.IOException
public java.lang.String processConfigurationPost(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
processConfigurationPost in class org.apache.manifoldcf.core.connector.BaseConnector
threadContext - is the local thread context.
variableContext - is the set of variables available from the post, including binary file post information.
parameters - are the configuration parameters, as they currently exist, for this connection being configured.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
public void viewConfiguration(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
viewConfiguration in interface org.apache.manifoldcf.core.interfaces.IConnector
viewConfiguration in class org.apache.manifoldcf.core.connector.BaseConnector
threadContext - is the local thread context.
out - is the output to which any HTML should be sent.
parameters - are the configuration parameters, as they currently exist, for this connection being configured.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: java.io.IOException
public void outputSpecificationHeader(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber,
java.util.List<java.lang.String> tabsArray)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputSpecificationHeader in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
outputSpecificationHeader in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
out - is the output to which any HTML should be sent.
locale - is the locale the output is preferred to be in.
ds - is the current document specification for this job.
connectionSequenceNumber - is the unique number of this connection within the job.
tabsArray - is an array of tab names. Add to this array any tab names that are specific to the connector.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: java.io.IOException
public void outputSpecificationBody(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber,
int actualSequenceNumber,
java.lang.String tabName)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputSpecificationBody in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
outputSpecificationBody in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
out - is the output to which any HTML should be sent.
locale - is the locale the output is preferred to be in.
ds - is the current document specification for this job.
connectionSequenceNumber - is the unique number of this connection within the job.
actualSequenceNumber - is the connection within the job that has currently been selected.
tabName - is the current tab name. (actualSequenceNumber, tabName) form a unique tuple within the job.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: java.io.IOException
public java.lang.String processSpecificationPost(org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
processSpecificationPost in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
processSpecificationPost in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
variableContext - contains the post data, including binary file-upload information.
locale - is the locale the output is preferred to be in.
ds - is the current document specification for this job.
connectionSequenceNumber - is the unique number of this connection within the job.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
public void viewSpecification(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
viewSpecification in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
viewSpecification in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
out - is the output to which any HTML should be sent.
locale - is the locale the output is preferred to be in.
ds - is the current document specification for this job.
connectionSequenceNumber - is the unique number of this connection within the job.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: java.io.IOException
protected static java.lang.String convertToWGETURI(java.lang.String path)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
path - is the document filePath.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
protected static java.lang.String findConvertPath(java.lang.String nameNode,
org.apache.manifoldcf.core.interfaces.Specification spec,
org.apache.hadoop.fs.Path theFile)
spec - is the document specification.
protected static java.lang.String mapExtensionToMimeType(java.lang.String fileName)
protected static boolean checkInclude(java.lang.String nameNode,
org.apache.hadoop.fs.FileStatus fileStatus,
java.lang.String fileName,
org.apache.manifoldcf.core.interfaces.Specification documentSpecification)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
fileName - is the canonical file name.
documentSpecification - is the specification.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
protected static boolean checkIngest(java.lang.String nameNode,
org.apache.hadoop.fs.FileStatus fileStatus,
org.apache.manifoldcf.core.interfaces.Specification documentSpecification)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
documentSpecification - is the specification.
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
protected static int matchSubPath(java.lang.String subPath,
java.lang.String fullPath)
subPath - is the sub path.
fullPath - is the full path.
protected static boolean checkMatch(java.lang.String sourceMatch,
int sourceIndex,
java.lang.String match)
sourceMatch - is the expanded string (no wildcards).
sourceIndex - is the starting point in the expanded string.
match - is the wildcard-based string.
protected static boolean processCheck(boolean caseSensitive,
java.lang.String sourceMatch,
int sourceIndex,
java.lang.String match,
int matchIndex)
caseSensitive - is true if file names are case sensitive.
sourceMatch - is the source string (w/o wildcards).
sourceIndex - is the current point in the source string.
match - is the match string (w/wildcards).
matchIndex - is the current point in the match string.
protected void checkConnection()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: org.apache.manifoldcf.agents.interfaces.ServiceInterruption
protected org.apache.hadoop.fs.FileStatus[] getChildren(org.apache.hadoop.fs.Path path)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: org.apache.manifoldcf.agents.interfaces.ServiceInterruption
protected org.apache.hadoop.fs.FileStatus getObject(org.apache.hadoop.fs.Path path)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
Throws: org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws: org.apache.manifoldcf.agents.interfaces.ServiceInterruption