public class WebcrawlerConnector
extends org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
| Modifier and Type | Class and Description |
|---|---|
protected static class |
WebcrawlerConnector.CanonicalizationPolicies
Class representing a list of canonicalization rules
|
protected static class |
WebcrawlerConnector.CanonicalizationPolicy
Class representing a URL regular expression match, for the purposes of determining canonicalization policy
|
protected class |
WebcrawlerConnector.DocumentURLFilter
This class describes the url filtering information (for crawling and indexing) obtained from a digested DocumentSpecification.
|
protected static class |
WebcrawlerConnector.EvaluatorToken
Evaluator token.
|
protected static class |
WebcrawlerConnector.EvaluatorTokenStream
Token stream.
|
protected class |
WebcrawlerConnector.FeedContextClass |
protected class |
WebcrawlerConnector.FeedItemContextClass |
protected static class |
WebcrawlerConnector.FetchStatus |
protected static class |
WebcrawlerConnector.MappingRule
Class representing a mapping rule
|
protected static class |
WebcrawlerConnector.MappingRules
Class that represents all mappings
|
protected static class |
WebcrawlerConnector.NameValue
Name/value class
|
protected class |
WebcrawlerConnector.OuterContextClass
This class handles the outermost XML context for the feed document.
|
protected class |
WebcrawlerConnector.ProcessActivityHTMLHandler
Class that describes HTML handling
|
protected class |
WebcrawlerConnector.ProcessActivityLinkHandler
This class is the handler for links that get added into an IProcessActivity object.
|
protected class |
WebcrawlerConnector.ProcessActivityRedirectionHandler
Class that describes redirection handling
|
protected class |
WebcrawlerConnector.ProcessActivityXMLHandler
Class that describes XML handling
|
protected class |
WebcrawlerConnector.RDFContextClass |
protected class |
WebcrawlerConnector.RDFItemContextClass |
protected class |
WebcrawlerConnector.RSSChannelContextClass |
protected class |
WebcrawlerConnector.RSSContextClass |
protected class |
WebcrawlerConnector.RSSItemContextClass |
protected class |
WebcrawlerConnector.UrlsetContextClass |
protected class |
WebcrawlerConnector.UrlsetItemContextClass |
| Modifier and Type | Field and Description |
|---|---|
static java.lang.String |
_rcsid |
static java.lang.String |
ACTIVITY_FETCH |
static java.lang.String |
ACTIVITY_LOGON_END |
static java.lang.String |
ACTIVITY_LOGON_START |
static java.lang.String |
ACTIVITY_PROCESS |
static java.lang.String |
ACTIVITY_ROBOTSPARSE |
protected static DataCache |
cache
This is where we keep data around between the getVersions() phase and the processDocuments() phase.
|
protected int |
connectionTimeoutMilliseconds
Connection timeout, milliseconds.
|
protected CookieManager |
cookieManager
The cookie manager used by this instance
|
protected CredentialsDescription |
credentialsDescription
The credentials description
|
protected DNSManager |
dnsManager
The DNS manager currently used by this instance
|
protected static java.lang.String |
FETCH_LOGIN |
protected static java.lang.String |
FETCH_ROBOTS |
protected static java.lang.String |
FETCH_STANDARD |
protected java.lang.String |
from
The email address for this connector instance
|
protected static java.lang.String[] |
interestingMimeTypeArray
This represents a list of the mime types that this connector knows how to extract links from.
|
protected static java.util.Set<java.lang.String> |
interestingMimeTypeMap |
protected boolean |
isInitialized
This flag is set when the instance has been initialized
|
protected static int |
META_ROBOTS_ALL |
protected static int |
META_ROBOTS_NONE |
protected int |
metaRobotsTagsUsage
Meta robots tag usage flag
|
protected static java.util.List<java.lang.String> |
potentiallyExcludedHeaders |
protected java.lang.String |
proxyAuthDomain
Proxy auth domain
|
protected java.lang.String |
proxyAuthPassword
Proxy auth password
|
protected java.lang.String |
proxyAuthUsername
Proxy auth user name
|
protected java.lang.String |
proxyHost
Proxy host
|
protected int |
proxyPort
Proxy port
|
static java.lang.String |
REL_LINK |
static java.lang.String |
REL_REDIRECT |
protected static java.util.Set<java.lang.String> |
reservedHeaders |
protected static int |
RESULT_NO_DOCUMENT |
protected static int |
RESULT_NO_VERSION |
protected static int |
RESULT_RETRY_DOCUMENT |
protected static int |
RESULT_VERSION_NEEDED |
protected static int |
RESULTSTATUS_FALSE |
protected static int |
RESULTSTATUS_NOTYETDETERMINED |
protected static int |
RESULTSTATUS_TRUE |
protected static int |
ROBOTS_ALL |
protected static int |
ROBOTS_DATA |
protected static int |
ROBOTS_NONE |
protected RobotsManager |
robotsManager
The robots manager currently used by this instance
|
protected int |
robotsUsage
Robots usage flag
|
protected static int |
SESSIONSTATE_LOGIN
We're in 'login mode'
|
protected static int |
SESSIONSTATE_NORMAL
Normal fetch of content document.
|
protected int |
socketTimeoutMilliseconds
Socket timeout, milliseconds
|
protected ThrottleDescription |
throttleDescription
The throttle description
|
protected java.lang.String |
throttleGroupName
Throttle group name
|
protected TrustsDescription |
trustsDescription
The trusts description
|
protected static java.util.Set<java.lang.String> |
understoodProtocols |
protected java.lang.String |
userAgent
The user-agent for this connector instance
|
currentContext, params
GLOBAL_DENY_TOKEN, JOBMODE_CONTINUOUS, JOBMODE_ONCEONLY, MODEL_ADD, MODEL_ADD_CHANGE, MODEL_ADD_CHANGE_DELETE, MODEL_ALL, MODEL_CHAINED_ADD, MODEL_CHAINED_ADD_CHANGE, MODEL_CHAINED_ADD_CHANGE_DELETE, MODEL_PARTIAL
| Constructor and Description |
|---|
WebcrawlerConnector()
Constructor.
|
| Modifier and Type | Method and Description |
|---|---|
java.lang.String |
addSeedDocuments(org.apache.manifoldcf.crawler.interfaces.ISeedingActivity activities,
org.apache.manifoldcf.core.interfaces.Specification spec,
java.lang.String lastSeedVersion,
long seedTime,
int jobMode)
Queue "seed" documents.
|
protected java.lang.String[] |
calculateDocumentEvents(org.apache.manifoldcf.crawler.interfaces.INamingActivity activities,
java.lang.String documentIdentifier)
Calculate events that should be associated with a document.
|
java.lang.String |
check()
Check status of connection.
|
protected int |
checkFetchAllowed(java.lang.String documentIdentifier,
java.lang.String protocol,
java.lang.String hostIPAddress,
int port,
PageCredentials credential,
org.apache.manifoldcf.connectorcommon.interfaces.IKeystoreManager trustStore,
java.lang.String hostName,
java.lang.String[] binNames,
long currentTime,
java.lang.String pathString,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity versionActivities,
int connectionLimit,
java.lang.String proxyHost,
int proxyPort,
java.lang.String proxyAuthDomain,
java.lang.String proxyAuthUsername,
java.lang.String proxyAuthPassword)
Check robots to see if fetch is allowed.
|
void |
clearThreadContext()
Clear out any state information specific to a given thread.
|
protected static void |
compileList(java.util.List<java.util.regex.Pattern> output,
java.util.List<java.lang.String> input)
Compile all regexp entries in the passed in list, and add them to the output
list.
|
void |
deinstall(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext)
Uninstall the connector.
|
void |
disconnect()
Close the connection.
|
protected java.lang.String |
doCanonicalization(WebcrawlerConnector.DocumentURLFilter filter,
WebURL url)
Code to canonicalize a URL.
|
protected java.lang.String |
documentIdentifiertoFileName(java.lang.String documentIdentifier)
Convert a document identifier to filename.
|
protected static java.lang.String |
extractContentType(java.lang.String contentType) |
protected static java.lang.String |
extractEncoding(java.lang.String contentType) |
protected boolean |
extractLinks(java.lang.String documentIdentifier,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
WebcrawlerConnector.DocumentURLFilter filter)
Code to extract links from an already-fetched document.
|
protected static java.lang.String |
extractMimeType(java.lang.String contentType) |
protected static java.util.Set<java.lang.String> |
findExcludedHeaders(org.apache.manifoldcf.core.interfaces.Specification spec)
Read a document specification to get a set of excluded headers
|
protected FormData |
findHTMLForm(java.lang.String currentURI,
LoginParameters lp)
Find matching HTML form data, if present.
|
protected java.lang.String |
findHTMLLinkURI(java.lang.String currentURI,
LoginParameters lp)
Find HTML link URI, if present, making sure specified preference is matched.
|
protected java.lang.String |
findPreferredRedirectionURI(java.lang.String currentURI,
LoginParameters lp)
Find a preferred redirection URI, if it exists
|
protected java.lang.String |
findRedirectionURI(java.lang.String currentURI)
Find a redirection URI, if it exists
|
protected java.lang.String |
findSpecifiedContent(java.lang.String currentURI,
LoginParameters lp)
Find existence of specific content on the page (never finds a URL)
|
protected static java.lang.String[] |
getAcls(org.apache.manifoldcf.core.interfaces.Specification spec)
Grab forced acl out of document specification.
|
java.lang.String[] |
getActivitiesList()
Return the list of activities that this connector supports (i.e. writes into the log).
|
java.lang.String[] |
getBinNames(java.lang.String documentIdentifier)
Get the bin name string for a document identifier.
|
int |
getConnectorModel()
Tell the world what model this connector uses for getDocumentIdentifiers().
|
int |
getMaxDocumentRequest()
Get the maximum number of documents to amalgamate together into one batch, for this connector.
|
protected PageCredentials |
getPageCredential(java.lang.String documentIdentifier)
Get the page credentials for a given document identifier (URL)
|
java.lang.String[] |
getRelationshipTypes()
Return the list of relationship types that this connector recognizes.
|
protected SequenceCredentials |
getSequenceCredential(java.lang.String documentIdentifier)
Get the sequence credentials for a given document identifier (URL)
|
protected void |
getSession()
Start a session
|
protected org.apache.manifoldcf.connectorcommon.interfaces.IKeystoreManager |
getTrustStore(java.lang.String documentIdentifier)
Get the trust store for a given document identifier (URL)
|
protected void |
handleHTML(java.lang.String documentURI,
IHTMLHandler handler)
Handle document references from HTML
|
protected static void |
handleIOException(java.io.IOException e,
java.lang.String context) |
protected void |
handleRedirects(java.lang.String documentURI,
IRedirectionHandler handler)
Handle extracting the redirect link from a redirect response.
|
protected void |
handleXML(java.lang.String documentURI,
IXMLHandler handler)
Handle document references from XML.
|
void |
install(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext)
Install the connector.
|
protected boolean |
isContentInteresting(org.apache.manifoldcf.crawler.interfaces.IFingerprintActivity activities,
java.lang.String documentIdentifier,
int response,
java.lang.String contentType)
Code to check if data is interesting, based on response code and content type.
|
protected boolean |
isDocumentText(java.lang.String documentURI)
Is the document text, as far as we can tell?
|
protected static boolean |
isStrange(byte x)
Check if character is not typical ASCII or utf-8.
|
protected static boolean |
isText(byte[] beginChunk,
int chunkLength)
Test to see if a document is text or not.
|
protected static boolean |
isWhiteSpace(byte x)
Check if a byte is a whitespace character.
|
protected void |
loginAndFetch(WebcrawlerConnector.FetchStatus fetchStatus,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
java.lang.String documentIdentifier,
SequenceCredentials sessionCredential,
java.lang.String globalSequenceEvent) |
protected int |
lookupIPAddress(java.lang.String documentIdentifier,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
java.lang.String hostName,
long currentTime,
java.lang.StringBuilder ipAddressBuffer)
Look up an ipaddress given a non-canonical host name.
|
protected java.lang.String |
makeDNSEventName(org.apache.manifoldcf.crawler.interfaces.INamingActivity activities,
java.lang.String hostNameKey)
Calculate the event name for DNS access.
|
protected java.lang.String |
makeDocumentIdentifier(java.lang.String parentIdentifier,
java.lang.String rawURL,
WebcrawlerConnector.DocumentURLFilter filter,
org.apache.manifoldcf.crawler.interfaces.IHistoryActivity activities)
Convert an absolute or relative URL to a document identifier.
|
protected java.lang.String |
makeRobotsEventName(org.apache.manifoldcf.crawler.interfaces.INamingActivity versionActivities,
java.lang.String robotsKey)
Construct a name for the global web-connector robots event.
|
protected static java.lang.String |
makeRobotsKey(java.lang.String protocol,
java.lang.String hostName,
int port)
Construct the robots key for a host.
|
protected java.lang.String |
makeSessionLoginEventName(org.apache.manifoldcf.crawler.interfaces.INamingActivity activities,
java.lang.String sequenceKey)
Calculate the event name for session login.
|
void |
outputConfigurationBody(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.lang.String tabName)
Output the configuration body section.
|
void |
outputConfigurationHeader(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.util.List<java.lang.String> tabsArray)
Output the configuration header section.
|
void |
outputSpecificationBody(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber,
int actualSequenceNumber,
java.lang.String tabName)
Output the specification body section.
|
void |
outputSpecificationHeader(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber,
java.util.List<java.lang.String> tabsArray)
Output the specification header section.
|
void |
poll()
This method is periodically called for all connectors that are connected but not
in active use.
|
java.lang.String |
processConfigurationPost(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
Process a configuration post.
|
protected void |
processDocument(org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
java.lang.String documentIdentifier,
java.lang.String versionString,
boolean indexDocument,
java.util.Map<java.lang.String,java.util.Set<java.lang.String>> metaHash,
java.lang.String[] acls,
WebcrawlerConnector.DocumentURLFilter filter) |
void |
processDocuments(java.lang.String[] documentIdentifiers,
org.apache.manifoldcf.crawler.interfaces.IExistingVersions statuses,
org.apache.manifoldcf.core.interfaces.Specification spec,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
int jobMode,
boolean usesDefaultAuthority)
Process a set of documents.
|
java.lang.String |
processSpecificationPost(org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber)
Process a specification post.
|
protected static java.util.List<java.lang.String> |
stringToArray(java.lang.String input)
Read a string as a sequence of individual expressions, urls, etc.
|
void |
viewConfiguration(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
View configuration.
|
void |
viewSpecification(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber)
View specification.
|
getFormCheckJavascriptMethodName, getFormPresaveCheckJavascriptMethodName, requestInfo
connect, getConfiguration, isConnected, outputConfigurationBody, outputConfigurationHeader, outputConfigurationHeader, pack, packFixedList, packList, packList, processConfigurationPost, setThreadContext, unpack, unpackFixedList, unpackList, viewConfiguration
public static final java.lang.String _rcsid
protected static final int RESULTSTATUS_FALSE
protected static final int RESULTSTATUS_TRUE
protected static final int RESULTSTATUS_NOTYETDETERMINED
protected static final java.lang.String[] interestingMimeTypeArray
protected static final java.util.Set<java.lang.String> interestingMimeTypeMap
protected static final java.util.Set<java.lang.String> understoodProtocols
protected static final int ROBOTS_NONE
protected static final int ROBOTS_DATA
protected static final int ROBOTS_ALL
protected static final int META_ROBOTS_NONE
protected static final int META_ROBOTS_ALL
public static final java.lang.String REL_LINK
public static final java.lang.String REL_REDIRECT
public static final java.lang.String ACTIVITY_FETCH
public static final java.lang.String ACTIVITY_PROCESS
public static final java.lang.String ACTIVITY_ROBOTSPARSE
public static final java.lang.String ACTIVITY_LOGON_START
public static final java.lang.String ACTIVITY_LOGON_END
protected static final java.lang.String FETCH_ROBOTS
protected static final java.lang.String FETCH_STANDARD
protected static final java.lang.String FETCH_LOGIN
protected static final java.util.Set<java.lang.String> reservedHeaders
protected static final java.util.List<java.lang.String> potentiallyExcludedHeaders
protected int robotsUsage
protected int metaRobotsTagsUsage
protected java.lang.String userAgent
protected java.lang.String from
protected int connectionTimeoutMilliseconds
protected int socketTimeoutMilliseconds
protected java.lang.String throttleGroupName
protected ThrottleDescription throttleDescription
protected CredentialsDescription credentialsDescription
protected TrustsDescription trustsDescription
protected RobotsManager robotsManager
protected DNSManager dnsManager
protected CookieManager cookieManager
protected boolean isInitialized
protected static DataCache cache
protected java.lang.String proxyHost
protected int proxyPort
protected java.lang.String proxyAuthDomain
protected java.lang.String proxyAuthUsername
protected java.lang.String proxyAuthPassword
protected static final int SESSIONSTATE_NORMAL
protected static final int SESSIONSTATE_LOGIN
protected static final int RESULT_NO_DOCUMENT
protected static final int RESULT_NO_VERSION
protected static final int RESULT_VERSION_NEEDED
protected static final int RESULT_RETRY_DOCUMENT
public int getConnectorModel()
getConnectorModel in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
getConnectorModel in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
public void install(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
install in interface org.apache.manifoldcf.core.interfaces.IConnector
install in class org.apache.manifoldcf.core.connector.BaseConnector
threadContext - is the current thread context.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public void deinstall(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
deinstall in interface org.apache.manifoldcf.core.interfaces.IConnector
deinstall in class org.apache.manifoldcf.core.connector.BaseConnector
threadContext - is the current thread context.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public java.lang.String[] getActivitiesList()
getActivitiesList in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
getActivitiesList in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
public java.lang.String[] getRelationshipTypes()
getRelationshipTypes in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
getRelationshipTypes in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
public void clearThreadContext()
clearThreadContext in interface org.apache.manifoldcf.core.interfaces.IConnector
clearThreadContext in class org.apache.manifoldcf.core.connector.BaseConnector
protected void getSession()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public void poll()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
poll in interface org.apache.manifoldcf.core.interfaces.IConnector
poll in class org.apache.manifoldcf.core.connector.BaseConnector
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public java.lang.String check()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
check in interface org.apache.manifoldcf.core.interfaces.IConnector
check in class org.apache.manifoldcf.core.connector.BaseConnector
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public void disconnect()
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
disconnect in interface org.apache.manifoldcf.core.interfaces.IConnector
disconnect in class org.apache.manifoldcf.core.connector.BaseConnector
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public java.lang.String[] getBinNames(java.lang.String documentIdentifier)
getBinNames in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
getBinNames in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
documentIdentifier - is the document identifier.
public java.lang.String addSeedDocuments(org.apache.manifoldcf.crawler.interfaces.ISeedingActivity activities,
org.apache.manifoldcf.core.interfaces.Specification spec,
java.lang.String lastSeedVersion,
long seedTime,
int jobMode)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
addSeedDocuments in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
addSeedDocuments in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
activities - is the interface this method should use to perform whatever framework actions are desired.
spec - is a document specification (that comes from the job).
seedTime - is the end of the time range of documents to consider, exclusive.
lastSeedVersion - is the last seeding version string for this job, or null if the job has no previous seeding version string.
jobMode - is an integer describing how the job is being run, whether continuous or once-only.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
public void processDocuments(java.lang.String[] documentIdentifiers,
org.apache.manifoldcf.crawler.interfaces.IExistingVersions statuses,
org.apache.manifoldcf.core.interfaces.Specification spec,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
int jobMode,
boolean usesDefaultAuthority)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
processDocuments in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
processDocuments in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
documentIdentifiers - is the set of document identifiers to process.
statuses - are the currently-stored document versions for each document in the set of document identifiers passed in above.
activities - is the interface this method should use to queue up new document references and ingest documents.
jobMode - is an integer describing how the job is being run, whether continuous or once-only.
usesDefaultAuthority - will be true only if the authority in use for these documents is the default one.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
protected void loginAndFetch(WebcrawlerConnector.FetchStatus fetchStatus, org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities, java.lang.String documentIdentifier, SequenceCredentials sessionCredential, java.lang.String globalSequenceEvent) throws org.apache.manifoldcf.core.interfaces.ManifoldCFException, org.apache.manifoldcf.agents.interfaces.ServiceInterruption
org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
protected void processDocument(org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
java.lang.String documentIdentifier,
java.lang.String versionString,
boolean indexDocument,
java.util.Map<java.lang.String,java.util.Set<java.lang.String>> metaHash,
java.lang.String[] acls,
WebcrawlerConnector.DocumentURLFilter filter)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
protected static java.lang.String extractContentType(java.lang.String contentType)
protected static java.lang.String extractEncoding(java.lang.String contentType)
protected static java.lang.String extractMimeType(java.lang.String contentType)
protected static void handleIOException(java.io.IOException e,
java.lang.String context)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
public int getMaxDocumentRequest()
getMaxDocumentRequest in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
getMaxDocumentRequest in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
public void outputConfigurationHeader(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.util.List<java.lang.String> tabsArray)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputConfigurationHeader in interface org.apache.manifoldcf.core.interfaces.IConnector
outputConfigurationHeader in class org.apache.manifoldcf.core.connector.BaseConnector
threadContext - is the local thread context.
out - is the output to which any HTML should be sent.
parameters - are the configuration parameters, as they currently exist, for this connection being configured.
tabsArray - is an array of tab names. Add to this array any tab names that are specific to the connector.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
java.io.IOException
public void outputConfigurationBody(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters,
java.lang.String tabName)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputConfigurationBody in interface org.apache.manifoldcf.core.interfaces.IConnector
outputConfigurationBody in class org.apache.manifoldcf.core.connector.BaseConnector
threadContext - is the local thread context.
out - is the output to which any HTML should be sent.
parameters - are the configuration parameters, as they currently exist, for this connection being configured.
tabName - is the current tab name.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
java.io.IOException
public java.lang.String processConfigurationPost(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
processConfigurationPost in interface org.apache.manifoldcf.core.interfaces.IConnector
processConfigurationPost in class org.apache.manifoldcf.core.connector.BaseConnector
threadContext - is the local thread context.
variableContext - is the set of variables available from the post, including binary file post information.
parameters - are the configuration parameters, as they currently exist, for this connection being configured.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public void viewConfiguration(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.ConfigParams parameters)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
viewConfiguration in interface org.apache.manifoldcf.core.interfaces.IConnector
viewConfiguration in class org.apache.manifoldcf.core.connector.BaseConnector
threadContext - is the local thread context.
out - is the output to which any HTML should be sent.
parameters - are the configuration parameters, as they currently exist, for this connection being configured.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
java.io.IOException
public void outputSpecificationHeader(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber,
java.util.List<java.lang.String> tabsArray)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputSpecificationHeader in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
outputSpecificationHeader in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
out - is the output to which any HTML should be sent.
locale - is the locale the output is preferred to be in.
ds - is the current document specification for this job.
connectionSequenceNumber - is the unique number of this connection within the job.
tabsArray - is an array of tab names. Add to this array any tab names that are specific to the connector.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
java.io.IOException
public void outputSpecificationBody(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber,
int actualSequenceNumber,
java.lang.String tabName)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
outputSpecificationBody in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
outputSpecificationBody in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
out - is the output to which any HTML should be sent.
locale - is the locale the output is preferred to be in.
ds - is the current document specification for this job.
connectionSequenceNumber - is the unique number of this connection within the job.
actualSequenceNumber - is the connection within the job that has currently been selected.
tabName - is the current tab name. (actualSequenceNumber, tabName) form a unique tuple within the job.
org.apache.manifoldcf.core.interfaces.ManifoldCFException
java.io.IOException
public java.lang.String processSpecificationPost(org.apache.manifoldcf.core.interfaces.IPostParameters variableContext,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Specified by:
processSpecificationPost in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
Overrides:
processSpecificationPost in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
Parameters:
variableContext - contains the post data, including binary file-upload information.
locale - is the locale the output is preferred to be in.
ds - is the current document specification for this job.
connectionSequenceNumber - is the unique number of this connection within the job.
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

public void viewSpecification(org.apache.manifoldcf.core.interfaces.IHTTPOutput out,
java.util.Locale locale,
org.apache.manifoldcf.core.interfaces.Specification ds,
int connectionSequenceNumber)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
java.io.IOException
Specified by:
viewSpecification in interface org.apache.manifoldcf.crawler.interfaces.IRepositoryConnector
Overrides:
viewSpecification in class org.apache.manifoldcf.crawler.connectors.BaseRepositoryConnector
Parameters:
out - is the output to which any HTML should be sent.
locale - is the locale the output is preferred to be in.
ds - is the current document specification for this job.
connectionSequenceNumber - is the unique number of this connection within the job.
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException
java.io.IOException

protected java.lang.String makeSessionLoginEventName(org.apache.manifoldcf.crawler.interfaces.INamingActivity activities,
java.lang.String sequenceKey)
protected java.lang.String makeDNSEventName(org.apache.manifoldcf.crawler.interfaces.INamingActivity activities,
java.lang.String hostNameKey)
protected int lookupIPAddress(java.lang.String documentIdentifier,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
java.lang.String hostName,
long currentTime,
java.lang.StringBuilder ipAddressBuffer)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.agents.interfaces.ServiceInterruption

protected static java.lang.String makeRobotsKey(java.lang.String protocol,
java.lang.String hostName,
int port)
protected java.lang.String makeRobotsEventName(org.apache.manifoldcf.crawler.interfaces.INamingActivity versionActivities,
java.lang.String robotsKey)
protected int checkFetchAllowed(java.lang.String documentIdentifier,
java.lang.String protocol,
java.lang.String hostIPAddress,
int port,
PageCredentials credential,
org.apache.manifoldcf.connectorcommon.interfaces.IKeystoreManager trustStore,
java.lang.String hostName,
java.lang.String[] binNames,
long currentTime,
java.lang.String pathString,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity versionActivities,
int connectionLimit,
java.lang.String proxyHost,
int proxyPort,
java.lang.String proxyAuthDomain,
java.lang.String proxyAuthUsername,
java.lang.String proxyAuthPassword)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.agents.interfaces.ServiceInterruption

protected java.lang.String makeDocumentIdentifier(java.lang.String parentIdentifier,
java.lang.String rawURL,
WebcrawlerConnector.DocumentURLFilter filter,
org.apache.manifoldcf.crawler.interfaces.IHistoryActivity activities)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Parameters:
parentIdentifier - the identifier of the document in which the raw url was found, or null if none.
rawURL - the starting, un-normalized, un-canonicalized URL.
filter - the filter object, used to remove unmatching URLs.
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected java.lang.String doCanonicalization(WebcrawlerConnector.DocumentURLFilter filter, WebURL url) throws org.apache.manifoldcf.core.interfaces.ManifoldCFException, java.net.URISyntaxException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException
java.net.URISyntaxException

protected boolean isContentInteresting(org.apache.manifoldcf.crawler.interfaces.IFingerprintActivity activities,
java.lang.String documentIdentifier,
int response,
java.lang.String contentType)
throws org.apache.manifoldcf.agents.interfaces.ServiceInterruption,
org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected java.lang.String documentIdentifiertoFileName(java.lang.String documentIdentifier)
throws java.net.URISyntaxException
Parameters:
documentIdentifier -
Throws:
java.net.URISyntaxException

protected java.lang.String findRedirectionURI(java.lang.String currentURI)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected FormData findHTMLForm(java.lang.String currentURI, LoginParameters lp) throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected java.lang.String findPreferredRedirectionURI(java.lang.String currentURI,
LoginParameters lp)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected java.lang.String findSpecifiedContent(java.lang.String currentURI,
LoginParameters lp)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected java.lang.String findHTMLLinkURI(java.lang.String currentURI,
LoginParameters lp)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected boolean extractLinks(java.lang.String documentIdentifier,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
WebcrawlerConnector.DocumentURLFilter filter)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.agents.interfaces.ServiceInterruption

protected void handleRedirects(java.lang.String documentURI,
IRedirectionHandler handler)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected void handleXML(java.lang.String documentURI,
IXMLHandler handler)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.agents.interfaces.ServiceInterruption

protected void handleHTML(java.lang.String documentURI,
IHTMLHandler handler)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected boolean isDocumentText(java.lang.String documentURI)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected static boolean isText(byte[] beginChunk,
int chunkLength)
protected static boolean isStrange(byte x)
protected static boolean isWhiteSpace(byte x)
protected static java.util.List<java.lang.String> stringToArray(java.lang.String input)
protected static void compileList(java.util.List<java.util.regex.Pattern> output,
java.util.List<java.lang.String> input)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected PageCredentials getPageCredential(java.lang.String documentIdentifier)
protected SequenceCredentials getSequenceCredential(java.lang.String documentIdentifier)
protected org.apache.manifoldcf.connectorcommon.interfaces.IKeystoreManager getTrustStore(java.lang.String documentIdentifier)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected static java.lang.String[] getAcls(org.apache.manifoldcf.core.interfaces.Specification spec)
Parameters:
spec - is the document specification.

protected static java.util.Set<java.lang.String> findExcludedHeaders(org.apache.manifoldcf.core.interfaces.Specification spec)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException

protected java.lang.String[] calculateDocumentEvents(org.apache.manifoldcf.crawler.interfaces.INamingActivity activities,
java.lang.String documentIdentifier)