protected class WebcrawlerConnector.DocumentURLFilter
extends java.lang.Object
| Modifier and Type | Field and Description |
|---|---|
protected WebcrawlerConnector.CanonicalizationPolicies |
canonicalizationPolicies
Canonicalization policies
|
protected java.util.List<java.util.regex.Pattern> |
excludeContentIndexPatterns
List of content exclusion patterns
|
protected java.util.List<java.util.regex.Pattern> |
excludeIndexPatterns
The arraylist of index exclude patterns
|
protected java.util.List<java.util.regex.Pattern> |
excludePatterns
The arraylist of exclude patterns
|
protected java.util.List<java.util.regex.Pattern> |
includeIndexPatterns
The arraylist of index include patterns
|
protected java.util.List<java.util.regex.Pattern> |
includePatterns
The arraylist of include patterns
|
protected WebcrawlerConnector.MappingRules |
mappings
Mapping rules
|
protected java.util.Set<java.lang.String> |
seedHosts
The set of seed hosts, to limit URLs by, if non-null
|
protected java.lang.String |
versionString
The version string
|
| Constructor and Description |
|---|
DocumentURLFilter(org.apache.manifoldcf.core.interfaces.Specification spec)
Process a document specification to produce a filter.
|
| Modifier and Type | Method and Description |
|---|---|
protected java.lang.String |
findSpecifiedContent(java.lang.String currentURI,
java.util.List<java.util.regex.Pattern> patterns) |
WebcrawlerConnector.CanonicalizationPolicies |
getCanonicalizationPolicies()
Get canonicalization policies
|
java.lang.String |
getVersionString()
Get whatever contribution to the version string should come from this data.
|
boolean |
isDocumentAndHostLegal(java.lang.String url,
org.apache.manifoldcf.crawler.interfaces.IHistoryActivity activities)
Check if both a document and host are legal.
|
boolean |
isDocumentContentIndexable(java.lang.String documentIdentifier) |
java.lang.String |
isDocumentIndexable(java.lang.String url,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities)
Check if the document identifier is indexable, and return the indexing URL if found.
|
boolean |
isDocumentLegal(java.lang.String url,
org.apache.manifoldcf.crawler.interfaces.IHistoryActivity activities)
Check if the document identifier is legal.
|
boolean |
isHostLegal(java.lang.String host)
Check if a host is legal.
|
protected java.lang.String versionString
protected final WebcrawlerConnector.MappingRules mappings
protected final java.util.List<java.util.regex.Pattern> includePatterns
protected final java.util.List<java.util.regex.Pattern> excludePatterns
protected final java.util.List<java.util.regex.Pattern> includeIndexPatterns
protected final java.util.List<java.util.regex.Pattern> excludeIndexPatterns
protected java.util.Set<java.lang.String> seedHosts
protected final java.util.List<java.util.regex.Pattern> excludeContentIndexPatterns
protected final WebcrawlerConnector.CanonicalizationPolicies canonicalizationPolicies
public DocumentURLFilter(org.apache.manifoldcf.core.interfaces.Specification spec)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public java.lang.String getVersionString()
public boolean isDocumentAndHostLegal(java.lang.String url,
org.apache.manifoldcf.crawler.interfaces.IHistoryActivity activities)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public boolean isHostLegal(java.lang.String host)
public boolean isDocumentLegal(java.lang.String url,
org.apache.manifoldcf.crawler.interfaces.IHistoryActivity activities)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public java.lang.String isDocumentIndexable(java.lang.String url,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFException
public WebcrawlerConnector.CanonicalizationPolicies getCanonicalizationPolicies()
public boolean isDocumentContentIndexable(java.lang.String documentIdentifier)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFException
protected java.lang.String findSpecifiedContent(java.lang.String currentURI,
java.util.List<java.util.regex.Pattern> patterns)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.core.interfaces.ManifoldCFException