public class Robots
extends java.lang.Object
| Modifier and Type | Class and Description |
|---|---|
protected class |
Robots.Host
This class maintains status for a given host.
|
protected static class |
Robots.Record
This class represents a record in a robots.txt file.
|
| Modifier and Type | Field and Description |
|---|---|
static java.lang.String |
_rcsid |
protected java.util.Map |
cache
This is the cache hash - which is keyed by the protocol/host/port, and has a Host object as the
value.
|
protected ThrottledFetcher |
fetcher
Fetcher to use to get the data from wherever
|
protected int |
refCount
Reference count
|
protected static java.lang.String |
ROBOT_CONNECTION_TYPE
Robots connection type value
|
protected static java.lang.String |
ROBOT_FILE_NAME
Robot file name value
|
protected static int |
ROBOT_TIMEOUT_MILLISECONDS
Robots fetch timeout value
|
| Constructor and Description |
|---|
Robots(ThrottledFetcher fetcher)
Constructor.
|
| Modifier and Type | Method and Description |
|---|---|
protected static boolean |
doesPathMatch(java.lang.String path,
int pathIndex,
java.lang.String spec,
int specIndex)
Recursive method for matching specification to path.
|
protected static boolean |
doesPathMatch(java.lang.String path,
java.lang.String spec)
Check if path matches specification
|
boolean |
isFetchAllowed(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
java.lang.String throttleGroupName,
java.lang.String protocol,
int port,
java.lang.String hostName,
java.lang.String pathString,
java.lang.String userAgent,
java.lang.String from,
java.lang.String proxyHost,
int proxyPort,
java.lang.String proxyAuthDomain,
java.lang.String proxyAuthUsername,
java.lang.String proxyAuthPassword,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
int connectionLimit)
Decide whether a specific robot can crawl a specific URL.
|
protected static java.lang.String |
makeReadable(java.lang.String inputString)
Convert a string from the robots file into a readable form that does NOT contain NUL characters (since postgresql does not accept those).
|
void |
noteConnectionEstablished()
Note that a connection has been established.
|
void |
noteConnectionReleased()
Note that a connection has been released, and free resources if there is no reason
to retain them.
|
void |
poll()
Clean idle stuff out of cache
|
public static final java.lang.String _rcsid
protected static final int ROBOT_TIMEOUT_MILLISECONDS
protected static final java.lang.String ROBOT_CONNECTION_TYPE
protected static final java.lang.String ROBOT_FILE_NAME
protected ThrottledFetcher fetcher
protected int refCount
protected java.util.Map cache
public Robots(ThrottledFetcher fetcher)
public void noteConnectionEstablished()
public void noteConnectionReleased()
public void poll()
public boolean isFetchAllowed(org.apache.manifoldcf.core.interfaces.IThreadContext threadContext,
java.lang.String throttleGroupName,
java.lang.String protocol,
int port,
java.lang.String hostName,
java.lang.String pathString,
java.lang.String userAgent,
java.lang.String from,
java.lang.String proxyHost,
int proxyPort,
java.lang.String proxyAuthDomain,
java.lang.String proxyAuthUsername,
java.lang.String proxyAuthPassword,
org.apache.manifoldcf.crawler.interfaces.IProcessActivity activities,
int connectionLimit)
throws org.apache.manifoldcf.core.interfaces.ManifoldCFException,
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
Parameters:
userAgent - is the user-agent string used by the robot.
from - is the email address.
protocol - is the name of the protocol (e.g. "http")
port - is the port number (-1 being the default for the protocol)
hostName - is the fqdn of the host
pathString - is the path (non-query) part of the URL
Throws:
org.apache.manifoldcf.core.interfaces.ManifoldCFException
org.apache.manifoldcf.agents.interfaces.ServiceInterruption
protected static java.lang.String makeReadable(java.lang.String inputString)
protected static boolean doesPathMatch(java.lang.String path,
java.lang.String spec)
protected static boolean doesPathMatch(java.lang.String path,
int pathIndex,
java.lang.String spec,
int specIndex)