/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.protocol.http.api;
import java.lang.invoke.MethodHandles;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.net.Proxy;
import java.net.URI;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ThreadLocalRandom;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.protocols.ProtocolLogUtil;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.util.GZIPUtils;
import org.apache.nutch.util.MimeUtil;
import org.apache.nutch.util.DeflateUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import crawlercommons.robots.BaseRobotRules;
public abstract class HttpBase implements Protocol {
public static final Text RESPONSE_TIME = new Text("_rs_");
public static final Text COOKIE = new Text("Cookie");
public static final int BUFFER_SIZE = 8 * 1024;
private static final byte[] EMPTY_CONTENT = new byte[0];
private HttpRobotRulesParser robots = null;
private ArrayList<String> userAgentNames = null;
/** Mapping hostnames to cookies */
private Map<String, String> hostCookies = null;
/** The proxy hostname. */
protected String proxyHost = null;
/** The proxy port. */
protected int proxyPort = 8080;
/** The proxy type. */
protected Proxy.Type proxyType = Proxy.Type.HTTP;
/** The proxy exception list. */
protected HashMap<String, String> proxyException = new HashMap<>();
/** Indicates if a proxy is used */
protected boolean useProxy = false;
/** The network timeout in milliseconds. */
protected int timeout = 10000;
/** The length limit for downloaded content, in bytes. */
protected int maxContent = 1024 * 1024;
/** The time limit to download the entire content, in seconds. */
protected int maxDuration = 300;
/** Whether to save partial fetches as truncated content. */
protected boolean partialAsTruncated = false;
/** The Nutch 'User-Agent' request header */
protected String userAgent = getAgentString("NutchCVS", null, "Nutch",
"https://nutch.apache.org/bot.html", "agent@nutch.apache.org");
/** The "Accept-Language" request header value. */
protected String acceptLanguage = "en-us,en-gb,en;q=0.7,*;q=0.3";
/** The "Accept-Charset" request header value. */
protected String acceptCharset = "utf-8,iso-8859-1;q=0.7,*;q=0.7";
/** The "Accept" request header value. */
protected String accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
/** The default logger */
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
/** The specified logger */
private Logger logger = LOG;
/** The nutch configuration */
private Configuration conf = null;
/**
* Logging utility, used to suppress stack traces for common exceptions in a
* configurable way.
*/
private ProtocolLogUtil logUtil = new ProtocolLogUtil();
/**
 * MimeUtil for MIME type detection. Note (see NUTCH-2578): the MimeUtil
 * object is used concurrently by parallel fetcher threads, so the methods
 * used to detect the MIME type must be thread-safe.
 */
private MimeUtil mimeTypes = null;
/** Do we use HTTP/1.1? */
protected boolean useHttp11 = false;
/** Whether to use HTTP/2 */
protected boolean useHttp2 = false;
/**
 * Record the response time in the CrawlDatum's metadata, see property
 * <code>http.store.responsetime</code>.
 */
protected boolean responseTime = true;
/**
* Record the IP address of the responding server, see property
* <code>store.ip.address</code>.
*/
protected boolean storeIPAddress = false;
/**
* Record the HTTP request in the metadata, see property
* <code>store.http.request</code>.
*/
protected boolean storeHttpRequest = false;
/**
* Record the HTTP response header in the metadata, see property
* <code>store.http.headers</code>.
*/
protected boolean storeHttpHeaders = false;
/** Skip page if Crawl-Delay longer than this value. */
protected long maxCrawlDelay = -1L;
/** Whether to check TLS/SSL certificates */
protected boolean tlsCheckCertificate = false;
/** Which TLS/SSL protocols to support */
protected Set<String> tlsPreferredProtocols;
/** Which TLS/SSL cipher suites to support */
protected Set<String> tlsPreferredCipherSuites;
/** Configuration directive for If-Modified-Since HTTP header */
protected boolean enableIfModifiedsinceHeader = true;
/**
 * Controls whether to set the Cookie HTTP header based on the CrawlDatum's
 * metadata.
 */
protected boolean enableCookieHeader = true;
/** Creates a new instance of HttpBase */
public HttpBase() {
this(null);
}
/**
 * Creates a new instance of HttpBase.
 *
 * @param logger
 *          the {@link org.slf4j.Logger} to use in this HttpBase
 */
public HttpBase(Logger logger) {
if (logger != null) {
this.logger = logger;
}
robots = new HttpRobotRulesParser();
}
@Override
public void setConf(Configuration conf) {
this.conf = conf;
this.proxyHost = conf.get("http.proxy.host");
this.proxyPort = conf.getInt("http.proxy.port", 8080);
this.proxyType = Proxy.Type.valueOf(conf.get("http.proxy.type", "HTTP"));
this.proxyException = arrayToMap(
conf.getStrings("http.proxy.exception.list"));
this.useProxy = (proxyHost != null && proxyHost.length() > 0);
this.timeout = conf.getInt("http.timeout", 10000);
this.maxContent = conf.getInt("http.content.limit", 1024 * 1024);
this.maxDuration = conf.getInt("http.time.limit", -1);
this.partialAsTruncated = conf.getBoolean("http.partial.truncated", false);
this.userAgent = getAgentString(conf.get("http.agent.name"),
conf.get("http.agent.version"), conf.get("http.agent.description"),
conf.get("http.agent.url"), conf.get("http.agent.email"));
this.acceptLanguage = conf.get("http.accept.language", acceptLanguage)
.trim();
this.acceptCharset = conf.get("http.accept.charset", acceptCharset).trim();
this.accept = conf.get("http.accept", accept).trim();
this.mimeTypes = new MimeUtil(conf);
// backward-compatible default setting
this.useHttp11 = conf.getBoolean("http.useHttp11", true);
this.useHttp2 = conf.getBoolean("http.useHttp2", false);
this.tlsCheckCertificate = conf.getBoolean("http.tls.certificates.check",
false);
this.responseTime = conf.getBoolean("http.store.responsetime", true);
this.storeIPAddress = conf.getBoolean("store.ip.address", false);
this.storeHttpRequest = conf.getBoolean("store.http.request", false);
this.storeHttpHeaders = conf.getBoolean("store.http.headers", false);
this.enableIfModifiedsinceHeader = conf
.getBoolean("http.enable.if.modified.since.header", true);
this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header",
true);
this.robots.setConf(conf);
this.logUtil.setConf(conf);
// NUTCH-1941: read list of alternating agent names
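// The agents file is expected to hold one user agent name per line; blank
// lines are ignored (cf. the parsing loop below).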
if (conf.getBoolean("http.agent.rotate", false)) {
String agentsFile = conf.get("http.agent.rotate.file", "agents.txt");
BufferedReader br = null;
try {
Reader reader = conf.getConfResourceAsReader(agentsFile);
br = new BufferedReader(reader);
userAgentNames = new ArrayList<String>();
String word = "";
while ((word = br.readLine()) != null) {
if (!word.trim().isEmpty())
userAgentNames.add(word.trim());
}
if (userAgentNames.size() == 0) {
logger.warn("Empty list of user agents in http.agent.rotate.file {}",
agentsFile);
userAgentNames = null;
}
} catch (Exception e) {
logger.warn("Failed to read http.agent.rotate.file {}: {}", agentsFile,
StringUtils.stringifyException(e));
userAgentNames = null;
} finally {
if (br != null) {
try {
br.close();
} catch (IOException e) {
// ignore
}
}
}
if (userAgentNames == null) {
logger.warn(
"Falling back to fixed user agent set via property http.agent.name");
}
}
// If cookies are enabled, try to load a per-host cookie file
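// The cookie file is expected to hold one tab-separated pair per line, i.e.
// <hostname>\t<cookie>; lines containing '#' are skipped as comments (cf.
// the parsing loop below).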
if (enableCookieHeader) {
String cookieFile = conf.get("http.agent.host.cookie.file",
"cookies.txt");
BufferedReader br = null;
try {
Reader reader = conf.getConfResourceAsReader(cookieFile);
br = new BufferedReader(reader);
hostCookies = new HashMap<String, String>();
String word = "";
while ((word = br.readLine()) != null) {
if (!word.trim().isEmpty()) {
if (word.indexOf("#") == -1) { // skip lines containing a comment marker
String[] parts = word.split("\t");
if (parts.length == 2) {
hostCookies.put(parts[0], parts[1]);
} else {
LOG.warn("Unable to parse cookie file correctly at: " + word);
}
}
}
}
} catch (Exception e) {
logger.warn("Failed to read http.agent.host.cookie.file {}: {}",
cookieFile, StringUtils.stringifyException(e));
hostCookies = null;
} finally {
if (br != null) {
try {
br.close();
} catch (IOException e) {
// ignore
}
}
}
}
String[] protocols = conf.getStrings("http.tls.supported.protocols",
"TLSv1.3", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
"ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-GCM-SHA256",
"ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384",
"ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-CHACHA20-POLY1305",
"DHE-RSA-AES128-GCM-SHA256", "DHE-RSA-AES256-GCM-SHA384",
"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
"TLS_RSA_WITH_AES_256_CBC_SHA256",
"TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA384",
"TLS_ECDH_RSA_WITH_AES_256_CBC_SHA384",
"TLS_DHE_RSA_WITH_AES_256_CBC_SHA256",
"TLS_DHE_DSS_WITH_AES_256_CBC_SHA256",
"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
"TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
"TLS_ECDH_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_RSA_WITH_AES_256_CBC_SHA",
"TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
"TLS_RSA_WITH_AES_128_CBC_SHA256",
"TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA256",
"TLS_ECDH_RSA_WITH_AES_128_CBC_SHA256",
"TLS_DHE_RSA_WITH_AES_128_CBC_SHA256",
"TLS_DHE_DSS_WITH_AES_128_CBC_SHA256",
"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
"TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
"TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_RSA_WITH_AES_128_CBC_SHA",
"TLS_DHE_DSS_WITH_AES_128_CBC_SHA", "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA",
"TLS_ECDHE_RSA_WITH_RC4_128_SHA", "SSL_RSA_WITH_RC4_128_SHA",
"TLS_ECDH_ECDSA_WITH_RC4_128_SHA", "TLS_ECDH_RSA_WITH_RC4_128_SHA",
"TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
"TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
"TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
"TLS_ECDH_RSA_WITH_3DES_EDE_CBC_SHA",
"SSL_DHE_RSA_WITH_3DES_EDE_CBC_SHA",
"SSL_DHE_DSS_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_RC4_128_MD5",
"TLS_EMPTY_RENEGOTIATION_INFO_SCSV", "TLS_RSA_WITH_NULL_SHA256",
"TLS_ECDHE_ECDSA_WITH_NULL_SHA", "TLS_ECDHE_RSA_WITH_NULL_SHA",
"SSL_RSA_WITH_NULL_SHA", "TLS_ECDH_ECDSA_WITH_NULL_SHA",
"TLS_ECDH_RSA_WITH_NULL_SHA", "SSL_RSA_WITH_NULL_MD5",
"SSL_RSA_WITH_DES_CBC_SHA", "SSL_DHE_RSA_WITH_DES_CBC_SHA",
"SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
"TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
"TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
"TLS_KRB5_WITH_DES_CBC_MD5", "TLS_AES_256_GCM_SHA384",
"TLS_CHACHA20_POLY1305_SHA256", "TLS_AES_128_GCM_SHA256",
"TLS_AES_128_CCM_8_SHA256", "TLS_AES_128_CCM_SHA256");
tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
logConf();
}
@Override
public Configuration getConf() {
return this.conf;
}
@Override
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
String urlString = url.toString();
try {
URL u = new URL(urlString);
long startTime = System.currentTimeMillis();
Response response = getResponse(u, datum, false); // make a request
if (this.responseTime) {
int elapsedTime = (int) (System.currentTimeMillis() - startTime);
datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
}
int code = response.getCode();
datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
new Text(Integer.toString(code)));
byte[] content = response.getContent();
Content c = new Content(u.toString(), u.toString(),
(content == null ? EMPTY_CONTENT : content),
response.getHeader("Content-Type"), response.getHeaders(), mimeTypes);
if (code == 200) { // got a good response
return new ProtocolOutput(c); // return it
} else if (code >= 300 && code < 400) { // handle redirect
String location = response.getHeader("Location");
// some broken servers, such as MS IIS, use lowercase header name...
if (location == null)
location = response.getHeader("location");
if (location == null)
location = "";
u = new URL(u, location);
int protocolStatusCode;
switch (code) {
case 300: // multiple choices, preferred value in Location
protocolStatusCode = ProtocolStatus.MOVED;
break;
case 301: // moved permanently
case 305: // use proxy (Location is URL of proxy)
protocolStatusCode = ProtocolStatus.MOVED;
break;
case 302: // found (temporarily moved)
case 303: // see other (redirect after POST)
case 307: // temporary redirect
protocolStatusCode = ProtocolStatus.TEMP_MOVED;
break;
case 304: // not modified
protocolStatusCode = ProtocolStatus.NOTMODIFIED;
break;
default:
protocolStatusCode = ProtocolStatus.MOVED;
}
// handle this in the higher layer.
return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
} else if (code == 400) { // bad request, mark as GONE
if (logger.isTraceEnabled()) {
logger.trace("400 Bad request: " + u);
}
return new ProtocolOutput(c,
new ProtocolStatus(ProtocolStatus.GONE, u));
} else if (code == 401) { // requires authorization, but no valid auth
// provided.
if (logger.isTraceEnabled()) {
logger.trace("401 Authentication Required");
}
return new ProtocolOutput(c,
new ProtocolStatus(ProtocolStatus.ACCESS_DENIED,
"Authentication required: " + urlString));
} else if (code == 404) {
return new ProtocolOutput(c,
new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
} else if (code == 410) { // permanently GONE
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
"Http: " + code + " url=" + u));
} else {
return new ProtocolOutput(c, new ProtocolStatus(
ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
}
} catch (Throwable e) {
if (logger.isDebugEnabled() || !logUtil.logShort(e)) {
logger.error("Failed to get protocol output", e);
} else {
logger.error("Failed to get protocol output: {}",
e.getClass().getName());
}
return new ProtocolOutput(null, new ProtocolStatus(e));
}
}
/* ---------------------------- *
 * </implementation:Protocol>   *
 * ---------------------------- */
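/**
 * @return the proxy host name (property <code>http.proxy.host</code>), or
 *         null if no proxy is configured
 */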
public String getProxyHost() {
return proxyHost;
}
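/** @return the proxy port (property <code>http.proxy.port</code>) */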
public int getProxyPort() {
return proxyPort;
}
public boolean useProxy(URL url) {
return useProxy(url.getHost());
}
public boolean useProxy(URI uri) {
return useProxy(uri.getHost());
}
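/**
 * Decide whether the proxy shall be used for the given host: a proxy is
 * used only if a proxy host is configured and the host is not contained in
 * the proxy exception list (<code>http.proxy.exception.list</code>).
 *
 * @param host
 *          the host name to check
 * @return true if requests to this host should go through the proxy
 */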
public boolean useProxy(String host) {
if (useProxy && proxyException.containsKey(host)) {
return false;
}
return useProxy;
}
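/** @return the network timeout in milliseconds (property <code>http.timeout</code>) */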
public int getTimeout() {
return timeout;
}
public boolean isIfModifiedSinceEnabled() {
return enableIfModifiedsinceHeader;
}
public boolean isCookieEnabled() {
return enableCookieHeader;
}
public boolean isStoreIPAddress() {
return storeIPAddress;
}
public boolean isStoreHttpRequest() {
return storeHttpRequest;
}
public boolean isStoreHttpHeaders() {
return storeHttpHeaders;
}
public int getMaxContent() {
return maxContent;
}
/**
* The time limit to download the entire content, in seconds. See the property
* <code>http.time.limit</code>.
* @return the maximum duration
*/
public int getMaxDuration() {
return maxDuration;
}
/**
* Whether to save partial fetches as truncated content, cf. the property
* <code>http.partial.truncated</code>.
* @return true if partially fetched truncated content is stored
*/
public boolean isStorePartialAsTruncated() {
return partialAsTruncated;
}
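/**
 * Get the User-Agent string to send. If agent rotation is enabled
 * (<code>http.agent.rotate</code>) and a list of agent names was read
 * successfully, a name is picked at random per request, otherwise the fixed
 * agent string is returned.
 *
 * @return the value of the "User-Agent" request header
 */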
public String getUserAgent() {
if (userAgentNames != null) {
return userAgentNames
.get(ThreadLocalRandom.current().nextInt(userAgentNames.size()));
}
return userAgent;
}
/**
 * If per-host cookies are configured, look up the cookie for the host of
 * the given URL.
 *
 * @param url
 *          the URL to look up a cookie for
 * @return the cookie, or null if no cookie is configured for the host
 */
public String getCookie(URL url) {
if (hostCookies != null) {
return hostCookies.get(url.getHost());
}
return null;
}
/**
 * Value of the "Accept-Language" request header sent by Nutch.
 *
 * @return the value of the "Accept-Language" request header
 */
public String getAcceptLanguage() {
return acceptLanguage;
}
public String getAcceptCharset() {
return acceptCharset;
}
public String getAccept() {
return accept;
}
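/** @return true if HTTP/1.1 shall be used (property <code>http.useHttp11</code>) */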
public boolean getUseHttp11() {
return useHttp11;
}
public boolean isTlsCheckCertificates() {
return tlsCheckCertificate;
}
public Set<String> getTlsPreferredCipherSuites() {
return tlsPreferredCipherSuites;
}
public Set<String> getTlsPreferredProtocols() {
return tlsPreferredProtocols;
}
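/**
 * Compose the User-Agent string from the configured agent name, version,
 * description, URL and email, in the form
 * <code>name/version (description; URL; email)</code>. With the default
 * values defined above this yields, for example,
 * <code>NutchCVS (Nutch; https://nutch.apache.org/bot.html; agent@nutch.apache.org)</code>.
 */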
private static String getAgentString(String agentName, String agentVersion,
String agentDesc, String agentURL, String agentEmail) {
if ((agentName == null) || (agentName.trim().length() == 0)) {
// TODO : NUTCH-258
if (LOG.isErrorEnabled()) {
LOG.error("No User-Agent string set (http.agent.name)!");
}
}
StringBuilder buf = new StringBuilder();
buf.append(agentName);
if (agentVersion != null && !agentVersion.trim().isEmpty()) {
buf.append("/");
buf.append(agentVersion);
}
if (((agentDesc != null) && (agentDesc.length() != 0))
|| ((agentEmail != null) && (agentEmail.length() != 0))
|| ((agentURL != null) && (agentURL.length() != 0))) {
buf.append(" (");
if ((agentDesc != null) && (agentDesc.length() != 0)) {
buf.append(agentDesc);
if ((agentURL != null) || (agentEmail != null))
buf.append("; ");
}
if ((agentURL != null) && (agentURL.length() != 0)) {
buf.append(agentURL);
if (agentEmail != null)
buf.append("; ");
}
if ((agentEmail != null) && (agentEmail.length() != 0))
buf.append(agentEmail);
buf.append(")");
}
return buf.toString();
}
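/** Log the HTTP-related configuration at INFO level. */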
protected void logConf() {
if (logger.isInfoEnabled()) {
logger.info("http.proxy.host = " + proxyHost);
logger.info("http.proxy.port = " + proxyPort);
logger.info("http.proxy.exception.list = " + useProxy);
logger.info("http.timeout = " + timeout);
logger.info("http.content.limit = " + maxContent);
logger.info("http.agent = " + userAgent);
logger.info("http.accept.language = " + acceptLanguage);
logger.info("http.accept = " + accept);
logger.info("http.enable.cookie.header = " + isCookieEnabled());
}
}
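/**
 * Decompress gzip-encoded content, best-effort and limited to
 * <code>http.content.limit</code> bytes if that limit is not negative.
 *
 * @param compressed
 *          the gzip-compressed content
 * @param url
 *          the URL the content was fetched from (used for logging)
 * @return the decompressed content
 * @throws IOException
 *           if the content could not be decompressed
 */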
public byte[] processGzipEncoded(byte[] compressed, URL url)
throws IOException {
if (LOG.isTraceEnabled()) {
LOG.trace("uncompressing....");
}
// content can be empty (i.e. redirection) in which case
// there is nothing to unzip
if (compressed.length == 0)
return compressed;
byte[] content;
if (getMaxContent() >= 0) {
content = GZIPUtils.unzipBestEffort(compressed, getMaxContent());
} else {
content = GZIPUtils.unzipBestEffort(compressed);
}
if (content == null)
throw new IOException("unzipBestEffort returned null");
if (LOG.isTraceEnabled()) {
LOG.trace("fetched " + compressed.length
+ " bytes of compressed content (expanded to " + content.length
+ " bytes) from " + url);
}
return content;
}
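/**
 * Decompress deflate-encoded content, best-effort and limited to
 * <code>http.content.limit</code> bytes if that limit is not negative.
 *
 * @param compressed
 *          the deflate-compressed content
 * @param url
 *          the URL the content was fetched from (used for logging)
 * @return the decompressed content
 * @throws IOException
 *           if the content could not be decompressed
 */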
public byte[] processDeflateEncoded(byte[] compressed, URL url)
throws IOException {
// content can be empty (i.e. redirection) in which case
// there is nothing to deflate
if (compressed.length == 0)
return compressed;
if (LOG.isTraceEnabled()) {
LOG.trace("inflating....");
}
byte[] content;
if (getMaxContent() >= 0) {
content = DeflateUtils.inflateBestEffort(compressed, getMaxContent());
} else {
content = DeflateUtils.inflateBestEffort(compressed);
}
if (content == null)
throw new IOException("inflateBestEffort returned null");
if (LOG.isTraceEnabled()) {
LOG.trace("fetched " + compressed.length
+ " bytes of compressed content (expanded to " + content.length
+ " bytes) from " + url);
}
return content;
}
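/**
 * Simple command-line test helper, intended to be called from the main()
 * of a concrete protocol implementation: fetches a single URL and prints
 * status, content type, length and content to stdout.
 *
 * @param http
 *          the protocol implementation to test
 * @param args
 *          command-line arguments, see the usage string
 * @throws Exception
 *           if parsing the arguments or fetching the URL fails
 */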
protected static void main(HttpBase http, String[] args) throws Exception {
String url = null;
String usage = "Usage: Http [-verbose] [-timeout N] url";
if (args.length == 0) {
System.err.println(usage);
System.exit(-1);
}
for (int i = 0; i < args.length; i++) { // parse command line
if (args[i].equals("-timeout")) { // found -timeout option
http.timeout = Integer.parseInt(args[++i]) * 1000;
} else if (args[i].equals("-verbose")) { // found -verbose option
} else if (i != args.length - 1) {
System.err.println(usage);
System.exit(-1);
} else
// the url is a required parameter
url = args[i];
}
ProtocolOutput out = http.getProtocolOutput(new Text(url),
new CrawlDatum());
Content content = out.getContent();
System.out.println("Status: " + out.getStatus());
if (content != null) {
System.out.println("Content Type: " + content.getContentType());
System.out.println("Content Length: "
+ content.getMetadata().get(Response.CONTENT_LENGTH));
System.out.println("Content:");
String text = new String(content.getContent());
System.out.println(text);
}
}
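/**
 * Fetch the given URL and return the HTTP response. Implemented by the
 * concrete protocol plugins.
 *
 * @param url
 *          the URL to fetch
 * @param datum
 *          the CrawlDatum of the URL, which may carry metadata such as the
 *          modified time or a cookie
 * @param followRedirects
 *          whether the HTTP client itself should follow redirects
 * @return the HTTP {@link Response}
 * @throws ProtocolException
 *           if a protocol error occurs
 * @throws IOException
 *           if an I/O error occurs while fetching
 */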
protected abstract Response getResponse(URL url, CrawlDatum datum,
boolean followRedirects) throws ProtocolException, IOException;
@Override
public BaseRobotRules getRobotRules(Text url, CrawlDatum datum,
List<Content> robotsTxtContent) {
return robots.getRobotRulesSet(this, url, robotsTxtContent);
}
/**
 * Transform a String[] into a HashMap for faster lookups.
 *
 * @param input
 *          the String[] to transform
 * @return a new HashMap with every non-empty input string mapped to itself
 */
private HashMap<String, String> arrayToMap(String[] input) {
if (input == null || input.length == 0) {
return new HashMap<String, String>();
}
HashMap<String, String> hm = new HashMap<>();
for (int i = 0; i < input.length; i++) {
if (!"".equals(input[i].trim())) {
hm.put(input[i], input[i]);
}
}
return hm;
}
}