Merge pull request #538 from balashashanka/NUTCH-2782
NUTCH-2782: protocol-http / lib-http: support TLSv1.3
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 30e2432..8e96a26 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -61,7 +61,7 @@
public static final Text RESPONSE_TIME = new Text("_rs_");
public static final Text COOKIE = new Text("Cookie");
-
+
public static final int BUFFER_SIZE = 8 * 1024;
private static final byte[] EMPTY_CONTENT = new byte[0];
@@ -69,7 +69,7 @@
private HttpRobotRulesParser robots = null;
private ArrayList<String> userAgentNames = null;
-
+
/** Mapping hostnames to cookies */
private Map<String, String> hostCookies = null;
@@ -78,12 +78,12 @@
/** The proxy port. */
protected int proxyPort = 8080;
-
+
/** The proxy port. */
protected Proxy.Type proxyType = Proxy.Type.HTTP;
/** The proxy exception list. */
- protected HashMap<String,String> proxyException = new HashMap<>();
+ protected HashMap<String, String> proxyException = new HashMap<>();
/** Indicates if a proxy is used */
protected boolean useProxy = false;
@@ -177,11 +177,14 @@
/** Which TLS/SSL cipher suites to support */
protected Set<String> tlsPreferredCipherSuites;
-
+
/** Configuration directive for If-Modified-Since HTTP header */
protected boolean enableIfModifiedsinceHeader = true;
-
- /** Controls whether or not to set Cookie HTTP header based on CrawlDatum metadata */
+
+ /**
+ * Controls whether or not to set Cookie HTTP header based on CrawlDatum
+ * metadata
+ */
protected boolean enableCookieHeader = true;
/** Creates a new instance of HttpBase */
@@ -203,13 +206,13 @@
this.proxyHost = conf.get("http.proxy.host");
this.proxyPort = conf.getInt("http.proxy.port", 8080);
this.proxyType = Proxy.Type.valueOf(conf.get("http.proxy.type", "HTTP"));
- this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
+ this.proxyException = arrayToMap(
+ conf.getStrings("http.proxy.exception.list"));
this.useProxy = (proxyHost != null && proxyHost.length() > 0);
this.timeout = conf.getInt("http.timeout", 10000);
this.maxContent = conf.getInt("http.content.limit", 1024 * 1024);
this.maxDuration = conf.getInt("http.time.limit", -1);
- this.partialAsTruncated = conf
- .getBoolean("http.partial.truncated", false);
+ this.partialAsTruncated = conf.getBoolean("http.partial.truncated", false);
this.userAgent = getAgentString(conf.get("http.agent.name"),
conf.get("http.agent.version"), conf.get("http.agent.description"),
conf.get("http.agent.url"), conf.get("http.agent.email"));
@@ -227,8 +230,10 @@
this.storeIPAddress = conf.getBoolean("store.ip.address", false);
this.storeHttpRequest = conf.getBoolean("store.http.request", false);
this.storeHttpHeaders = conf.getBoolean("store.http.headers", false);
- this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
- this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true);
+ this.enableIfModifiedsinceHeader = conf
+ .getBoolean("http.enable.if.modified.since.header", true);
+ this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header",
+ true);
this.robots.setConf(conf);
this.logUtil.setConf(conf);
@@ -267,19 +272,20 @@
}
}
if (userAgentNames == null) {
- logger
- .warn("Falling back to fixed user agent set via property http.agent.name");
+ logger.warn(
+ "Falling back to fixed user agent set via property http.agent.name");
}
}
-
+
// If cookies are enabled, try to load a per-host cookie file
if (enableCookieHeader) {
- String cookieFile = conf.get("http.agent.host.cookie.file", "cookies.txt");
+ String cookieFile = conf.get("http.agent.host.cookie.file",
+ "cookies.txt");
BufferedReader br = null;
try {
Reader reader = conf.getConfResourceAsReader(cookieFile);
br = new BufferedReader(reader);
- hostCookies = new HashMap<String,String>();
+ hostCookies = new HashMap<String, String>();
String word = "";
while ((word = br.readLine()) != null) {
if (!word.trim().isEmpty()) {
@@ -294,8 +300,8 @@
}
}
} catch (Exception e) {
- logger.warn("Failed to read http.agent.host.cookie.file {}: {}", cookieFile,
- StringUtils.stringifyException(e));
+ logger.warn("Failed to read http.agent.host.cookie.file {}: {}",
+ cookieFile, StringUtils.stringifyException(e));
hostCookies = null;
} finally {
if (br != null) {
@@ -309,16 +315,12 @@
}
String[] protocols = conf.getStrings("http.tls.supported.protocols",
- "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+ "TLSv1.3", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
- "ECDHE-ECDSA-AES128-GCM-SHA256",
- "ECDHE-RSA-AES128-GCM-SHA256",
- "ECDHE-ECDSA-AES256-GCM-SHA384",
- "ECDHE-RSA-AES256-GCM-SHA384",
- "ECDHE-ECDSA-CHACHA20-POLY1305",
- "ECDHE-RSA-CHACHA20-POLY1305",
- "DHE-RSA-AES128-GCM-SHA256",
- "DHE-RSA-AES256-GCM-SHA384",
+ "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-GCM-SHA256",
+ "ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384",
+ "ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-CHACHA20-POLY1305",
+ "DHE-RSA-AES128-GCM-SHA256", "DHE-RSA-AES256-GCM-SHA384",
"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
"TLS_RSA_WITH_AES_256_CBC_SHA256",
@@ -329,8 +331,8 @@
"TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
"TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
"TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
- "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
- "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+ "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_RSA_WITH_AES_256_CBC_SHA",
+ "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
"TLS_RSA_WITH_AES_128_CBC_SHA256",
@@ -341,11 +343,10 @@
"TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
"TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
"TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
- "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
- "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
- "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
- "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
- "TLS_ECDH_RSA_WITH_RC4_128_SHA",
+ "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_RSA_WITH_AES_128_CBC_SHA",
+ "TLS_DHE_DSS_WITH_AES_128_CBC_SHA", "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA",
+ "TLS_ECDHE_RSA_WITH_RC4_128_SHA", "SSL_RSA_WITH_RC4_128_SHA",
+ "TLS_ECDH_ECDSA_WITH_RC4_128_SHA", "TLS_ECDH_RSA_WITH_RC4_128_SHA",
"TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
"TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
"TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
@@ -360,7 +361,9 @@
"SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
"TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
"TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
- "TLS_KRB5_WITH_DES_CBC_MD5");
+ "TLS_KRB5_WITH_DES_CBC_MD5", "TLS_AES_256_GCM_SHA384",
+ "TLS_CHACHA20_POLY1305_SHA256", "TLS_AES_128_GCM_SHA256",
+ "TLS_AES_128_CCM_8_SHA256", "TLS_AES_128_CCM_SHA256");
tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
@@ -389,7 +392,7 @@
int code = response.getCode();
datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
- new Text(Integer.toString(code)));
+ new Text(Integer.toString(code)));
byte[] content = response.getContent();
Content c = new Content(u.toString(), u.toString(),
@@ -433,18 +436,19 @@
if (logger.isTraceEnabled()) {
logger.trace("400 Bad request: " + u);
}
- return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
+ return new ProtocolOutput(c,
+ new ProtocolStatus(ProtocolStatus.GONE, u));
} else if (code == 401) { // requires authorization, but no valid auth
// provided.
if (logger.isTraceEnabled()) {
logger.trace("401 Authentication Required");
}
- return new ProtocolOutput(c, new ProtocolStatus(
- ProtocolStatus.ACCESS_DENIED, "Authentication required: "
- + urlString));
+ return new ProtocolOutput(c,
+ new ProtocolStatus(ProtocolStatus.ACCESS_DENIED,
+ "Authentication required: " + urlString));
} else if (code == 404) {
- return new ProtocolOutput(c, new ProtocolStatus(
- ProtocolStatus.NOTFOUND, u));
+ return new ProtocolOutput(c,
+ new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
} else if (code == 410) { // permanently GONE
return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
"Http: " + code + " url=" + u));
@@ -494,11 +498,11 @@
public int getTimeout() {
return timeout;
}
-
+
public boolean isIfModifiedSinceEnabled() {
return enableIfModifiedsinceHeader;
}
-
+
public boolean isCookieEnabled() {
return enableCookieHeader;
}
@@ -542,19 +546,20 @@
}
return userAgent;
}
-
+
/**
- * If per-host cookies are configured, this method will look it up
- * for the given url.
+ * If per-host cookies are configured, this method will look it up for the
+ * given url.
*
- * @param url the url to look-up a cookie for
+ * @param url
+ * the url to look-up a cookie for
* @return the cookie or null
*/
public String getCookie(URL url) {
if (hostCookies != null) {
return hostCookies.get(url.getHost());
}
-
+
return null;
}
@@ -729,8 +734,8 @@
url = args[i];
}
- ProtocolOutput out = http
- .getProtocolOutput(new Text(url), new CrawlDatum());
+ ProtocolOutput out = http.getProtocolOutput(new Text(url),
+ new CrawlDatum());
Content content = out.getContent();
System.out.println("Status: " + out.getStatus());
@@ -752,10 +757,12 @@
List<Content> robotsTxtContent) {
return robots.getRobotRulesSet(this, url, robotsTxtContent);
}
-
+
/**
* Transforming a String[] into a HashMap for faster searching
- * @param input String[]
+ *
+ * @param input
+ * String[]
* @return a new HashMap
*/
private HashMap<String, String> arrayToMap(String[] input) {