Merge pull request #538 from balashashanka/NUTCH-2782

NUTCH-2782: protocol-http / lib-http: support TLSv1.3
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 30e2432..8e96a26 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -61,7 +61,7 @@
   public static final Text RESPONSE_TIME = new Text("_rs_");
 
   public static final Text COOKIE = new Text("Cookie");
-  
+
   public static final int BUFFER_SIZE = 8 * 1024;
 
   private static final byte[] EMPTY_CONTENT = new byte[0];
@@ -69,7 +69,7 @@
   private HttpRobotRulesParser robots = null;
 
   private ArrayList<String> userAgentNames = null;
-  
+
   /** Mapping hostnames to cookies */
   private Map<String, String> hostCookies = null;
 
@@ -78,12 +78,12 @@
 
   /** The proxy port. */
   protected int proxyPort = 8080;
-  
+
-  /** The proxy port. */
+  /** The proxy type. */
   protected Proxy.Type proxyType = Proxy.Type.HTTP;
 
   /** The proxy exception list. */
-  protected HashMap<String,String> proxyException = new HashMap<>();
+  protected HashMap<String, String> proxyException = new HashMap<>();
 
   /** Indicates if a proxy is used */
   protected boolean useProxy = false;
@@ -177,11 +177,14 @@
 
   /** Which TLS/SSL cipher suites to support */
   protected Set<String> tlsPreferredCipherSuites;
-  
+
   /** Configuration directive for If-Modified-Since HTTP header */
   protected boolean enableIfModifiedsinceHeader = true;
-  
-  /** Controls whether or not to set Cookie HTTP header based on CrawlDatum metadata */
+
+  /**
+   * Controls whether or not to set Cookie HTTP header based on CrawlDatum
+   * metadata
+   */
   protected boolean enableCookieHeader = true;
 
   /** Creates a new instance of HttpBase */
@@ -203,13 +206,13 @@
     this.proxyHost = conf.get("http.proxy.host");
     this.proxyPort = conf.getInt("http.proxy.port", 8080);
     this.proxyType = Proxy.Type.valueOf(conf.get("http.proxy.type", "HTTP"));
-    this.proxyException = arrayToMap(conf.getStrings("http.proxy.exception.list"));
+    this.proxyException = arrayToMap(
+        conf.getStrings("http.proxy.exception.list"));
     this.useProxy = (proxyHost != null && proxyHost.length() > 0);
     this.timeout = conf.getInt("http.timeout", 10000);
     this.maxContent = conf.getInt("http.content.limit", 1024 * 1024);
     this.maxDuration = conf.getInt("http.time.limit", -1);
-    this.partialAsTruncated = conf
-        .getBoolean("http.partial.truncated", false);
+    this.partialAsTruncated = conf.getBoolean("http.partial.truncated", false);
     this.userAgent = getAgentString(conf.get("http.agent.name"),
         conf.get("http.agent.version"), conf.get("http.agent.description"),
         conf.get("http.agent.url"), conf.get("http.agent.email"));
@@ -227,8 +230,10 @@
     this.storeIPAddress = conf.getBoolean("store.ip.address", false);
     this.storeHttpRequest = conf.getBoolean("store.http.request", false);
     this.storeHttpHeaders = conf.getBoolean("store.http.headers", false);
-    this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
-    this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true);
+    this.enableIfModifiedsinceHeader = conf
+        .getBoolean("http.enable.if.modified.since.header", true);
+    this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header",
+        true);
     this.robots.setConf(conf);
 
     this.logUtil.setConf(conf);
@@ -267,19 +272,20 @@
         }
       }
       if (userAgentNames == null) {
-        logger
-            .warn("Falling back to fixed user agent set via property http.agent.name");
+        logger.warn(
+            "Falling back to fixed user agent set via property http.agent.name");
       }
     }
-    
+
     // If cookies are enabled, try to load a per-host cookie file
     if (enableCookieHeader) {
-      String cookieFile = conf.get("http.agent.host.cookie.file", "cookies.txt");
+      String cookieFile = conf.get("http.agent.host.cookie.file",
+          "cookies.txt");
       BufferedReader br = null;
       try {
         Reader reader = conf.getConfResourceAsReader(cookieFile);
         br = new BufferedReader(reader);
-        hostCookies = new HashMap<String,String>();
+        hostCookies = new HashMap<String, String>();
         String word = "";
         while ((word = br.readLine()) != null) {
           if (!word.trim().isEmpty()) {
@@ -294,8 +300,8 @@
           }
         }
       } catch (Exception e) {
-        logger.warn("Failed to read http.agent.host.cookie.file {}: {}", cookieFile,
-            StringUtils.stringifyException(e));
+        logger.warn("Failed to read http.agent.host.cookie.file {}: {}",
+            cookieFile, StringUtils.stringifyException(e));
         hostCookies = null;
       } finally {
         if (br != null) {
@@ -309,16 +315,12 @@
     }
 
     String[] protocols = conf.getStrings("http.tls.supported.protocols",
-        "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
+        "TLSv1.3", "TLSv1.2", "TLSv1.1", "TLSv1", "SSLv3");
     String[] ciphers = conf.getStrings("http.tls.supported.cipher.suites",
-        "ECDHE-ECDSA-AES128-GCM-SHA256",
-        "ECDHE-RSA-AES128-GCM-SHA256",
-        "ECDHE-ECDSA-AES256-GCM-SHA384",
-        "ECDHE-RSA-AES256-GCM-SHA384",
-        "ECDHE-ECDSA-CHACHA20-POLY1305",
-        "ECDHE-RSA-CHACHA20-POLY1305",
-        "DHE-RSA-AES128-GCM-SHA256",
-        "DHE-RSA-AES256-GCM-SHA384",
+        "ECDHE-ECDSA-AES128-GCM-SHA256", "ECDHE-RSA-AES128-GCM-SHA256",
+        "ECDHE-ECDSA-AES256-GCM-SHA384", "ECDHE-RSA-AES256-GCM-SHA384",
+        "ECDHE-ECDSA-CHACHA20-POLY1305", "ECDHE-RSA-CHACHA20-POLY1305",
+        "DHE-RSA-AES128-GCM-SHA256", "DHE-RSA-AES256-GCM-SHA384",
         "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA384",
         "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA384",
         "TLS_RSA_WITH_AES_256_CBC_SHA256",
@@ -329,8 +331,8 @@
         "TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA",
         "TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA", "TLS_RSA_WITH_AES_256_CBC_SHA",
         "TLS_ECDH_ECDSA_WITH_AES_256_CBC_SHA",
-        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA",
-        "TLS_DHE_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
+        "TLS_ECDH_RSA_WITH_AES_256_CBC_SHA", "TLS_DHE_RSA_WITH_AES_256_CBC_SHA",
+        "TLS_DHE_DSS_WITH_AES_256_CBC_SHA",
         "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256",
         "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256",
         "TLS_RSA_WITH_AES_128_CBC_SHA256",
@@ -341,11 +343,10 @@
         "TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA",
         "TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA", "TLS_RSA_WITH_AES_128_CBC_SHA",
         "TLS_ECDH_ECDSA_WITH_AES_128_CBC_SHA",
-        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA",
-        "TLS_DHE_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_DSS_WITH_AES_128_CBC_SHA",
-        "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA", "TLS_ECDHE_RSA_WITH_RC4_128_SHA",
-        "SSL_RSA_WITH_RC4_128_SHA", "TLS_ECDH_ECDSA_WITH_RC4_128_SHA",
-        "TLS_ECDH_RSA_WITH_RC4_128_SHA",
+        "TLS_ECDH_RSA_WITH_AES_128_CBC_SHA", "TLS_DHE_RSA_WITH_AES_128_CBC_SHA",
+        "TLS_DHE_DSS_WITH_AES_128_CBC_SHA", "TLS_ECDHE_ECDSA_WITH_RC4_128_SHA",
+        "TLS_ECDHE_RSA_WITH_RC4_128_SHA", "SSL_RSA_WITH_RC4_128_SHA",
+        "TLS_ECDH_ECDSA_WITH_RC4_128_SHA", "TLS_ECDH_RSA_WITH_RC4_128_SHA",
         "TLS_ECDHE_ECDSA_WITH_3DES_EDE_CBC_SHA",
         "TLS_ECDHE_RSA_WITH_3DES_EDE_CBC_SHA", "SSL_RSA_WITH_3DES_EDE_CBC_SHA",
         "TLS_ECDH_ECDSA_WITH_3DES_EDE_CBC_SHA",
@@ -360,7 +361,9 @@
         "SSL_DHE_DSS_WITH_DES_CBC_SHA", "TLS_KRB5_WITH_RC4_128_SHA",
         "TLS_KRB5_WITH_RC4_128_MD5", "TLS_KRB5_WITH_3DES_EDE_CBC_SHA",
         "TLS_KRB5_WITH_3DES_EDE_CBC_MD5", "TLS_KRB5_WITH_DES_CBC_SHA",
-        "TLS_KRB5_WITH_DES_CBC_MD5");
+        "TLS_KRB5_WITH_DES_CBC_MD5", "TLS_AES_256_GCM_SHA384",
+        "TLS_CHACHA20_POLY1305_SHA256", "TLS_AES_128_GCM_SHA256",
+        "TLS_AES_128_CCM_8_SHA256", "TLS_AES_128_CCM_SHA256");
 
     tlsPreferredProtocols = new HashSet<String>(Arrays.asList(protocols));
     tlsPreferredCipherSuites = new HashSet<String>(Arrays.asList(ciphers));
@@ -389,7 +392,7 @@
 
       int code = response.getCode();
       datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
-        new Text(Integer.toString(code)));
+          new Text(Integer.toString(code)));
 
       byte[] content = response.getContent();
       Content c = new Content(u.toString(), u.toString(),
@@ -433,18 +436,19 @@
         if (logger.isTraceEnabled()) {
           logger.trace("400 Bad request: " + u);
         }
-        return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
+        return new ProtocolOutput(c,
+            new ProtocolStatus(ProtocolStatus.GONE, u));
       } else if (code == 401) { // requires authorization, but no valid auth
                                 // provided.
         if (logger.isTraceEnabled()) {
           logger.trace("401 Authentication Required");
         }
-        return new ProtocolOutput(c, new ProtocolStatus(
-            ProtocolStatus.ACCESS_DENIED, "Authentication required: "
-                + urlString));
+        return new ProtocolOutput(c,
+            new ProtocolStatus(ProtocolStatus.ACCESS_DENIED,
+                "Authentication required: " + urlString));
       } else if (code == 404) {
-        return new ProtocolOutput(c, new ProtocolStatus(
-            ProtocolStatus.NOTFOUND, u));
+        return new ProtocolOutput(c,
+            new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
       } else if (code == 410) { // permanently GONE
         return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE,
             "Http: " + code + " url=" + u));
@@ -494,11 +498,11 @@
   public int getTimeout() {
     return timeout;
   }
-  
+
   public boolean isIfModifiedSinceEnabled() {
     return enableIfModifiedsinceHeader;
   }
-  
+
   public boolean isCookieEnabled() {
     return enableCookieHeader;
   }
@@ -542,19 +546,20 @@
     }
     return userAgent;
   }
-  
+
   /**
-   * If per-host cookies are configured, this method will look it up
-   * for the given url.
+   * If per-host cookies are configured, this method will look it up for the
+   * given url.
    *
-   * @param url the url to look-up a cookie for
+   * @param url
+   *          the url to look-up a cookie for
    * @return the cookie or null
    */
   public String getCookie(URL url) {
     if (hostCookies != null) {
       return hostCookies.get(url.getHost());
     }
-    
+
     return null;
   }
 
@@ -729,8 +734,8 @@
         url = args[i];
     }
 
-    ProtocolOutput out = http
-        .getProtocolOutput(new Text(url), new CrawlDatum());
+    ProtocolOutput out = http.getProtocolOutput(new Text(url),
+        new CrawlDatum());
     Content content = out.getContent();
 
     System.out.println("Status: " + out.getStatus());
@@ -752,10 +757,12 @@
       List<Content> robotsTxtContent) {
     return robots.getRobotRulesSet(this, url, robotsTxtContent);
   }
-  
+
   /**
    * Transforming a String[] into a HashMap for faster searching
-   * @param input String[]
+   *
+   * @param input
+   *          String[]
    * @return a new HashMap
    */
   private HashMap<String, String> arrayToMap(String[] input) {