Merge branch NUTCH-2716 of https://github.com/YossiTamari/nutch
- NUTCH-2716 protocol-http: Response headers are not stored for a compressed response
- NUTCH-2715 WARCExporter fails on large records
- upgrades lib-htmlunit to use version 3.141.59 of Selenium
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
index 02c2415..f401041 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
@@ -192,6 +192,7 @@
     ByteArrayOutputStream output = new ByteArrayOutputStream();
 
     String httpHeaders = metadata.get("_response.headers_");
+    httpHeaders = WARCUtils.fixHttpHeaders(httpHeaders, content.getContent().length);
 
     if (StringUtils.isNotBlank(httpHeaders)) {
       output.write(httpHeaders.getBytes());
diff --git a/src/java/org/apache/nutch/tools/WARCUtils.java b/src/java/org/apache/nutch/tools/WARCUtils.java
index a880783..1af6533 100644
--- a/src/java/org/apache/nutch/tools/WARCUtils.java
+++ b/src/java/org/apache/nutch/tools/WARCUtils.java
@@ -24,6 +24,7 @@
 import java.net.UnknownHostException;
 import java.util.Date;
 import java.util.List;
+import java.util.regex.Pattern;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.indexer.NutchDocument;
@@ -46,6 +47,11 @@
   public final static String CONFORMS_TO = "conformsTo";
   public final static String IP = "ip";
   public final static UUIDGenerator generator = new UUIDGenerator();
+  public static final String CRLF = "\r\n";
+  public static final String COLONSP = ": ";
+  protected static final Pattern PROBLEMATIC_HEADERS = Pattern
+      .compile("(?i)(?:Content-(?:Encoding|Length)|Transfer-Encoding)");
+  protected static final String X_HIDE_HEADER = "X-Crawler-";
 
   public static final ANVLRecord getWARCInfoContent(Configuration conf) {
     ANVLRecord record = new ANVLRecord();
@@ -167,4 +173,110 @@
 
     return record;
   }
+  
+  /**
+   * Fix verbatim HTTP response headers which may confuse WARC readers: a
+   * <code>Content-Encoding</code>, <code>Transfer-Encoding</code> or mismatching
+   * <code>Content-Length</code> header is kept but renamed with the prefix
+   * <code>X-Crawler-</code> and, for Content-Length, the actual length is added.
+   * Appends CRLFs so that the result ends with an empty line (<code>\r\n\r\n</code>).
+   * @param headers HTTP 1.1 or 1.0 response header string, CR-LF-separated
+   *          lines, first line is status line; may be null
+   * @param contentLength actual length of the content, compared against and used to replace <code>Content-Length</code>
+   * @return safe HTTP response header, or null if <code>headers</code> is null
+   */
+  public static final String fixHttpHeaders(String headers, int contentLength) {
+    if (headers==null) {
+      return null;
+    }
+    int start = 0, lineEnd = 0, last = 0, trailingCrLf= 0; // last: end of the input already copied, rewritten or dropped; trailingCrLf: number of CRLFs ending the current line
+    StringBuilder replace = new StringBuilder();
+    while (start < headers.length()) {
+      lineEnd = headers.indexOf(CRLF, start);
+      trailingCrLf = 1;
+      if (lineEnd == -1) {
+        lineEnd = headers.length(); // last line is not terminated by CRLF
+        trailingCrLf = 0;
+      }
+      int colonPos = -1; // position of the first colon in the current line, -1 if none
+      for (int i = start; i < lineEnd; i++) {
+        if (headers.charAt(i) == ':') {
+          colonPos = i;
+          break;
+        }
+      }
+      if (colonPos == -1) {
+        boolean valid = true;
+        if (start == 0) {
+          // status line (without colon)
+          // TODO: http/2
+        } else if ((lineEnd + 4) == headers.length()
+            && headers.endsWith(CRLF + CRLF)) {
+          // line is directly followed by the final CRLF CRLF: kept, both CRLFs consumed. NOTE(review): originally labelled "trailing empty line", but a non-empty colon-less last line also matches - confirm intent
+          trailingCrLf = 2;
+        } else {
+          valid = false;
+        }
+        if (!valid) {
+          if (last < start) {
+            replace.append(headers.substring(last, start)); // copy everything before the invalid line
+          }
+          last = lineEnd + 2 * trailingCrLf; // drop the invalid line including its CRLF
+        }
+        start = lineEnd + 2 * trailingCrLf;
+        /*
+         * line without colon (status line, kept last line, or dropped invalid
+         * line): no further check for problematic headers required
+         */
+        continue;
+      }
+      String name = headers.substring(start, colonPos); // header name, not trimmed
+      if (PROBLEMATIC_HEADERS.matcher(name).matches()) {
+        boolean needsFix = true;
+        if (name.equalsIgnoreCase("content-length")) {
+          String value = headers.substring(colonPos + 1, lineEnd).trim();
+          try {
+            int l = Integer.parseInt(value);
+            if (l == contentLength) {
+              needsFix = false; // Content-Length already matches, leave the header untouched
+            }
+          } catch (NumberFormatException e) {
+            // value is not parsable as int: needs to be fixed
+          }
+        }
+        if (needsFix) {
+          if (last < start) {
+            replace.append(headers.substring(last, start)); // copy unmodified lines preceding this header
+          }
+          last = lineEnd + 2 * trailingCrLf;
+          replace.append(X_HIDE_HEADER)
+              .append(headers.substring(start, lineEnd + 2 * trailingCrLf)); // keep the original header line, renamed by the prefix
+          if (trailingCrLf == 0) {
+            replace.append(CRLF); // terminate a last line which had no CRLF
+            trailingCrLf = 1;
+          }
+          if (name.equalsIgnoreCase("content-length")) {
+            // add effective uncompressed and unchunked length of content
+            replace.append("Content-Length").append(COLONSP)
+                .append(contentLength).append(CRLF);
+          }
+        }
+      }
+      start = lineEnd + 2 * trailingCrLf;
+    }
+    if (last > 0 || trailingCrLf != 2) { // something was rewritten or dropped, or the input did not end with the accepted CRLF CRLF
+      if (last < headers.length()) {
+        // append the unmodified remainder following the last rewritten or dropped line
+        replace.append(headers.substring(last));
+      }
+      while (trailingCrLf < 2) {
+        replace.append(CRLF);
+        trailingCrLf++;
+      }
+      return replace.toString();
+    }
+    return headers; // nothing had to be changed
+  }
+
+  
 }
diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
index 0b0b4c2..d307000 100644
--- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
+++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -51,6 +51,7 @@
 import org.apache.nutch.crawl.NutchWritable;
 import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.protocol.Content;
+import org.apache.nutch.tools.WARCUtils;
 import org.apache.nutch.util.HadoopFSUtil;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
@@ -144,6 +145,7 @@
 
         // were the headers stored as is? Can write a response element then
         String headersVerbatim = content.getMetadata().get("_response.headers_");
+        headersVerbatim = WARCUtils.fixHttpHeaders(headersVerbatim, content.getContent().length);
         byte[] httpheaders = new byte[0];
         if (StringUtils.isNotBlank(headersVerbatim)) {
           // check that ends with an empty line
@@ -241,7 +243,7 @@
           WARCRecord record = new WARCRecord(in);
           context.write(NullWritable.get(), new WARCWritable(record));
           context.getCounter("WARCExporter", "records generated").increment(1);
-        } catch (IOException exception) {
+        } catch (IOException | IllegalStateException exception) {
           LOG.error("Exception when generating WARC record for {} : {}", key,
               exception.getMessage());
           context.getCounter("WARCExporter", "exception").increment(1);
diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml
index 6430535..f54534c 100644
--- a/src/plugin/lib-htmlunit/ivy.xml
+++ b/src/plugin/lib-htmlunit/ivy.xml
@@ -37,7 +37,8 @@
 
   <dependencies>
     <!-- begin selenium dependencies -->
-    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+    <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="3.141.59" />
+    <dependency org="org.seleniumhq.selenium" name="htmlunit-driver" rev="2.35.1" />
     
     <dependency org="com.opera" name="operadriver" rev="1.5">
       <exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
diff --git a/src/plugin/lib-htmlunit/plugin.xml b/src/plugin/lib-htmlunit/plugin.xml
index 290a137..bdfed92 100644
--- a/src/plugin/lib-htmlunit/plugin.xml
+++ b/src/plugin/lib-htmlunit/plugin.xml
@@ -29,76 +29,97 @@
         <export name="*"/>
      </library>
      <!-- all classes from dependent libraries are exported -->
-     <library name="cglib-nodep-2.1_3.jar">
+     <library name="animal-sniffer-annotations-1.14.jar">
        <export name="*"/>
      </library>
-     <library name="commons-codec-1.9.jar">
+     <library name="byte-buddy-1.8.15.jar">
        <export name="*"/>
      </library>
-     <library name="commons-collections-3.2.1.jar">
+     <library name="checker-compat-qual-2.0.0.jar">
        <export name="*"/>
      </library>
-     <library name="commons-exec-1.1.jar">
+     <library name="commons-codec-1.11.jar">
        <export name="*"/>
      </library>
-     <library name="commons-io-2.4.jar">
+     <library name="commons-exec-1.3.jar">
+       <export name="*"/>
+     </library>
+     <library name="commons-io-2.6.jar">
        <export name="*"/>
      </library>
      <library name="commons-jxpath-1.3.jar">
        <export name="*"/>
      </library>
-     <library name="commons-lang3-3.3.2.jar">
+     <library name="commons-lang3-3.9.jar">
        <export name="*"/>
      </library>
-     <library name="commons-logging-1.1.3.jar">
+     <library name="commons-logging-1.2.jar">
        <export name="*"/>
      </library>
-     <library name="cssparser-0.9.14.jar">
+     <library name="commons-net-3.6.jar">
        <export name="*"/>
      </library>
-     <library name="gson-2.3.jar">
+     <library name="commons-text-1.6.jar">
        <export name="*"/>
      </library>
-     <library name="guava-18.0.jar">
+     <library name="error_prone_annotations-2.1.3.jar">
        <export name="*"/>
      </library>
-     <library name="htmlunit-2.15.jar">
+     <library name="guava-25.0-jre.jar">
        <export name="*"/>
      </library>
-     <library name="htmlunit-core-js-2.15.jar">
+     <library name="htmlunit-2.35.0.jar">
        <export name="*"/>
      </library>
-     <library name="httpclient-4.3.4.jar">
+     <library name="htmlunit-core-js-2.35.0.jar">
        <export name="*"/>
      </library>
-     <library name="httpcore-4.3.2.jar">
+     <library name="htmlunit-cssparser-1.4.0.jar">
        <export name="*"/>
      </library>
-     <library name="httpmime-4.3.3.jar">
+     <library name="htmlunit-driver-2.35.1.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpclient-4.5.8.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpcore-4.4.11.jar">
+       <export name="*"/>
+     </library>
+     <library name="httpmime-4.5.8.jar">
        <export name="*"/>
      </library>
      <library name="ini4j-0.5.2.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-http-8.1.15.v20140411.jar">
+     <library name="j2objc-annotations-1.1.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-io-8.1.15.v20140411.jar">
+     <library name="jetty-client-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-util-8.1.15.v20140411.jar">
+     <library name="jetty-http-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="jetty-websocket-8.1.15.v20140411.jar">
+     <library name="jetty-io-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="jna-3.4.0.jar">
+     <library name="jetty-util-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="nekohtml-1.9.21.jar">
+     <library name="jetty-xml-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="netty-3.5.2.Final.jar">
+     <library name="jsr305-1.3.9.jar">
+       <export name="*"/>
+     </library>
+     <library name="neko-htmlunit-2.35.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="okhttp-3.11.0.jar">
+       <export name="*"/>
+     </library>
+     <library name="okio-1.14.0.jar">
        <export name="*"/>
      </library>
      <library name="operadriver-1.5.jar">
@@ -110,52 +131,55 @@
      <library name="phantomjsdriver-1.2.1.jar">
        <export name="*"/>
      </library>
-     <library name="platform-3.4.0.jar">
-       <export name="*"/>
-     </library>
      <library name="protobuf-java-2.4.1.jar">
        <export name="*"/>
      </library>
-     <library name="sac-1.3.jar">
+     <library name="selenium-api-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-api-2.44.0.jar">
+     <library name="selenium-chrome-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-chrome-driver-2.44.0.jar">
+     <library name="selenium-edge-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-firefox-driver-2.44.0.jar">
+     <library name="selenium-firefox-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-htmlunit-driver-2.44.0.jar">
+     <library name="selenium-ie-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-ie-driver-2.44.0.jar">
+     <library name="selenium-java-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-java-2.44.0.jar">
+     <library name="selenium-opera-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-remote-driver-2.44.0.jar">
+     <library name="selenium-remote-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-safari-driver-2.44.0.jar">
+     <library name="selenium-safari-driver-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="selenium-support-2.44.0.jar">
+     <library name="selenium-support-3.141.59.jar">
        <export name="*"/>
      </library>
-     <library name="serializer-2.7.1.jar">
+     <library name="serializer-2.7.2.jar">
        <export name="*"/>
      </library>
-     <library name="webbit-0.4.14.jar">
+     <library name="websocket-api-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="xalan-2.7.1.jar">
+     <library name="websocket-client-9.4.16.v20190411.jar">
        <export name="*"/>
      </library>
-     <library name="xercesImpl-2.11.0.jar">
+     <library name="websocket-common-9.4.16.v20190411.jar">
+       <export name="*"/>
+     </library>
+     <library name="xalan-2.7.2.jar">
+       <export name="*"/>
+     </library>
+     <library name="xercesImpl-2.12.0.jar">
        <export name="*"/>
      </library>
      <library name="xml-apis-1.4.01.jar">
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 6cc0c4b..e76bc04 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -262,16 +262,14 @@
           } else if ("deflate".equals(contentEncoding)) {
             content = http.processDeflateEncoded(content, url);
           } else {
-            // store the headers verbatim only if the response was not compressed
-            // as the content length reported with not match otherwise
-            if (httpHeaders != null) {
-              headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
-            }
             if (Http.LOG.isTraceEnabled()) {
               Http.LOG.trace("fetched " + content.length + " bytes from " + url);
             }
           }
         }
+        if (httpHeaders != null) {
+          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+        }
       }
 
     } finally {
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 25efb5e..5a4b1ef 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -294,16 +294,14 @@
         } else if ("deflate".equals(contentEncoding)) {
           content = http.processDeflateEncoded(content, url);
         } else {
-          // store the headers verbatim only if the response was not compressed
-          // as the content length reported does not match otherwise
-          if (httpHeaders != null) {
-            httpHeaders.append("\r\n");
-            headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
-          }
           if (Http.LOG.isTraceEnabled()) {
             Http.LOG.trace("fetched " + content.length + " bytes from " + url);
           }
         }
+        if (httpHeaders != null) {
+          httpHeaders.append("\r\n");
+          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+        }
       } catch (IOException | HttpException e) {
         // Headers parsing went fine, but an error occurred while trying to read
         // the body of the request (the body may be malformed)
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
index 516b2ec..4a20b04 100644
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -295,6 +295,9 @@
             }
           }
         }
+        if (httpHeaders != null) {
+          headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+        }
       } 
 
     } finally {