Merge branch NUTCH-2716 of https://github.com/YossiTamari/nutch
- NUTCH-2716 protocol-http: Response headers are not stored for a compressed response
- NUTCH-2715 WARCExporter fails on large records
- upgrades lib-htmlunit to use version 3.141.59 of Selenium
diff --git a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
index 02c2415..f401041 100644
--- a/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
+++ b/src/java/org/apache/nutch/tools/CommonCrawlFormatWARC.java
@@ -192,6 +192,7 @@
ByteArrayOutputStream output = new ByteArrayOutputStream();
String httpHeaders = metadata.get("_response.headers_");
+ httpHeaders = WARCUtils.fixHttpHeaders(httpHeaders, content.getContent().length);
if (StringUtils.isNotBlank(httpHeaders)) {
output.write(httpHeaders.getBytes());
diff --git a/src/java/org/apache/nutch/tools/WARCUtils.java b/src/java/org/apache/nutch/tools/WARCUtils.java
index a880783..1af6533 100644
--- a/src/java/org/apache/nutch/tools/WARCUtils.java
+++ b/src/java/org/apache/nutch/tools/WARCUtils.java
@@ -24,6 +24,7 @@
import java.net.UnknownHostException;
import java.util.Date;
import java.util.List;
+import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.indexer.NutchDocument;
@@ -46,6 +47,11 @@
public final static String CONFORMS_TO = "conformsTo";
public final static String IP = "ip";
public final static UUIDGenerator generator = new UUIDGenerator();
+ public static final String CRLF = "\r\n";
+ public static final String COLONSP = ": ";
+ protected static final Pattern PROBLEMATIC_HEADERS = Pattern
+ .compile("(?i)(?:Content-(?:Encoding|Length)|Transfer-Encoding)");
+ protected static final String X_HIDE_HEADER = "X-Crawler-";
public static final ANVLRecord getWARCInfoContent(Configuration conf) {
ANVLRecord record = new ANVLRecord();
@@ -167,4 +173,110 @@
return record;
}
+
+ /**
+ * Modify verbatim HTTP response headers: fix, remove or replace headers
+ * <code>Content-Length</code>, <code>Content-Encoding</code> and
+ * <code>Transfer-Encoding</code> which may confuse WARC readers. Ensure that
+ * the returned header ends with a single empty line (<code>\r\n\r\n</code>).
+ * <p>
+ * A matching header is kept but renamed by prefixing its name with
+ * <code>X-Crawler-</code>. A <code>Content-Length</code> header whose value
+ * parses to an integer equal to <code>contentLength</code> is left
+ * untouched; otherwise it is renamed and a new <code>Content-Length</code>
+ * header carrying <code>contentLength</code> is written right after it.
+ * Lines without a colon are dropped, except for the first (status) line and
+ * a line that is directly followed by the terminating
+ * <code>\r\n\r\n</code>.
+ *
+ * @param headers
+ * HTTP 1.1 or 1.0 response header string, CR-LF-separated lines,
+ * first line is status line
+ * @param contentLength
+ * length the <code>Content-Length</code> value is compared against
+ * and which is written into the replacement header (the effective
+ * uncompressed and unchunked length of the content)
+ * @return safe HTTP response header, or <code>null</code> if
+ * <code>headers</code> is <code>null</code>
+ */
+ public static final String fixHttpHeaders(String headers, int contentLength) {
+ if (headers==null) {
+ return null;
+ }
+ int start = 0, lineEnd = 0, last = 0, trailingCrLf= 0; // start/lineEnd: bounds of the current line; last: end of the input already copied or dropped; trailingCrLf: number of CRLFs ending the current line
+ StringBuilder replace = new StringBuilder(); // rewritten headers, only appended to once a line is changed or dropped
+ while (start < headers.length()) {
+ lineEnd = headers.indexOf(CRLF, start);
+ trailingCrLf = 1;
+ if (lineEnd == -1) {
+ lineEnd = headers.length(); // last line lacks a terminating CRLF
+ trailingCrLf = 0;
+ }
+ int colonPos = -1; // index of the first ':' in the current line, -1 if none
+ for (int i = start; i < lineEnd; i++) {
+ if (headers.charAt(i) == ':') {
+ colonPos = i;
+ break;
+ }
+ }
+ if (colonPos == -1) {
+ boolean valid = true;
+ if (start == 0) {
+ // status line (without colon)
+ // TODO: http/2
+ } else if ((lineEnd + 4) == headers.length()
+ && headers.endsWith(CRLF + CRLF)) {
+ // ok, trailing empty line: this line's CRLF is followed by exactly one more CRLF which ends the input
+ trailingCrLf = 2;
+ } else {
+ valid = false;
+ }
+ if (!valid) {
+ if (last < start) {
+ replace.append(headers.substring(last, start)); // flush the unchanged lines seen since the last change
+ }
+ last = lineEnd + 2 * trailingCrLf; // drop the invalid line from the output
+ }
+ start = lineEnd + 2 * trailingCrLf;
+ /*
+ * skip over invalid header line, no further check for problematic
+ * headers required
+ */
+ continue;
+ }
+ String name = headers.substring(start, colonPos);
+ if (PROBLEMATIC_HEADERS.matcher(name).matches()) { // the whole header name must match
+ boolean needsFix = true;
+ if (name.equalsIgnoreCase("content-length")) {
+ String value = headers.substring(colonPos + 1, lineEnd).trim();
+ try {
+ int l = Integer.parseInt(value);
+ if (l == contentLength) { // value already equals the length passed in, keep the header as is
+ needsFix = false;
+ }
+ } catch (NumberFormatException e) {
+ // needs to be fixed
+ }
+ }
+ if (needsFix) {
+ if (last < start) {
+ replace.append(headers.substring(last, start)); // flush the unchanged lines seen since the last change
+ }
+ last = lineEnd + 2 * trailingCrLf;
+ replace.append(X_HIDE_HEADER) // keep the original header line, renamed with the hide prefix
+ .append(headers.substring(start, lineEnd + 2 * trailingCrLf));
+ if (trailingCrLf == 0) {
+ replace.append(CRLF);
+ trailingCrLf = 1; // the line is now CRLF-terminated in the output
+ }
+ if (name.equalsIgnoreCase("content-length")) {
+ // add effective uncompressed and unchunked length of content
+ replace.append("Content-Length").append(COLONSP)
+ .append(contentLength).append(CRLF);
+ }
+ }
+ }
+ start = lineEnd + 2 * trailingCrLf;
+ }
+ if (last > 0 || trailingCrLf != 2) { // a line was rewritten or dropped, or the terminating empty line still has to be added
+ if (last < headers.length()) {
+ // append trailing headers
+ replace.append(headers.substring(last));
+ }
+ while (trailingCrLf < 2) {
+ replace.append(CRLF);
+ trailingCrLf++;
+ }
+ return replace.toString();
+ }
+ return headers; // nothing changed and the input already ends with an empty line
+ }
+
+
}
diff --git a/src/java/org/apache/nutch/tools/warc/WARCExporter.java b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
index 0b0b4c2..d307000 100644
--- a/src/java/org/apache/nutch/tools/warc/WARCExporter.java
+++ b/src/java/org/apache/nutch/tools/warc/WARCExporter.java
@@ -51,6 +51,7 @@
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.protocol.Content;
+import org.apache.nutch.tools.WARCUtils;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
@@ -144,6 +145,7 @@
// were the headers stored as is? Can write a response element then
String headersVerbatim = content.getMetadata().get("_response.headers_");
+ headersVerbatim = WARCUtils.fixHttpHeaders(headersVerbatim, content.getContent().length);
byte[] httpheaders = new byte[0];
if (StringUtils.isNotBlank(headersVerbatim)) {
// check that ends with an empty line
@@ -241,7 +243,7 @@
WARCRecord record = new WARCRecord(in);
context.write(NullWritable.get(), new WARCWritable(record));
context.getCounter("WARCExporter", "records generated").increment(1);
- } catch (IOException exception) {
+ } catch (IOException | IllegalStateException exception) {
LOG.error("Exception when generating WARC record for {} : {}", key,
exception.getMessage());
context.getCounter("WARCExporter", "exception").increment(1);
diff --git a/src/plugin/lib-htmlunit/ivy.xml b/src/plugin/lib-htmlunit/ivy.xml
index 6430535..f54534c 100644
--- a/src/plugin/lib-htmlunit/ivy.xml
+++ b/src/plugin/lib-htmlunit/ivy.xml
@@ -37,7 +37,8 @@
<dependencies>
<!-- begin selenium dependencies -->
- <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="2.44.0" />
+ <dependency org="org.seleniumhq.selenium" name="selenium-java" rev="3.141.59" />
+ <dependency org="org.seleniumhq.selenium" name="htmlunit-driver" rev="2.35.1" />
<dependency org="com.opera" name="operadriver" rev="1.5">
<exclude org="org.seleniumhq.selenium" name="selenium-remote-driver" />
diff --git a/src/plugin/lib-htmlunit/plugin.xml b/src/plugin/lib-htmlunit/plugin.xml
index 290a137..bdfed92 100644
--- a/src/plugin/lib-htmlunit/plugin.xml
+++ b/src/plugin/lib-htmlunit/plugin.xml
@@ -29,76 +29,97 @@
<export name="*"/>
</library>
<!-- all classes from dependent libraries are exported -->
- <library name="cglib-nodep-2.1_3.jar">
+ <library name="animal-sniffer-annotations-1.14.jar">
<export name="*"/>
</library>
- <library name="commons-codec-1.9.jar">
+ <library name="byte-buddy-1.8.15.jar">
<export name="*"/>
</library>
- <library name="commons-collections-3.2.1.jar">
+ <library name="checker-compat-qual-2.0.0.jar">
<export name="*"/>
</library>
- <library name="commons-exec-1.1.jar">
+ <library name="commons-codec-1.11.jar">
<export name="*"/>
</library>
- <library name="commons-io-2.4.jar">
+ <library name="commons-exec-1.3.jar">
+ <export name="*"/>
+ </library>
+ <library name="commons-io-2.6.jar">
<export name="*"/>
</library>
<library name="commons-jxpath-1.3.jar">
<export name="*"/>
</library>
- <library name="commons-lang3-3.3.2.jar">
+ <library name="commons-lang3-3.9.jar">
<export name="*"/>
</library>
- <library name="commons-logging-1.1.3.jar">
+ <library name="commons-logging-1.2.jar">
<export name="*"/>
</library>
- <library name="cssparser-0.9.14.jar">
+ <library name="commons-net-3.6.jar">
<export name="*"/>
</library>
- <library name="gson-2.3.jar">
+ <library name="commons-text-1.6.jar">
<export name="*"/>
</library>
- <library name="guava-18.0.jar">
+ <library name="error_prone_annotations-2.1.3.jar">
<export name="*"/>
</library>
- <library name="htmlunit-2.15.jar">
+ <library name="guava-25.0-jre.jar">
<export name="*"/>
</library>
- <library name="htmlunit-core-js-2.15.jar">
+ <library name="htmlunit-2.35.0.jar">
<export name="*"/>
</library>
- <library name="httpclient-4.3.4.jar">
+ <library name="htmlunit-core-js-2.35.0.jar">
<export name="*"/>
</library>
- <library name="httpcore-4.3.2.jar">
+ <library name="htmlunit-cssparser-1.4.0.jar">
<export name="*"/>
</library>
- <library name="httpmime-4.3.3.jar">
+ <library name="htmlunit-driver-2.35.1.jar">
+ <export name="*"/>
+ </library>
+ <library name="httpclient-4.5.8.jar">
+ <export name="*"/>
+ </library>
+ <library name="httpcore-4.4.11.jar">
+ <export name="*"/>
+ </library>
+ <library name="httpmime-4.5.8.jar">
<export name="*"/>
</library>
<library name="ini4j-0.5.2.jar">
<export name="*"/>
</library>
- <library name="jetty-http-8.1.15.v20140411.jar">
+ <library name="j2objc-annotations-1.1.jar">
<export name="*"/>
</library>
- <library name="jetty-io-8.1.15.v20140411.jar">
+ <library name="jetty-client-9.4.16.v20190411.jar">
<export name="*"/>
</library>
- <library name="jetty-util-8.1.15.v20140411.jar">
+ <library name="jetty-http-9.4.16.v20190411.jar">
<export name="*"/>
</library>
- <library name="jetty-websocket-8.1.15.v20140411.jar">
+ <library name="jetty-io-9.4.16.v20190411.jar">
<export name="*"/>
</library>
- <library name="jna-3.4.0.jar">
+ <library name="jetty-util-9.4.16.v20190411.jar">
<export name="*"/>
</library>
- <library name="nekohtml-1.9.21.jar">
+ <library name="jetty-xml-9.4.16.v20190411.jar">
<export name="*"/>
</library>
- <library name="netty-3.5.2.Final.jar">
+ <library name="jsr305-1.3.9.jar">
+ <export name="*"/>
+ </library>
+ <library name="neko-htmlunit-2.35.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="okhttp-3.11.0.jar">
+ <export name="*"/>
+ </library>
+ <library name="okio-1.14.0.jar">
<export name="*"/>
</library>
<library name="operadriver-1.5.jar">
@@ -110,52 +131,55 @@
<library name="phantomjsdriver-1.2.1.jar">
<export name="*"/>
</library>
- <library name="platform-3.4.0.jar">
- <export name="*"/>
- </library>
<library name="protobuf-java-2.4.1.jar">
<export name="*"/>
</library>
- <library name="sac-1.3.jar">
+ <library name="selenium-api-3.141.59.jar">
<export name="*"/>
</library>
- <library name="selenium-api-2.44.0.jar">
+ <library name="selenium-chrome-driver-3.141.59.jar">
<export name="*"/>
</library>
- <library name="selenium-chrome-driver-2.44.0.jar">
+ <library name="selenium-edge-driver-3.141.59.jar">
<export name="*"/>
</library>
- <library name="selenium-firefox-driver-2.44.0.jar">
+ <library name="selenium-firefox-driver-3.141.59.jar">
<export name="*"/>
</library>
- <library name="selenium-htmlunit-driver-2.44.0.jar">
+ <library name="selenium-ie-driver-3.141.59.jar">
<export name="*"/>
</library>
- <library name="selenium-ie-driver-2.44.0.jar">
+ <library name="selenium-java-3.141.59.jar">
<export name="*"/>
</library>
- <library name="selenium-java-2.44.0.jar">
+ <library name="selenium-opera-driver-3.141.59.jar">
<export name="*"/>
</library>
- <library name="selenium-remote-driver-2.44.0.jar">
+ <library name="selenium-remote-driver-3.141.59.jar">
<export name="*"/>
</library>
- <library name="selenium-safari-driver-2.44.0.jar">
+ <library name="selenium-safari-driver-3.141.59.jar">
<export name="*"/>
</library>
- <library name="selenium-support-2.44.0.jar">
+ <library name="selenium-support-3.141.59.jar">
<export name="*"/>
</library>
- <library name="serializer-2.7.1.jar">
+ <library name="serializer-2.7.2.jar">
<export name="*"/>
</library>
- <library name="webbit-0.4.14.jar">
+ <library name="websocket-api-9.4.16.v20190411.jar">
<export name="*"/>
</library>
- <library name="xalan-2.7.1.jar">
+ <library name="websocket-client-9.4.16.v20190411.jar">
<export name="*"/>
</library>
- <library name="xercesImpl-2.11.0.jar">
+ <library name="websocket-common-9.4.16.v20190411.jar">
+ <export name="*"/>
+ </library>
+ <library name="xalan-2.7.2.jar">
+ <export name="*"/>
+ </library>
+ <library name="xercesImpl-2.12.0.jar">
<export name="*"/>
</library>
<library name="xml-apis-1.4.01.jar">
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
index 6cc0c4b..e76bc04 100644
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
@@ -262,16 +262,14 @@
} else if ("deflate".equals(contentEncoding)) {
content = http.processDeflateEncoded(content, url);
} else {
- // store the headers verbatim only if the response was not compressed
- // as the content length reported with not match otherwise
- if (httpHeaders != null) {
- headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
- }
if (Http.LOG.isTraceEnabled()) {
Http.LOG.trace("fetched " + content.length + " bytes from " + url);
}
}
}
+ if (httpHeaders != null) {
+ headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+ }
}
} finally {
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index 25efb5e..5a4b1ef 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -294,16 +294,14 @@
} else if ("deflate".equals(contentEncoding)) {
content = http.processDeflateEncoded(content, url);
} else {
- // store the headers verbatim only if the response was not compressed
- // as the content length reported does not match otherwise
- if (httpHeaders != null) {
- httpHeaders.append("\r\n");
- headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
- }
if (Http.LOG.isTraceEnabled()) {
Http.LOG.trace("fetched " + content.length + " bytes from " + url);
}
}
+ if (httpHeaders != null) {
+ httpHeaders.append("\r\n");
+ headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+ }
} catch (IOException | HttpException e) {
// Headers parsing went fine, but an error occurred while trying to read
// the body of the request (the body may be malformed)
diff --git a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
index 516b2ec..4a20b04 100644
--- a/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
+++ b/src/plugin/protocol-selenium/src/java/org/apache/nutch/protocol/selenium/HttpResponse.java
@@ -295,6 +295,9 @@
}
}
}
+ if (httpHeaders != null) {
+ headers.add(Response.RESPONSE_HEADERS, httpHeaders.toString());
+ }
}
} finally {