Merge pull request #174 from kamaci/NUTCH-2171
NUTCH-2171 Nutch upgrade to Java 1.8
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index ea7df89..08fb8a0 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -344,6 +344,14 @@
</description>
</property>
+<property>
+ <name>http.enable.cookie.header</name>
+ <value>true</value>
+ <description>Whether Nutch sends an HTTP Cookie header. The cookie value
+ is read from the CrawlDatum Cookie metadata field.
+ </description>
+</property>
+
<!-- FTP properties -->
<property>
diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 41b63e3..eb3eb60 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
@@ -59,6 +59,8 @@
public static final Text RESPONSE_TIME = new Text("_rs_");
+ public static final Text COOKIE = new Text("Cookie");
+
public static final int BUFFER_SIZE = 8 * 1024;
private static final byte[] EMPTY_CONTENT = new byte[0];
@@ -124,7 +126,10 @@
protected Set<String> tlsPreferredCipherSuites;
/** Configuration directive for If-Modified-Since HTTP header */
- public boolean enableIfModifiedsinceHeader = true;
+ protected boolean enableIfModifiedsinceHeader = true;
+
+ /** Controls whether or not to set Cookie HTTP header based on CrawlDatum metadata */
+ protected boolean enableCookieHeader = true;
/** Creates a new instance of HttpBase */
public HttpBase() {
@@ -157,6 +162,7 @@
this.useHttp11 = conf.getBoolean("http.useHttp11", false);
this.responseTime = conf.getBoolean("http.store.responsetime", true);
this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
+ this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true);
this.robots.setConf(conf);
// NUTCH-1941: read list of alternating agent names
@@ -369,6 +375,10 @@
public boolean isIfModifiedSinceEnabled() {
return enableIfModifiedsinceHeader;
}
+
+ public boolean isCookieEnabled() {
+ return enableCookieHeader;
+ }
public int getMaxContent() {
return maxContent;
@@ -458,6 +468,7 @@
logger.info("http.agent = " + userAgent);
logger.info("http.accept.language = " + acceptLanguage);
logger.info("http.accept = " + accept);
+ logger.info("http.enable.cookie.header = " + isCookieEnabled());
}
}
@@ -584,4 +595,4 @@
}
return hm;
}
-}
+}
\ No newline at end of file
diff --git a/src/plugin/parsefilter-regex/README.txt b/src/plugin/parsefilter-regex/README.txt
new file mode 100644
index 0000000..1fac05f
--- /dev/null
+++ b/src/plugin/parsefilter-regex/README.txt
@@ -0,0 +1,41 @@
+Parsefilter-regex plugin
+
+Allows parsing and setting custom-defined fields using regex. Rules can be
+defined in a separate rule file or in the Nutch configuration.
+
+If a rule file is used, create a text file named regex-parsefilter.txt (the
+default name of the rules file). To use a different filename, either update
+the file value in the plugin's build.xml or add the parsefilter.regex.file
+property to the Nutch configuration.
+
+e.g.:
+ <property>
+ <name>parsefilter.regex.file</name>
+ <value>
+ /path/to/rulefile
+ </value>
+ </property>
+
+
+Format of rules: <name>\t<source>\t<regex>\n
+
+e.g.:
+ my_first_field html h1
+ my_second_field text my_pattern
+
+
+If a rule file is not used, rules can be directly set in the nutch config:
+
+e.g.:
+ <property>
+ <name>parsefilter.regex.rules</name>
+ <value>
+ my_first_field html h1
+ my_second_field text my_pattern
+ </value>
+ </property>
+
+source can be either html or text. If source is html, the regex is applied to
+the entire HTML tree. If source is text, the regex is applied to the
+extracted text.
+
diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
index 6955166..f799e5f 100644
--- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
+++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -179,13 +179,17 @@
while ((line = reader.readLine()) != null) {
if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
line = line.trim();
- String[] parts = line.split("\t");
+ String[] parts = line.split("\\s");
- String field = parts[0].trim();
- String source = parts[1].trim();
- String regex = parts[2].trim();
-
- rules.put(field, new RegexRule(source, regex));
+ if (parts.length == 3) {
+ String field = parts[0].trim();
+ String source = parts[1].trim();
+ String regex = parts[2].trim();
+
+ rules.put(field, new RegexRule(source, regex));
+ } else {
+ LOG.info("RegexParseFilter rule is invalid. " + line);
+ }
}
}
}
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index f6d7e4d..d984dc4 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
@@ -34,6 +34,7 @@
import javax.net.ssl.SSLSocketFactory;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.SpellCheckedMetadata;
@@ -58,7 +59,7 @@
private Metadata headers = new SpellCheckedMetadata();
// used for storing the http headers verbatim
private StringBuffer httpHeaders;
-
+
protected enum Scheme {
HTTP, HTTPS,
}
@@ -195,6 +196,13 @@
reqStr.append("Accept: ");
reqStr.append(this.http.getAccept());
reqStr.append("\r\n");
+
+ if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
+ String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
+ reqStr.append("Cookie: ");
+ reqStr.append(cookie);
+ reqStr.append("\r\n");
+ }
if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
reqStr.append("If-Modified-Since: " + HttpDateFormat
@@ -554,5 +562,4 @@
in.unread(value);
return value;
}
-
}
\ No newline at end of file
diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
index f074af2..6041e13 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
@@ -39,6 +39,7 @@
import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.hadoop.io.Text;
/**
* An HTTP response.
@@ -96,6 +97,12 @@
// XXX the request body was sent the method is not retried, so there is
// XXX little danger in retrying...
// params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
+
+ if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
+ String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
+ get.addRequestHeader("Cookie", cookie);
+ }
+
try {
HttpClient client = Http.getClient();
client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941
diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 348efce..ffd22ce 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
@@ -112,7 +112,7 @@
if ("http".equals(protocol) || "https".equals(protocol)
|| "ftp".equals(protocol)) {
- if (host != null) {
+ if (host != null && url.getAuthority() != null) {
String newHost = host.toLowerCase(Locale.ROOT); // lowercase host
if (!host.equals(newHost)) {
host = newHost;
@@ -122,6 +122,9 @@
// etc.) which will likely cause a change if left away
changed = true;
}
+ } else {
+ // no host or authority: recompose the URL from components
+ changed = true;
}
if (port == url.getDefaultPort()) { // uses default port
diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index d62a3a9..2625ea3 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
@@ -164,6 +164,12 @@
"http://foo.com/aa/bb/foo.html");
normalizeTest("http://foo.com/aa?referer=http://bar.com",
"http://foo.com/aa?referer=http://bar.com");
+ // check for NPEs when normalizing URLs without host (authority)
+ normalizeTest("file:///foo/bar.txt", "file:///foo/bar.txt");
+ normalizeTest("ftp:/", "ftp:/");
+ normalizeTest("http:", "http:/");
+ normalizeTest("http:////", "http:/");
+ normalizeTest("http:///////", "http:/");
}
private void normalizeTest(String weird, String normal) throws Exception {