Merge pull request #174 from kamaci/NUTCH-2171 NUTCH-2171 Nutch upgrade to Java 1.8

commit: 3e2d3d456489bf52bc586dae0e2e71fb7aad8fe7 [log] [tgz]
author: Chris Mattmann <chris.mattmann@gmail.com> Wed Feb 22 16:31:50 2017 -0800
committer: GitHub <noreply@github.com> Wed Feb 22 16:31:50 2017 -0800
tree: 8db380647b7e3af4163c600e3b7cfcab2015bcb7
parent: 217fad16bfdea0494390e8f170d9350cf06657ef [diff]
parent: c4b895562716c5a37cb35328302d3b6801b04e48 [diff]
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index ea7df89..08fb8a0 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml

@@ -344,6 +344,14 @@
   </description>
 </property>
 
+<property>
+  <name>http.enable.cookie.header</name>
+  <value>true</value>
+  <description>Whether Nutch sends an HTTP Cookie header. The cookie value
+  is read from the CrawlDatum Cookie metadata field.
+  </description>
+</property>
+
 <!-- FTP properties -->
 
 <property>

diff --git a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
index 41b63e3..eb3eb60 100644
--- a/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
+++ b/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java

@@ -59,6 +59,8 @@
 
   public static final Text RESPONSE_TIME = new Text("_rs_");
 
+  public static final Text COOKIE = new Text("Cookie");
+  
   public static final int BUFFER_SIZE = 8 * 1024;
 
   private static final byte[] EMPTY_CONTENT = new byte[0];
@@ -124,7 +126,10 @@
   protected Set<String> tlsPreferredCipherSuites;
   
   /** Configuration directive for If-Modified-Since HTTP header */
-  public boolean enableIfModifiedsinceHeader = true;
+  protected boolean enableIfModifiedsinceHeader = true;
+  
+  /** Controls whether or not to set Cookie HTTP header based on CrawlDatum metadata */
+  protected boolean enableCookieHeader = true;
 
   /** Creates a new instance of HttpBase */
   public HttpBase() {
@@ -157,6 +162,7 @@
     this.useHttp11 = conf.getBoolean("http.useHttp11", false);
     this.responseTime = conf.getBoolean("http.store.responsetime", true);
     this.enableIfModifiedsinceHeader = conf.getBoolean("http.enable.if.modified.since.header", true);
+    this.enableCookieHeader = conf.getBoolean("http.enable.cookie.header", true);
     this.robots.setConf(conf);
 
     // NUTCH-1941: read list of alternating agent names
@@ -369,6 +375,10 @@
   public boolean isIfModifiedSinceEnabled() {
     return enableIfModifiedsinceHeader;
   }
+  
+  public boolean isCookieEnabled() {
+    return enableCookieHeader;
+  }
 
   public int getMaxContent() {
     return maxContent;
@@ -458,6 +468,7 @@
       logger.info("http.agent = " + userAgent);
       logger.info("http.accept.language = " + acceptLanguage);
       logger.info("http.accept = " + accept);
+      logger.info("http.enable.cookie.header = " + isCookieEnabled());
     }
   }
 
@@ -584,4 +595,4 @@
     }
     return hm;
   }
-}
+}
\ No newline at end of file

diff --git a/src/plugin/parsefilter-regex/README.txt b/src/plugin/parsefilter-regex/README.txt
new file mode 100644
index 0000000..1fac05f
--- /dev/null
+++ b/src/plugin/parsefilter-regex/README.txt

@@ -0,0 +1,41 @@
+Parsefilter-regex plugin
+
+Allows parsing and setting custom-defined fields using regex. Rules can be defined
+in a separate rule file or in the nutch configuration.
+
+If a rule file is used, create a text file named regex-parsefilter.txt (which
+is the default name of the rules file). To use a different filename, either
+update the file value in the plugin’s build.xml or add the parsefilter.regex.file
+property to the nutch config.
+
+ie:
+    <property>
+      <name>parsefilter.regex.file</name>
+      <value>
+	/path/to/rulefile
+      </value>
+    </property>
+
+
+Format of rules: <name>\t<source>\t<regex>\n
+
+ie:
+	my_first_field		html	h1
+	my_second_field		text	my_pattern
+
+
+If a rule file is not used, rules can be directly set in the nutch config:
+
+ie:
+    <property>
+      <name>parsefilter.regex.rules</name>
+      <value>
+	my_first_field		html	h1
+	my_second_field		text	my_pattern
+      </value>
+    </property>
+
+source can be either html or text. If source is html, the regex is applied to
+the entire HTML tree. If source is text, the regex is applied to the
+extracted text.
+

diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
index 6955166..f799e5f 100644
--- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
+++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java

@@ -179,13 +179,17 @@
     while ((line = reader.readLine()) != null) {
       if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
         line = line.trim();
-        String[] parts = line.split("\t");
+        String[] parts = line.split("\\s");
 
-        String field = parts[0].trim();
-        String source = parts[1].trim();
-        String regex = parts[2].trim();
-        
-        rules.put(field, new RegexRule(source, regex));
+        if (parts.length == 3) {
+            String field = parts[0].trim();
+            String source = parts[1].trim();
+            String regex = parts[2].trim();
+            
+            rules.put(field, new RegexRule(source, regex));
+        } else {
+            LOG.info("RegexParseFilter rule is invalid. " + line);
+        }
       }
     }
   }

diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
index f6d7e4d..d984dc4 100644
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java
+++ b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/HttpResponse.java

@@ -34,6 +34,7 @@
 import javax.net.ssl.SSLSocketFactory;
 
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.metadata.SpellCheckedMetadata;
@@ -58,7 +59,7 @@
   private Metadata headers = new SpellCheckedMetadata();
   // used for storing the http headers verbatim
   private StringBuffer httpHeaders;
-
+  
   protected enum Scheme {
     HTTP, HTTPS,
   }
@@ -195,6 +196,13 @@
       reqStr.append("Accept: ");
       reqStr.append(this.http.getAccept());
       reqStr.append("\r\n");
+      
+      if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
+        String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
+        reqStr.append("Cookie: ");
+        reqStr.append(cookie);
+        reqStr.append("\r\n");
+      }
 
       if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
         reqStr.append("If-Modified-Since: " + HttpDateFormat
@@ -554,5 +562,4 @@
     in.unread(value);
     return value;
   }
-
 }
\ No newline at end of file

diff --git a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
index f074af2..6041e13 100644
--- a/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java
+++ b/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/HttpResponse.java

@@ -39,6 +39,7 @@
 import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.net.protocols.Response;
 import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.hadoop.io.Text;
 
 /**
  * An HTTP response.
@@ -96,6 +97,12 @@
     // XXX the request body was sent the method is not retried, so there is
     // XXX little danger in retrying...
     // params.setParameter(HttpMethodParams.RETRY_HANDLER, null);
+    
+    if (http.isCookieEnabled() && datum.getMetaData().containsKey(http.COOKIE)) {
+      String cookie = ((Text)datum.getMetaData().get(http.COOKIE)).toString();
+      get.addRequestHeader("Cookie", cookie);
+    }
+    
     try {
       HttpClient client = Http.getClient();
       client.getParams().setParameter("http.useragent", http.getUserAgent()); // NUTCH-1941

diff --git a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
index 348efce..ffd22ce 100644
--- a/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/java/org/apache/nutch/net/urlnormalizer/basic/BasicURLNormalizer.java

@@ -112,7 +112,7 @@
     if ("http".equals(protocol) || "https".equals(protocol)
         || "ftp".equals(protocol)) {
 
-      if (host != null) {
+      if (host != null && url.getAuthority() != null) {
         String newHost = host.toLowerCase(Locale.ROOT); // lowercase host
         if (!host.equals(newHost)) {
           host = newHost;
@@ -122,6 +122,9 @@
           // etc.) which will likely cause a change if left away
           changed = true;
         }
+      } else {
+        // no host or authority: recompose the URL from components
+        changed = true;
       }
 
       if (port == url.getDefaultPort()) { // uses default port

diff --git a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
index d62a3a9..2625ea3 100644
--- a/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java
+++ b/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java

@@ -164,6 +164,12 @@
         "http://foo.com/aa/bb/foo.html");
     normalizeTest("http://foo.com/aa?referer=http://bar.com",
         "http://foo.com/aa?referer=http://bar.com");
+    // check for NPEs when normalizing URLs without host (authority)
+    normalizeTest("file:///foo/bar.txt", "file:///foo/bar.txt");
+    normalizeTest("ftp:/", "ftp:/");
+    normalizeTest("http:", "http:/");
+    normalizeTest("http:////", "http:/");
+    normalizeTest("http:///////", "http:/");
   }
 
   private void normalizeTest(String weird, String normal) throws Exception {
commit	3e2d3d456489bf52bc586dae0e2e71fb7aad8fe7	[log] [tgz]
author	Chris Mattmann <chris.mattmann@gmail.com>	Wed Feb 22 16:31:50 2017 -0800
committer	GitHub <noreply@github.com>	Wed Feb 22 16:31:50 2017 -0800
tree	8db380647b7e3af4163c600e3b7cfcab2015bcb7
parent	217fad16bfdea0494390e8f170d9350cf06657ef [diff]
parent	c4b895562716c5a37cb35328302d3b6801b04e48 [diff]