Nutch 2.3.1 release maintenance branch. git-svn-id: https://svn.apache.org/repos/asf/nutch/branches/branch-2.3.1@1726084 13f79535-47bb-0310-9956-ffa450edef68

commit: 102b150f77406cb34b1b4fd32111336f912c359a [log] [tgz]
author: Lewis John McGibbney <lewismc@apache.org> Thu Jan 21 20:10:48 2016 +0000
committer: Lewis John McGibbney <lewismc@apache.org> Thu Jan 21 20:10:48 2016 +0000
tree: 2aaa31fe414b1efb8b965258ebbc9524e6750d36
parent: 9db3b3edd77b502c4114e79e5eacb815089bf010 [diff]
parent: 79230a7472470f6717759ee069a52c4148a9a1b7 [diff]
diff --git a/CHANGES.txt b/CHANGES.txt
index 0b37c1b..1154ef0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt

@@ -1,6 +1,21 @@
 Nutch Change Log
 
-Current Development 2.4-SNAPSHOT
+Nutch 2.3.1 Release 22092015 (ddmmyyyy)
+Release Report - http://s.apache.org/nutch_2.3.1
+
+* NUTCH-2168 Parse-tika fails to retrieve parser (snagel, Auro Miralles, lewismc)
+
+* NUTCH-2169 Integrate index-html into Nutch build (snagel)
+
+* NUTCH-2143 GeneratorJob ignores batch id passed as argument (liuqibj, lewismc, snagel)
+
+* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
+
+* NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel)
+
+* NUTCH-2130 copyField rawcontent creates error within schema.xml (Sherban Drulea, lewismc, snagel)
+
+* NUTCH-2018 Ensure that the Docker containers for Nutch 2.X are part of the Release Management Documentation (lewismc)
 
 * NUTCH-2105 Update Nutch Cassandra Dockerfile to work with Gora Nutch 2.3.1 (lewismc)
 

diff --git a/build.xml b/build.xml
index ee4ed2e..4ae31eb 100644
--- a/build.xml
+++ b/build.xml

@@ -169,6 +169,7 @@
       <!--packageset dir="${plugins.dir}/feed/src/java"/-->
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
+      <packageset dir="${plugins.dir}/index-html/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java"/>
@@ -599,6 +600,7 @@
       <!--packageset dir="${plugins.dir}/feed/src/java"/-->
       <packageset dir="${plugins.dir}/index-anchor/src/java"/>
       <packageset dir="${plugins.dir}/index-basic/src/java"/>
+      <packageset dir="${plugins.dir}/index-html/src/java"/>
       <packageset dir="${plugins.dir}/index-metadata/src/java"/>
       <packageset dir="${plugins.dir}/index-more/src/java"/>
       <packageset dir="${plugins.dir}/indexer-elastic/src/java"/>
@@ -967,6 +969,7 @@
         <source path="${basedir}/src/plugin/index-anchor/src/test/" />
         <source path="${basedir}/src/plugin/index-basic/src/java/" />
         <source path="${basedir}/src/plugin/index-basic/src/test/" />
+        <source path="${basedir}/src/plugin/index-html/src/java/" />
         <source path="${basedir}/src/plugin/index-metadata/src/java/" />
         <source path="${basedir}/src/plugin/index-more/src/java/" />
         <source path="${basedir}/src/plugin/index-more/src/test/" />

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 559b00f..14fd4c3 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml

@@ -156,7 +156,7 @@
 
 <property>
   <name>http.agent.version</name>
-  <value>Nutch-2.4-SNAPSHOT</value>
+  <value>Nutch-2.3.1</value>
   <description>A version string to advertise in the User-Agent 
    header.</description>
 </property>

diff --git a/conf/schema.xml b/conf/schema.xml
index 9a5e939..72b7659 100644
--- a/conf/schema.xml
+++ b/conf/schema.xml

@@ -32,6 +32,7 @@
     <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
     <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
 
+    <fieldtype name="binary" class="solr.BinaryField"/>
 
     <!--
       Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
@@ -357,6 +358,12 @@
 
     <!-- fields for tld plugin -->    
     <field name="tld" type="string" stored="false" indexed="false"/>
+
+    <!-- fields for index-html plugin
+         Note: although raw document content may be binary,
+               index-html adds a String to the index field -->
+    <field name="rawcontent" type="string" stored="true" indexed="false"/>
+
  </fields>
  <uniqueKey>id</uniqueKey>
  <defaultSearchField>text</defaultSearchField>
@@ -367,7 +374,6 @@
         or to add multiple fields to the same field for easier/faster searching.  -->
 
  <copyField source="content" dest="text"/>
- <copyField source="rawcontent" dest="text"/>
  <copyField source="url" dest="text"/>
  <copyField source="title" dest="text"/>
  <copyField source="anchor" dest="text"/>

diff --git a/default.properties b/default.properties
index 998c6aa..10917da 100644
--- a/default.properties
+++ b/default.properties

@@ -15,7 +15,7 @@
 
 
 name=apache-nutch
-version=2.4-SNAPSHOT
+version=2.3.1
 final.name=${name}-${version}
 year=2015
 
@@ -146,6 +146,7 @@
    org.apache.nutch.indexer.anchor*:\
    org.apache.nutch.indexer.basic*:\
    org.apache.nutch.indexer.feed*:\
+   org.apache.nutch.indexer.html*:\
    org.apache.nutch.indexer.metadata*:\
    org.apache.nutch.indexer.more*:\
    org.apache.nutch.indexer.subcollection*:\

diff --git a/docker/hbase/Dockerfile b/docker/hbase/Dockerfile
index b642d1d..61d9379 100644
--- a/docker/hbase/Dockerfile
+++ b/docker/hbase/Dockerfile

@@ -13,8 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from stackbrew/ubuntu:saucy
-MAINTAINER Radoslaw Stankiewicz <rrydziu@gmail.com>
+from ubuntu:14.04
+MAINTAINER Nutch Developers <dev@nutch.apache.org>
 
 WORKDIR /root/
 

diff --git a/src/java/org/apache/nutch/crawl/GeneratorJob.java b/src/java/org/apache/nutch/crawl/GeneratorJob.java
index a0633f9..6081010 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorJob.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorJob.java

@@ -163,17 +163,20 @@
     return fields;
   }
 
+  /** Generate a random batch id */
+  public static String randomBatchId() {
+    long curTime = System.currentTimeMillis();
+    int randomSeed = Math.abs(new Random().nextInt());
+    String batchId = (curTime / 1000) + "-" + randomSeed;
+    return batchId;
+  }
+  
   public Map<String, Object> run(Map<String, Object> args) throws Exception {
     String batchId = (String) args.get(Nutch.ARG_BATCH);
-    if (batchId != null) {
-      getConf().set(GeneratorJob.BATCH_ID, batchId);
-    } else {
-      // generate batchId
-      long curTime = System.currentTimeMillis();
-      int randomSeed = Math.abs(new Random().nextInt());
-      batchId = (curTime / 1000) + "-" + randomSeed;
-      getConf().set(BATCH_ID, batchId);
+    if (batchId == null) {
+      batchId = randomBatchId();
     }
+    getConf().set(BATCH_ID, batchId);
 
     // map to inverted subset due for fetch, sort by score
     Long topN = null;
@@ -249,10 +252,15 @@
     if (topN != Long.MAX_VALUE) {
       LOG.info("GeneratorJob: topN: " + topN);
     }
+    String batchId = getConf().get(BATCH_ID);
     Map<String, Object> results = run(ToolUtil.toArgMap(Nutch.ARG_TOPN, topN,
         Nutch.ARG_CURTIME, curTime, Nutch.ARG_FILTER, filter,
-        Nutch.ARG_NORMALIZE, norm));
-    String batchId = getConf().get(BATCH_ID);
+        Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, batchId));
+    if (batchId == null) {
+      // use generated random batch id
+      batchId = (String) results.get(BATCH_ID);
+    }
+
     long finish = System.currentTimeMillis();
     long generateCount = (Long) results.get(GENERATE_COUNT);
     LOG.info("GeneratorJob: finished at " + sdf.format(finish)
@@ -290,11 +298,6 @@
     long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
     boolean filter = true, norm = true;
 
-    // generate batchId
-    int randomSeed = Math.abs(new Random().nextInt());
-    String batchId = (curTime / 1000) + "-" + randomSeed;
-    getConf().set(BATCH_ID, batchId);
-
     for (int i = 0; i < args.length; i++) {
       if ("-topN".equals(args[i])) {
         topN = Long.parseLong(args[++i]);
@@ -307,9 +310,9 @@
       } else if ("-adddays".equals(args[i])) {
         long numDays = Integer.parseInt(args[++i]);
         curTime += numDays * 1000L * 60 * 60 * 24;
-      } else if ("-batchId".equals(args[i]))
+      } else if ("-batchId".equals(args[i])) {
         getConf().set(BATCH_ID, args[++i]);
-      else {
+      } else {
         System.err.println("Unrecognized arg " + args[i]);
         return -1;
       }

diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 3c8df80..73386a9 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml

@@ -29,6 +29,7 @@
      <ant dir="creativecommons" target="deploy"/>
      <ant dir="index-anchor" target="deploy"/>
      <ant dir="index-basic" target="deploy"/>
+     <ant dir="index-html" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
      <ant dir="index-metadata" target="deploy"/>
      <ant dir="indexer-solr" target="deploy"/>
@@ -116,6 +117,7 @@
     <ant dir="feed" target="clean"/>
     <ant dir="index-anchor" target="clean"/>
     <ant dir="index-basic" target="clean"/>
+    <ant dir="index-html" target="clean"/>
     <ant dir="index-more" target="clean"/>
     <ant dir="index-metadata" target="clean"/>
     <ant dir="indexer-solr" target="clean"/>

diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
index 1015714..6db3bea 100644
--- a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
+++ b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java

@@ -16,45 +16,26 @@
  */
 package org.apache.nutch.indexer.html;
 
-import java.util.Scanner;
-import java.nio.ByteBuffer;
 import java.io.ByteArrayInputStream;
-
-import java.text.ParseException;
+import java.nio.ByteBuffer;
 import java.util.Collection;
-import java.util.Date;
 import java.util.HashSet;
+import java.util.Scanner;
 
-import org.apache.avro.util.Utf8;
-import org.apache.commons.lang.StringUtils;
-import org.apache.nutch.util.StringUtil;
-
-import org.apache.commons.lang.time.DateUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.IndexingFilter;
 import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.HttpHeaders;
-import org.apache.nutch.net.protocols.HttpDateFormat;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.storage.WebPage.Field;
 import org.apache.nutch.util.MimeUtil;
-import org.apache.nutch.util.TableUtil;
-import org.apache.oro.text.regex.MalformedPatternException;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.solr.common.util.DateUtil;
+import org.apache.nutch.util.StringUtil;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 
 /**
- * Add HTML of page the document element so it can be indexed in scheme.xml
- *
- * @author Mohamed Meabed <mo.meabed@gmail.com>
+ * Add raw HTML content of a document to the index.
  */
 
 public class HtmlIndexingFilter implements IndexingFilter {
@@ -93,6 +74,7 @@
                 data = scanner.next();
             }
             doc.add("rawcontent", StringUtil.cleanField(data));
+            scanner.close();
         }
         return doc;
     }

diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/README.md b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/README.md
deleted file mode 100644
index 7504baf..0000000
--- a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/README.md
+++ /dev/null

@@ -1,69 +0,0 @@
-Index-html Plugin for apache nutch 2.x
-=================
-
-Index HTML content of the pages in Apaache Nutch 2.x ( 2.2.1 )
-
-##Instruction: 
-
-- ###Compile from Source
-
- - Download the plugin folder "index-html" and copy it to you Apache nutch 2 plugin directory ( ex: apache-nutch-2.2.1/src/plugin )
- - Add the ( index-html ) plugin to The plugin folder build.xml ( apache-nutch-2.2.1/src/plugin/build.xml ) in target ( deploy and clean ) so the file will look like 
- ```xml
-  <target name="deploy">
-     .......
-     <ant dir="index-basic" target="deploy"/>
-     <ant dir="index-more" target="deploy"/>
-     <ant dir="index-html" target="deploy"/>
-     <ant dir="language-identifier" target="deploy"/>
-    .........
-  </target>
-
-  <target name="clean">
-     .......
-     <ant dir="index-basic" target="deploy"/>
-     <ant dir="index-more" target="deploy"/>
-     <ant dir="index-html" target="deploy"/>
-     <ant dir="language-identifier" target="deploy"/>
-    .........
-  </target>
-  
-  ```
-  
-  - Run ( ant runtime ) in apache nutch 2 root folder to start the build
-  - You should have index-html.jar in build folder
-  - Enable the plugin by adding it to nutch-sites.xml ( or nutch-default.xml ) like beloe :
-  ```xml
-      <configuration>
-          ..........
-          <property>
-              <name>plugin.includes</name>
-              <value>...........someplugins....|index-html</value>
-          </property>
-          ..........
-      </configuration>
-  ```
-  - The plugin will add new Field "rawcontent" to the Nutch Doc, To index this field you need to add it to ( scheme.xml or schema-solr4.xml ) like 
-  ```xml
-    <field name="rawcontent" type="text" sstored="true" indexed="true" multiValued="false"/>
-  ```
-  - Run the crawler and you should see the new field rawcontent in index!
-  
-- ###Use Pre-Compiled Library 
-  - In The repo there is Build folder contain compiled .jar library ready for use. 
-  - Copy the library to your runtime path local if you are running the plugin locally ( apache-nutch-2.2.1/runtime/local/plugins ) Then follow the above steps to configure nutch-sites.xml
-  
-
-###Screen Shot 
-![Screen Shot](http://i.imgur.com/rPWAcoQ.png)
-
-## Need Help ?
-
-I'm Always glad ot help and assist. so if you have an idea that could make this project better
-
-Submit git issue or contact me [www.meabed.net](http://meabed.net)
-
-
-### How to contribute
-
-Make a fork, commit to develop branch and make a pull request

diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java
new file mode 100644
index 0000000..13fbc52
--- /dev/null
+++ b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java

@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Index raw HTML content.
+ * 
+ * The plugin index-html adds the field &quot;rawcontent&quot; to the index.
+ * This field contains the raw (HTML) content of a document converted to a String.
+ */
+package org.apache.nutch.indexer.html;
+

diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html
deleted file mode 100644
index 90b7af3..0000000
--- a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html
+++ /dev/null

@@ -1,91 +0,0 @@
-<html>
-<body>
-<p>A HTML indexing plugin.</p>
-
-<div id="readme" class="blob instapaper_body">
-    <article class="markdown-body entry-content" itemprop="mainContentOfPage"><h1>
-        <a id="user-content-index-html-plugin-for-apache-nutch-2x" class="anchor" href="#index-html-plugin-for-apache-nutch-2x" aria-hidden="true"><span class="octicon octicon-link"></span></a>Index-html Plugin for apache nutch 2.x</h1>
-        <p>Index HTML content of the pages in Apaache Nutch 2.x ( https://github.com/Meabed/nutch2-index-html.git )</p>
-        <h2>
-            <a id="user-content-instruction" class="anchor" href="#instruction" aria-hidden="true"><span class="octicon octicon-link"></span></a>Instruction:</h2>
-
-        <ul class="task-list">
-            <li>
-                <h3>
-                    <a id="user-content-compile-from-source" class="anchor" href="#compile-from-source" aria-hidden="true"><span class="octicon octicon-link"></span></a>Compile from Source</h3>
-
-                <ul class="task-list">
-                    <li>Download the plugin folder "index-html" and copy it to you Apache nutch 2 plugin directory ( ex: apache-nutch-2.x.x/src/plugin )</li>
-                    <li>Add the ( index-html ) plugin to The plugin folder build.xml ( apache-nutch-2.x.x/src/plugin/build.xml ) in target ( deploy and clean ) so the file will look like</li>
-                </ul>
-
-                <div class="highlight highlight-xml"><pre>&lt;<span class="pl-ent">target</span> <span class="pl-e">name</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>&gt;
-   .......
-   &lt;<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-basic<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/&gt;
-   &lt;<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-more<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/&gt;
-   &lt;<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-html<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/&gt;
-   &lt;<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>language-identifier<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/&gt;
-  .........
-&lt;/<span class="pl-ent">target</span>&gt;
-
-&lt;<span class="pl-ent">target</span> <span class="pl-e">name</span>=<span class="pl-s1"><span class="pl-pds">"</span>clean<span class="pl-pds">"</span></span>&gt;
-   .......
-   &lt;<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-basic<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/&gt;
-   &lt;<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-more<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/&gt;
-   &lt;<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-html<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/&gt;
-   &lt;<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>language-identifier<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/&gt;
-  .........
-&lt;/<span class="pl-ent">target</span>&gt;
-</pre>
-                </div>
-
-                <ul class="task-list">
-                    <li>Run ( ant runtime ) in apache nutch 2 root folder to start the build</li>
-                    <li>You should have index-html.jar in build folder</li>
-                    <li>Enable the plugin by adding it to nutch-sites.xml ( or nutch-default.xml ) like beloe :</li>
-                </ul>
-
-                <div class="highlight highlight-xml"><pre>    &lt;<span class="pl-ent">configuration</span>&gt;
-        ..........
-        &lt;<span class="pl-ent">property</span>&gt;
-            &lt;<span class="pl-ent">name</span>&gt;plugin.includes&lt;/<span class="pl-ent">name</span>&gt;
-            &lt;<span class="pl-ent">value</span>&gt;...........someplugins....|index-html&lt;/<span class="pl-ent">value</span>&gt;
-        &lt;/<span class="pl-ent">property</span>&gt;
-        ..........
-    &lt;/<span class="pl-ent">configuration</span>&gt;</pre>
-                </div>
-
-                <ul class="task-list">
-                    <li>The plugin will add new Field "rawcontent" to the Nutch Doc, To index this field you need to add it to ( scheme.xml or schema-solr4.xml ) like</li>
-                </ul>
-
-                <div class="highlight highlight-xml">
-                    <pre>  &lt;<span class="pl-ent">field</span> <span class="pl-e">name</span>=<span class="pl-s1"><span class="pl-pds">"</span>rawcontent<span class="pl-pds">"</span></span> <span class="pl-e">type</span>=<span class="pl-s1"><span class="pl-pds">"</span>text<span class="pl-pds">"</span></span> <span class="pl-e">sstored</span>=<span class="pl-s1"><span class="pl-pds">"</span>true<span
-                            class="pl-pds">"</span></span> <span class="pl-e">indexed</span>=<span class="pl-s1"><span class="pl-pds">"</span>true<span class="pl-pds">"</span></span> <span class="pl-e">multiValued</span>=<span class="pl-s1"><span class="pl-pds">"</span>false<span class="pl-pds">"</span></span>/&gt;</pre>
-                </div>
-
-                <ul class="task-list">
-                    <li>Run the crawler and you should see the new field rawcontent in index!</li>
-                </ul>
-            </li>
-            <li>
-                <h3>
-                    <a id="user-content-use-pre-compiled-library" class="anchor" href="#use-pre-compiled-library" aria-hidden="true"><span class="octicon octicon-link"></span></a>Use Pre-Compiled Library</h3>
-
-                <ul class="task-list">
-                    <li>In The repo there is Build folder contain compiled .jar library ready for use.</li>
-                    <li>Copy the library to your runtime path local if you are running the plugin locally ( apache-nutch-2.x.x/runtime/local/plugins ) Then follow the above steps to configure nutch-sites.xml</li>
-                </ul>
-            </li>
-        </ul>
-
-        <h3>
-            <a id="user-content-screen-shot" class="anchor" href="#screen-shot" aria-hidden="true"><span class="octicon octicon-link"></span></a>Screen Shot</h3>
-
-        <p><a href="https://camo.githubusercontent.com/ed517e50080ffc0e925fd7fd8a90c9db2c96511d/687474703a2f2f692e696d6775722e636f6d2f72505741636f512e706e67" target="_blank"><img src="https://camo.githubusercontent.com/ed517e50080ffc0e925fd7fd8a90c9db2c96511d/687474703a2f2f692e696d6775722e636f6d2f72505741636f512e706e67" alt="Screen Shot" data-canonical-src="http://i.imgur.com/rPWAcoQ.png"
-                                                                                                                                                                                   style="max-width:100%;"></a></p>
-
-    </article>
-</div>
-</body>
-</html>

diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index 5102113..0527d5e 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java

@@ -27,6 +27,7 @@
 import java.net.URL;
 import java.nio.ByteBuffer;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
@@ -67,7 +68,8 @@
   // I used 1000 bytes at first, but found that some documents have
   // meta tag well past the first 1000 bytes.
   // (e.g. http://cn.promo.yahoo.com/customcare/music.html)
-  private static final int CHUNK_SIZE = 2000;
+  // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
+  private static final int CHUNK_SIZE = 8192;
 
   // NUTCH-1006 Meta equiv with single quotes not accepted
   private static Pattern metaPattern = Pattern.compile(
@@ -111,14 +113,8 @@
     // to just inflate each byte to a 16-bit value by padding.
     // For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
     // {U+0041, U+0082, U+00B7}.
-    String str = "";
-    try {
-      str = new String(content.array(), content.arrayOffset()
-          + content.position(), length, Charset.forName("ASCII").toString());
-    } catch (UnsupportedEncodingException e) {
-      // code should never come here, but just in case...
-      return null;
-    }
+    String str = new String(content.array(), content.arrayOffset()
+        + content.position(), length, StandardCharsets.US_ASCII);
 
     Matcher metaMatcher = metaPattern.matcher(str);
     String encoding = null;

diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index d3df699..00aa30b 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

@@ -207,7 +207,7 @@
     this.tikaConfig = null;
 
     try {
-      tikaConfig = TikaConfig.getDefaultConfig();
+      tikaConfig = new TikaConfig(this.getClass().getClassLoader());
     } catch (Exception e2) {
       String message = "Problem loading default Tika configuration";
       LOG.error(message, e2);

diff --git a/src/plugin/subcollection/plugin.xml b/src/plugin/subcollection/plugin.xml
index 250e078..ca2cf2f 100644
--- a/src/plugin/subcollection/plugin.xml
+++ b/src/plugin/subcollection/plugin.xml

@@ -21,16 +21,16 @@
    version="1.0.0"
    provider-name="apache.org">
 
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-   </requires>
-
    <runtime>
       <library name="subcollection.jar">
          <export name="*"/>
       </library>
    </runtime>
    
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
    <extension id="org.apache.nutch.indexer.subcollection.indexing"
               name="Subcollection Indexing Filter"
               point="org.apache.nutch.indexer.IndexingFilter">

diff --git a/src/plugin/urlnormalizer-regex/plugin.xml b/src/plugin/urlnormalizer-regex/plugin.xml
index 20161ce..e75096f 100644
--- a/src/plugin/urlnormalizer-regex/plugin.xml
+++ b/src/plugin/urlnormalizer-regex/plugin.xml

@@ -28,7 +28,7 @@
    </runtime>
 
    <requires>
-    <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+      <import plugin="nutch-extensionpoints"/>
    </requires>
 
    <extension id="org.apache.nutch.net.urlnormalizer.regex"
commit	102b150f77406cb34b1b4fd32111336f912c359a	[log] [tgz]
author	Lewis John McGibbney <lewismc@apache.org>	Thu Jan 21 20:10:48 2016 +0000
committer	Lewis John McGibbney <lewismc@apache.org>	Thu Jan 21 20:10:48 2016 +0000
tree	2aaa31fe414b1efb8b965258ebbc9524e6750d36
parent	9db3b3edd77b502c4114e79e5eacb815089bf010 [diff]
parent	79230a7472470f6717759ee069a52c4148a9a1b7 [diff]