Nutch 2.3.1 release maintenance branch.
git-svn-id: https://svn.apache.org/repos/asf/nutch/branches/branch-2.3.1@1726084 13f79535-47bb-0310-9956-ffa450edef68
diff --git a/CHANGES.txt b/CHANGES.txt
index 0b37c1b..1154ef0 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,6 +1,21 @@
Nutch Change Log
-Current Development 2.4-SNAPSHOT
+Nutch 2.3.1 Release 22092015 (ddmmyyyy)
+Release Report - http://s.apache.org/nutch_2.3.1
+
+* NUTCH-2168 Parse-tika fails to retrieve parser (snagel, Auro Miralles, lewismc)
+
+* NUTCH-2169 Integrate index-html into Nutch build (snagel)
+
+* NUTCH-2143 GeneratorJob ignores batch id passed as argument (liuqibj, lewismc, snagel)
+
+* NUTCH-2042 parse-html increase chunk size used to detect charset (snagel)
+
+* NUTCH-2107 plugin.xml to validate against plugin.dtd (snagel)
+
+* NUTCH-2130 copyField rawcontent creates error within schema.xml (Sherban Drulea, lewismc, snagel)
+
+* NUTCH-2018 Ensure that the Docker containers for Nutch 2.X are part of the Release Management Documentation (lewismc)
* NUTCH-2105 Update Nutch Cassandra Dockerfile to work with Gora Nutch 2.3.1 (lewismc)
diff --git a/build.xml b/build.xml
index ee4ed2e..4ae31eb 100644
--- a/build.xml
+++ b/build.xml
@@ -169,6 +169,7 @@
<!--packageset dir="${plugins.dir}/feed/src/java"/-->
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
+ <packageset dir="${plugins.dir}/index-html/src/java"/>
<packageset dir="${plugins.dir}/index-metadata/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java"/>
@@ -599,6 +600,7 @@
<!--packageset dir="${plugins.dir}/feed/src/java"/-->
<packageset dir="${plugins.dir}/index-anchor/src/java"/>
<packageset dir="${plugins.dir}/index-basic/src/java"/>
+ <packageset dir="${plugins.dir}/index-html/src/java"/>
<packageset dir="${plugins.dir}/index-metadata/src/java"/>
<packageset dir="${plugins.dir}/index-more/src/java"/>
<packageset dir="${plugins.dir}/indexer-elastic/src/java"/>
@@ -967,6 +969,7 @@
<source path="${basedir}/src/plugin/index-anchor/src/test/" />
<source path="${basedir}/src/plugin/index-basic/src/java/" />
<source path="${basedir}/src/plugin/index-basic/src/test/" />
+ <source path="${basedir}/src/plugin/index-html/src/java/" />
<source path="${basedir}/src/plugin/index-metadata/src/java/" />
<source path="${basedir}/src/plugin/index-more/src/java/" />
<source path="${basedir}/src/plugin/index-more/src/test/" />
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 559b00f..14fd4c3 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -156,7 +156,7 @@
<property>
<name>http.agent.version</name>
- <value>Nutch-2.4-SNAPSHOT</value>
+ <value>Nutch-2.3.1</value>
<description>A version string to advertise in the User-Agent
header.</description>
</property>
diff --git a/conf/schema.xml b/conf/schema.xml
index 9a5e939..72b7659 100644
--- a/conf/schema.xml
+++ b/conf/schema.xml
@@ -32,6 +32,7 @@
<!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
<fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/>
+ <fieldtype name="binary" class="solr.BinaryField"/>
<!--
Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types.
@@ -357,6 +358,12 @@
<!-- fields for tld plugin -->
<field name="tld" type="string" stored="false" indexed="false"/>
+
+ <!-- fields for index-html plugin
+ Note: although raw document content may be binary,
+ index-html adds a String to the index field -->
+ <field name="rawcontent" type="string" stored="true" indexed="false"/>
+
</fields>
<uniqueKey>id</uniqueKey>
<defaultSearchField>text</defaultSearchField>
@@ -367,7 +374,6 @@
or to add multiple fields to the same field for easier/faster searching. -->
<copyField source="content" dest="text"/>
- <copyField source="rawcontent" dest="text"/>
<copyField source="url" dest="text"/>
<copyField source="title" dest="text"/>
<copyField source="anchor" dest="text"/>
diff --git a/default.properties b/default.properties
index 998c6aa..10917da 100644
--- a/default.properties
+++ b/default.properties
@@ -15,7 +15,7 @@
name=apache-nutch
-version=2.4-SNAPSHOT
+version=2.3.1
final.name=${name}-${version}
year=2015
@@ -146,6 +146,7 @@
org.apache.nutch.indexer.anchor*:\
org.apache.nutch.indexer.basic*:\
org.apache.nutch.indexer.feed*:\
+ org.apache.nutch.indexer.html*:\
org.apache.nutch.indexer.metadata*:\
org.apache.nutch.indexer.more*:\
org.apache.nutch.indexer.subcollection*:\
diff --git a/docker/hbase/Dockerfile b/docker/hbase/Dockerfile
index b642d1d..61d9379 100644
--- a/docker/hbase/Dockerfile
+++ b/docker/hbase/Dockerfile
@@ -13,8 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-from stackbrew/ubuntu:saucy
-MAINTAINER Radoslaw Stankiewicz <rrydziu@gmail.com>
+from ubuntu:14.04
+MAINTAINER Nutch Developers <dev@nutch.apache.org>
WORKDIR /root/
diff --git a/src/java/org/apache/nutch/crawl/GeneratorJob.java b/src/java/org/apache/nutch/crawl/GeneratorJob.java
index a0633f9..6081010 100644
--- a/src/java/org/apache/nutch/crawl/GeneratorJob.java
+++ b/src/java/org/apache/nutch/crawl/GeneratorJob.java
@@ -163,17 +163,20 @@
return fields;
}
+ /** Generate a random batch id */
+ public static String randomBatchId() {
+ long curTime = System.currentTimeMillis();
+ int randomSeed = Math.abs(new Random().nextInt());
+ String batchId = (curTime / 1000) + "-" + randomSeed;
+ return batchId;
+ }
+
public Map<String, Object> run(Map<String, Object> args) throws Exception {
String batchId = (String) args.get(Nutch.ARG_BATCH);
- if (batchId != null) {
- getConf().set(GeneratorJob.BATCH_ID, batchId);
- } else {
- // generate batchId
- long curTime = System.currentTimeMillis();
- int randomSeed = Math.abs(new Random().nextInt());
- batchId = (curTime / 1000) + "-" + randomSeed;
- getConf().set(BATCH_ID, batchId);
+ if (batchId == null) {
+ batchId = randomBatchId();
}
+ getConf().set(BATCH_ID, batchId);
// map to inverted subset due for fetch, sort by score
Long topN = null;
@@ -249,10 +252,15 @@
if (topN != Long.MAX_VALUE) {
LOG.info("GeneratorJob: topN: " + topN);
}
+ String batchId = getConf().get(BATCH_ID);
Map<String, Object> results = run(ToolUtil.toArgMap(Nutch.ARG_TOPN, topN,
Nutch.ARG_CURTIME, curTime, Nutch.ARG_FILTER, filter,
- Nutch.ARG_NORMALIZE, norm));
- String batchId = getConf().get(BATCH_ID);
+ Nutch.ARG_NORMALIZE, norm, Nutch.ARG_BATCH, batchId));
+ if (batchId == null) {
+ // use generated random batch id
+ batchId = (String) results.get(BATCH_ID);
+ }
+
long finish = System.currentTimeMillis();
long generateCount = (Long) results.get(GENERATE_COUNT);
LOG.info("GeneratorJob: finished at " + sdf.format(finish)
@@ -290,11 +298,6 @@
long curTime = System.currentTimeMillis(), topN = Long.MAX_VALUE;
boolean filter = true, norm = true;
- // generate batchId
- int randomSeed = Math.abs(new Random().nextInt());
- String batchId = (curTime / 1000) + "-" + randomSeed;
- getConf().set(BATCH_ID, batchId);
-
for (int i = 0; i < args.length; i++) {
if ("-topN".equals(args[i])) {
topN = Long.parseLong(args[++i]);
@@ -307,9 +310,9 @@
} else if ("-adddays".equals(args[i])) {
long numDays = Integer.parseInt(args[++i]);
curTime += numDays * 1000L * 60 * 60 * 24;
- } else if ("-batchId".equals(args[i]))
+ } else if ("-batchId".equals(args[i])) {
getConf().set(BATCH_ID, args[++i]);
- else {
+ } else {
System.err.println("Unrecognized arg " + args[i]);
return -1;
}
diff --git a/src/plugin/build.xml b/src/plugin/build.xml
index 3c8df80..73386a9 100755
--- a/src/plugin/build.xml
+++ b/src/plugin/build.xml
@@ -29,6 +29,7 @@
<ant dir="creativecommons" target="deploy"/>
<ant dir="index-anchor" target="deploy"/>
<ant dir="index-basic" target="deploy"/>
+ <ant dir="index-html" target="deploy"/>
<ant dir="index-more" target="deploy"/>
<ant dir="index-metadata" target="deploy"/>
<ant dir="indexer-solr" target="deploy"/>
@@ -116,6 +117,7 @@
<ant dir="feed" target="clean"/>
<ant dir="index-anchor" target="clean"/>
<ant dir="index-basic" target="clean"/>
+ <ant dir="index-html" target="clean"/>
<ant dir="index-more" target="clean"/>
<ant dir="index-metadata" target="clean"/>
<ant dir="indexer-solr" target="clean"/>
diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
index 1015714..6db3bea 100644
--- a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
+++ b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/HtmlIndexingFilter.java
@@ -16,45 +16,26 @@
*/
package org.apache.nutch.indexer.html;
-import java.util.Scanner;
-import java.nio.ByteBuffer;
import java.io.ByteArrayInputStream;
-
-import java.text.ParseException;
+import java.nio.ByteBuffer;
import java.util.Collection;
-import java.util.Date;
import java.util.HashSet;
+import java.util.Scanner;
-import org.apache.avro.util.Utf8;
-import org.apache.commons.lang.StringUtils;
-import org.apache.nutch.util.StringUtil;
-
-import org.apache.commons.lang.time.DateUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
-import org.apache.nutch.metadata.HttpHeaders;
-import org.apache.nutch.net.protocols.HttpDateFormat;
import org.apache.nutch.storage.WebPage;
import org.apache.nutch.storage.WebPage.Field;
import org.apache.nutch.util.MimeUtil;
-import org.apache.nutch.util.TableUtil;
-import org.apache.oro.text.regex.MalformedPatternException;
-import org.apache.oro.text.regex.MatchResult;
-import org.apache.oro.text.regex.PatternMatcher;
-import org.apache.oro.text.regex.Perl5Compiler;
-import org.apache.oro.text.regex.Perl5Matcher;
-import org.apache.oro.text.regex.Perl5Pattern;
-import org.apache.solr.common.util.DateUtil;
+import org.apache.nutch.util.StringUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
- * Add HTML of page the document element so it can be indexed in scheme.xml
- *
- * @author Mohamed Meabed <mo.meabed@gmail.com>
+ * Add raw HTML content of a document to the index.
*/
public class HtmlIndexingFilter implements IndexingFilter {
@@ -93,6 +74,7 @@
data = scanner.next();
}
doc.add("rawcontent", StringUtil.cleanField(data));
+ scanner.close();
}
return doc;
}
diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/README.md b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/README.md
deleted file mode 100644
index 7504baf..0000000
--- a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/README.md
+++ /dev/null
@@ -1,69 +0,0 @@
-Index-html Plugin for apache nutch 2.x
-=================
-
-Index HTML content of the pages in Apaache Nutch 2.x ( 2.2.1 )
-
-##Instruction:
-
-- ###Compile from Source
-
- - Download the plugin folder "index-html" and copy it to you Apache nutch 2 plugin directory ( ex: apache-nutch-2.2.1/src/plugin )
- - Add the ( index-html ) plugin to The plugin folder build.xml ( apache-nutch-2.2.1/src/plugin/build.xml ) in target ( deploy and clean ) so the file will look like
- ```xml
- <target name="deploy">
- .......
- <ant dir="index-basic" target="deploy"/>
- <ant dir="index-more" target="deploy"/>
- <ant dir="index-html" target="deploy"/>
- <ant dir="language-identifier" target="deploy"/>
- .........
- </target>
-
- <target name="clean">
- .......
- <ant dir="index-basic" target="deploy"/>
- <ant dir="index-more" target="deploy"/>
- <ant dir="index-html" target="deploy"/>
- <ant dir="language-identifier" target="deploy"/>
- .........
- </target>
-
- ```
-
- - Run ( ant runtime ) in apache nutch 2 root folder to start the build
- - You should have index-html.jar in build folder
- - Enable the plugin by adding it to nutch-sites.xml ( or nutch-default.xml ) like beloe :
- ```xml
- <configuration>
- ..........
- <property>
- <name>plugin.includes</name>
- <value>...........someplugins....|index-html</value>
- </property>
- ..........
- </configuration>
- ```
- - The plugin will add new Field "rawcontent" to the Nutch Doc, To index this field you need to add it to ( scheme.xml or schema-solr4.xml ) like
- ```xml
- <field name="rawcontent" type="text" sstored="true" indexed="true" multiValued="false"/>
- ```
- - Run the crawler and you should see the new field rawcontent in index!
-
-- ###Use Pre-Compiled Library
- - In The repo there is Build folder contain compiled .jar library ready for use.
- - Copy the library to your runtime path local if you are running the plugin locally ( apache-nutch-2.2.1/runtime/local/plugins ) Then follow the above steps to configure nutch-sites.xml
-
-
-###Screen Shot
-![Screen Shot](http://i.imgur.com/rPWAcoQ.png)
-
-## Need Help ?
-
-I'm Always glad ot help and assist. so if you have an idea that could make this project better
-
-Submit git issue or contact me [www.meabed.net](http://meabed.net)
-
-
-### How to contribute
-
-Make a fork, commit to develop branch and make a pull request
diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java
new file mode 100644
index 0000000..13fbc52
--- /dev/null
+++ b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Index raw HTML content.
+ *
+ * The plugin index-html adds the field "rawcontent" to the index.
+ * This field contains the raw (HTML) content of a document converted to a String.
+ */
+package org.apache.nutch.indexer.html;
+
diff --git a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html b/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html
deleted file mode 100644
index 90b7af3..0000000
--- a/src/plugin/index-html/src/java/org/apache/nutch/indexer/html/package.html
+++ /dev/null
@@ -1,91 +0,0 @@
-<html>
-<body>
-<p>A HTML indexing plugin.</p>
-
-<div id="readme" class="blob instapaper_body">
- <article class="markdown-body entry-content" itemprop="mainContentOfPage"><h1>
- <a id="user-content-index-html-plugin-for-apache-nutch-2x" class="anchor" href="#index-html-plugin-for-apache-nutch-2x" aria-hidden="true"><span class="octicon octicon-link"></span></a>Index-html Plugin for apache nutch 2.x</h1>
- <p>Index HTML content of the pages in Apaache Nutch 2.x ( https://github.com/Meabed/nutch2-index-html.git )</p>
- <h2>
- <a id="user-content-instruction" class="anchor" href="#instruction" aria-hidden="true"><span class="octicon octicon-link"></span></a>Instruction:</h2>
-
- <ul class="task-list">
- <li>
- <h3>
- <a id="user-content-compile-from-source" class="anchor" href="#compile-from-source" aria-hidden="true"><span class="octicon octicon-link"></span></a>Compile from Source</h3>
-
- <ul class="task-list">
- <li>Download the plugin folder "index-html" and copy it to you Apache nutch 2 plugin directory ( ex: apache-nutch-2.x.x/src/plugin )</li>
- <li>Add the ( index-html ) plugin to The plugin folder build.xml ( apache-nutch-2.x.x/src/plugin/build.xml ) in target ( deploy and clean ) so the file will look like</li>
- </ul>
-
- <div class="highlight highlight-xml"><pre><<span class="pl-ent">target</span> <span class="pl-e">name</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>>
- .......
- <<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-basic<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/>
- <<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-more<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/>
- <<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-html<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/>
- <<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>language-identifier<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/>
- .........
-</<span class="pl-ent">target</span>>
-
-<<span class="pl-ent">target</span> <span class="pl-e">name</span>=<span class="pl-s1"><span class="pl-pds">"</span>clean<span class="pl-pds">"</span></span>>
- .......
- <<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-basic<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/>
- <<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-more<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/>
- <<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>index-html<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/>
- <<span class="pl-ent">ant</span> <span class="pl-e">dir</span>=<span class="pl-s1"><span class="pl-pds">"</span>language-identifier<span class="pl-pds">"</span></span> <span class="pl-e">target</span>=<span class="pl-s1"><span class="pl-pds">"</span>deploy<span class="pl-pds">"</span></span>/>
- .........
-</<span class="pl-ent">target</span>>
-</pre>
- </div>
-
- <ul class="task-list">
- <li>Run ( ant runtime ) in apache nutch 2 root folder to start the build</li>
- <li>You should have index-html.jar in build folder</li>
- <li>Enable the plugin by adding it to nutch-sites.xml ( or nutch-default.xml ) like beloe :</li>
- </ul>
-
- <div class="highlight highlight-xml"><pre> <<span class="pl-ent">configuration</span>>
- ..........
- <<span class="pl-ent">property</span>>
- <<span class="pl-ent">name</span>>plugin.includes</<span class="pl-ent">name</span>>
- <<span class="pl-ent">value</span>>...........someplugins....|index-html</<span class="pl-ent">value</span>>
- </<span class="pl-ent">property</span>>
- ..........
- </<span class="pl-ent">configuration</span>></pre>
- </div>
-
- <ul class="task-list">
- <li>The plugin will add new Field "rawcontent" to the Nutch Doc, To index this field you need to add it to ( scheme.xml or schema-solr4.xml ) like</li>
- </ul>
-
- <div class="highlight highlight-xml">
- <pre> <<span class="pl-ent">field</span> <span class="pl-e">name</span>=<span class="pl-s1"><span class="pl-pds">"</span>rawcontent<span class="pl-pds">"</span></span> <span class="pl-e">type</span>=<span class="pl-s1"><span class="pl-pds">"</span>text<span class="pl-pds">"</span></span> <span class="pl-e">sstored</span>=<span class="pl-s1"><span class="pl-pds">"</span>true<span
- class="pl-pds">"</span></span> <span class="pl-e">indexed</span>=<span class="pl-s1"><span class="pl-pds">"</span>true<span class="pl-pds">"</span></span> <span class="pl-e">multiValued</span>=<span class="pl-s1"><span class="pl-pds">"</span>false<span class="pl-pds">"</span></span>/></pre>
- </div>
-
- <ul class="task-list">
- <li>Run the crawler and you should see the new field rawcontent in index!</li>
- </ul>
- </li>
- <li>
- <h3>
- <a id="user-content-use-pre-compiled-library" class="anchor" href="#use-pre-compiled-library" aria-hidden="true"><span class="octicon octicon-link"></span></a>Use Pre-Compiled Library</h3>
-
- <ul class="task-list">
- <li>In The repo there is Build folder contain compiled .jar library ready for use.</li>
- <li>Copy the library to your runtime path local if you are running the plugin locally ( apache-nutch-2.x.x/runtime/local/plugins ) Then follow the above steps to configure nutch-sites.xml</li>
- </ul>
- </li>
- </ul>
-
- <h3>
- <a id="user-content-screen-shot" class="anchor" href="#screen-shot" aria-hidden="true"><span class="octicon octicon-link"></span></a>Screen Shot</h3>
-
- <p><a href="https://camo.githubusercontent.com/ed517e50080ffc0e925fd7fd8a90c9db2c96511d/687474703a2f2f692e696d6775722e636f6d2f72505741636f512e706e67" target="_blank"><img src="https://camo.githubusercontent.com/ed517e50080ffc0e925fd7fd8a90c9db2c96511d/687474703a2f2f692e696d6775722e636f6d2f72505741636f512e706e67" alt="Screen Shot" data-canonical-src="http://i.imgur.com/rPWAcoQ.png"
- style="max-width:100%;"></a></p>
-
- </article>
-</div>
-</body>
-</html>
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
index 5102113..0527d5e 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -27,6 +27,7 @@
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -67,7 +68,8 @@
// I used 1000 bytes at first, but found that some documents have
// meta tag well past the first 1000 bytes.
// (e.g. http://cn.promo.yahoo.com/customcare/music.html)
- private static final int CHUNK_SIZE = 2000;
+ // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
+ private static final int CHUNK_SIZE = 8192;
// NUTCH-1006 Meta equiv with single quotes not accepted
private static Pattern metaPattern = Pattern.compile(
@@ -111,14 +113,8 @@
// to just inflate each byte to a 16-bit value by padding.
// For instance, the sequence {0x41, 0x82, 0xb7} will be turned into
// {U+0041, U+0082, U+00B7}.
- String str = "";
- try {
- str = new String(content.array(), content.arrayOffset()
- + content.position(), length, Charset.forName("ASCII").toString());
- } catch (UnsupportedEncodingException e) {
- // code should never come here, but just in case...
- return null;
- }
+ String str = new String(content.array(), content.arrayOffset()
+ + content.position(), length, StandardCharsets.US_ASCII);
Matcher metaMatcher = metaPattern.matcher(str);
String encoding = null;
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index d3df699..00aa30b 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -207,7 +207,7 @@
this.tikaConfig = null;
try {
- tikaConfig = TikaConfig.getDefaultConfig();
+ tikaConfig = new TikaConfig(this.getClass().getClassLoader());
} catch (Exception e2) {
String message = "Problem loading default Tika configuration";
LOG.error(message, e2);
diff --git a/src/plugin/subcollection/plugin.xml b/src/plugin/subcollection/plugin.xml
index 250e078..ca2cf2f 100644
--- a/src/plugin/subcollection/plugin.xml
+++ b/src/plugin/subcollection/plugin.xml
@@ -21,16 +21,16 @@
version="1.0.0"
provider-name="apache.org">
- <requires>
- <import plugin="nutch-extensionpoints"/>
- </requires>
-
<runtime>
<library name="subcollection.jar">
<export name="*"/>
</library>
</runtime>
+ <requires>
+ <import plugin="nutch-extensionpoints"/>
+ </requires>
+
<extension id="org.apache.nutch.indexer.subcollection.indexing"
name="Subcollection Indexing Filter"
point="org.apache.nutch.indexer.IndexingFilter">
diff --git a/src/plugin/urlnormalizer-regex/plugin.xml b/src/plugin/urlnormalizer-regex/plugin.xml
index 20161ce..e75096f 100644
--- a/src/plugin/urlnormalizer-regex/plugin.xml
+++ b/src/plugin/urlnormalizer-regex/plugin.xml
@@ -28,7 +28,7 @@
</runtime>
<requires>
- <include file="${nutch.root}/ivy/ivy-configurations.xml"/>
+ <import plugin="nutch-extensionpoints"/>
</requires>
<extension id="org.apache.nutch.net.urlnormalizer.regex"