Merge pull request #474 from sebastian-nagel/NUTCH-2457-parse-tika-embedded-docs
NUTCH-2457 Embedded documents likely not correctly parsed by Tika
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index c5359bc..6bbf7dd 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1627,6 +1627,14 @@
</description>
</property>
+<property>
+ <name>tika.parse.embedded</name>
+ <value>true</value>
+ <description>
+ Whether parse-tika shall parse embedded documents (even recursively).
+ </description>
+</property>
+
<!-- urlfilter plugin properties -->
<property>
diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml
index bda9e89..b17643d 100644
--- a/src/plugin/parse-tika/build.xml
+++ b/src/plugin/parse-tika/build.xml
@@ -35,6 +35,7 @@
<include name="ootest.*"/>
<include name="*.doc"/>
<include name="*.gif"/>
+ <include name="*.docx"/>
</fileset>
</copy>
diff --git a/src/plugin/parse-tika/sample/test_recursive_embedded.docx b/src/plugin/parse-tika/sample/test_recursive_embedded.docx
new file mode 100644
index 0000000..cd562cb
--- /dev/null
+++ b/src/plugin/parse-tika/sample/test_recursive_embedded.docx
Binary files differ
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 3a48c98..f2461fe 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -45,6 +45,7 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -73,9 +74,10 @@
private HtmlParseFilters htmlParseFilters;
private String cachingPolicy;
private HtmlMapper HTMLMapper;
+ private boolean parseEmbedded = true;
private boolean upperCaseElementNames = true;
- private String boilerpipeExtractorName;
private boolean useBoilerpipe;
+ private String boilerpipeExtractorName;
private Set<String> boilerpipeMimeTypes;
public ParseResult getParse(Content content) {
@@ -134,6 +136,10 @@
LinkContentHandler linkContentHandler = new LinkContentHandler();
ParseContext context = new ParseContext();
+ if (parseEmbedded) {
+ context.set(Parser.class, new AutoDetectParser(tikaConfig));
+ }
+
TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler,
linkContentHandler);
@@ -309,6 +315,7 @@
boilerpipeMimeTypes = new HashSet<>(Arrays
.asList(conf.getTrimmedStrings("tika.extractor.boilerpipe.mime.types",
"text/html", "application/xhtml+xml")));
+ parseEmbedded = conf.getBoolean("tika.parse.embedded", true);
}
public Configuration getConf() {
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
new file mode 100644
index 0000000..cecf251
--- /dev/null
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Unit test for parsing documents embedded in other documents with parse-tika.
+ *
+ * @author John Xing
+ */
+public class TestEmbeddedDocuments {
+
+ private String fileSeparator = System.getProperty("file.separator");
+ // This system property is defined in ./src/plugin/build-plugin.xml
+ private String sampleDir = System.getProperty("test.data", ".");
+ // Make sure sample files are copied to "test.data" as specified in
+ // ./src/plugin/parse-tika/build.xml during plugin compilation.
+ private String[] sampleFiles = { "test_recursive_embedded.docx" };
+
+ private String expectedText = "When in the Course of human events";
+
+ private Configuration conf;
+
+ @Before
+ public void setUp() {
+ conf = NutchConfiguration.create();
+ conf.set("file.content.limit", "-1");
+ conf.setBoolean("tika.parse.embedded", true);
+ }
+
+ public String getTextContent(String fileName) throws ProtocolException,
+ ParseException {
+ String urlString = "file:" + sampleDir + fileSeparator + fileName;
+ Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ Content content = protocol.getProtocolOutput(new Text(urlString),
+ new CrawlDatum()).getContent();
+ Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+ .get(content.getUrl());
+ return parse.getText();
+ }
+
+ @Test
+ public void testIt() throws ProtocolException, ParseException {
+ for (int i = 0; i < sampleFiles.length; i++) {
+ String found = getTextContent(sampleFiles[i]);
+ Assert.assertTrue("text found : '" + found + "'",
+ found.contains(expectedText));
+ }
+ }
+
+}
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
index 7183ceb..37c536c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
@@ -44,8 +44,7 @@
// This system property is defined in ./src/plugin/build-plugin.xml
private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
- // ./src/plugin/parse-msword/build.xml during plugin compilation.
- // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+ // ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "word97.doc" };
private String expectedText = "This is a sample doc file prepared for nutch.";
@@ -84,7 +83,7 @@
for (int i = 0; i < filenames.length; i++) {
if (filenames[i].endsWith(".doc") == false)
continue;
- Assert.assertTrue("cann't read content of " + filenames[i],
+ Assert.assertTrue("can't read content of " + filenames[i],
getTextContent(filenames[i]).length() > 0);
}
}
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
index b0226d9..93c0a2c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
@@ -41,7 +41,7 @@
// This system property is defined in ./src/plugin/build-plugin.xml
private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
- // ./src/plugin/parse-oo/build.xml during plugin compilation.
+ // ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
private String expectedText;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
index 36b2ecf..fff6e9a 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
@@ -41,8 +41,7 @@
// This system property is defined in ./src/plugin/build-plugin.xml
private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
- // ./src/plugin/parse-pdf/build.xml during plugin compilation.
- // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+ // ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
private String expectedText = "A VERY SMALL PDF FILE";
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index b45a20f..115220b 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
@@ -47,8 +47,7 @@
// This system property is defined in ./src/plugin/build-plugin.xml
private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
- // ./src/plugin/parse-rtf/build.xml during plugin compilation.
- // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
+ // ./src/plugin/parse-tika/build.xml during plugin compilation.
private String rtfFile = "test.rtf";
@Ignore("There seems to be an issue with line 71 e.g. text.trim()")