Merge pull request #474 from sebastian-nagel/NUTCH-2457-parse-tika-embedded-docs NUTCH-2457 Embedded documents likely not correctly parsed by Tika

commit: 9e49c3f6849baca6aa0af4a5536c2376347eebd5 [log] [tgz]
author: Sebastian Nagel <snagel@apache.org> Mon Sep 30 13:30:40 2019 +0200
committer: GitHub <noreply@github.com> Mon Sep 30 13:30:40 2019 +0200
tree: ee3330440310d258ee8193753710a0de02ae4c44
parent: 0f4692739889dd0276172782257978f4eadf5ec0 [diff]
parent: c9238a1b51093d847bd37f738826dd03f710981c [diff]
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index c5359bc..6bbf7dd 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml

@@ -1627,6 +1627,14 @@
   </description>
 </property>
 
+<property>
+  <name>tika.parse.embedded</name>
+  <value>true</value>
+  <description>
+    Whether parse-tika shall parse embedded documents (even recursively).
+  </description>
+</property>
+
 <!-- urlfilter plugin properties -->
 
 <property>

diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml
index bda9e89..b17643d 100644
--- a/src/plugin/parse-tika/build.xml
+++ b/src/plugin/parse-tika/build.xml

@@ -35,6 +35,7 @@
       <include name="ootest.*"/>
       <include name="*.doc"/>
       <include name="*.gif"/>
+      <include name="*.docx"/>
     </fileset>
   </copy>
   

diff --git a/src/plugin/parse-tika/sample/test_recursive_embedded.docx b/src/plugin/parse-tika/sample/test_recursive_embedded.docx
new file mode 100644
index 0000000..cd562cb
--- /dev/null
+++ b/src/plugin/parse-tika/sample/test_recursive_embedded.docx
Binary files differ

diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index 3a48c98..f2461fe 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java

@@ -45,6 +45,7 @@
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.CompositeParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -73,9 +74,10 @@
   private HtmlParseFilters htmlParseFilters;
   private String cachingPolicy;
   private HtmlMapper HTMLMapper;
+  private boolean parseEmbedded = true;
   private boolean upperCaseElementNames = true;
-  private String boilerpipeExtractorName;
   private boolean useBoilerpipe;
+  private String boilerpipeExtractorName;
   private Set<String> boilerpipeMimeTypes;
 
   public ParseResult getParse(Content content) {
@@ -134,6 +136,10 @@
     LinkContentHandler linkContentHandler = new LinkContentHandler();
 
     ParseContext context = new ParseContext();
+    if (parseEmbedded) {
+      context.set(Parser.class, new AutoDetectParser(tikaConfig));
+    }
+
     TeeContentHandler teeContentHandler = new TeeContentHandler(domHandler,
         linkContentHandler);
 
@@ -309,6 +315,7 @@
     boilerpipeMimeTypes = new HashSet<>(Arrays
         .asList(conf.getTrimmedStrings("tika.extractor.boilerpipe.mime.types",
             "text/html", "application/xhtml+xml")));
+    parseEmbedded = conf.getBoolean("tika.parse.embedded", true);
   }
 
   public Configuration getConf() {

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
new file mode 100644
index 0000000..cecf251
--- /dev/null
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java

@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Unit tests for parsing of (recursively) embedded documents by parse-tika.
+ * 
+ * @author John Xing
+ */
+public class TestEmbeddedDocuments {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
+  private String[] sampleFiles = { "test_recursive_embedded.docx" };
+
+  private String expectedText = "When in the Course of human events";
+
+  private Configuration conf;
+
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+    conf.setBoolean("tika.parse.embedded", true);
+  }
+
+  public String getTextContent(String fileName) throws ProtocolException,
+      ParseException {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+        .get(content.getUrl());
+    return parse.getText();
+  }
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String found = getTextContent(sampleFiles[i]);
+      Assert.assertTrue("text found : '" + found + "'",
+          found.contains(expectedText));
+    }
+  }
+
+}

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
index 7183ceb..37c536c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java

@@ -44,8 +44,7 @@
   // This system property is defined in ./src/plugin/build-plugin.xml
   private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-msword/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "word97.doc" };
 
   private String expectedText = "This is a sample doc file prepared for nutch.";
@@ -84,7 +83,7 @@
     for (int i = 0; i < filenames.length; i++) {
       if (filenames[i].endsWith(".doc") == false)
         continue;
-      Assert.assertTrue("cann't read content of " + filenames[i],
+      Assert.assertTrue("can't read content of " + filenames[i],
           getTextContent(filenames[i]).length() > 0);
     }
   }

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
index b0226d9..93c0a2c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java

@@ -41,7 +41,7 @@
   // This system property is defined in ./src/plugin/build-plugin.xml
   private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-oo/build.xml during plugin compilation.
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
 
   private String expectedText;

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
index 36b2ecf..fff6e9a 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java

@@ -41,8 +41,7 @@
   // This system property is defined in ./src/plugin/build-plugin.xml
   private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-pdf/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
 
   private String expectedText = "A VERY SMALL PDF FILE";

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index b45a20f..115220b 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java

@@ -47,8 +47,7 @@
   // This system property is defined in ./src/plugin/build-plugin.xml
   private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String rtfFile = "test.rtf";
 
   @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
commit	9e49c3f6849baca6aa0af4a5536c2376347eebd5	[log] [tgz]
author	Sebastian Nagel <snagel@apache.org>	Mon Sep 30 13:30:40 2019 +0200
committer	GitHub <noreply@github.com>	Mon Sep 30 13:30:40 2019 +0200
tree	ee3330440310d258ee8193753710a0de02ae4c44
parent	0f4692739889dd0276172782257978f4eadf5ec0 [diff]
parent	c9238a1b51093d847bd37f738826dd03f710981c [diff]