Merge pull request #525 from sebastian-nagel/NUTCH-1945 NUTCH-1945 Test for XLSX parser

commit: e61a8a3b0af5540ffe23f862e2af69092114f506 [log] [tgz]
author: Sebastian Nagel <snagel@apache.org> Tue May 12 15:35:09 2020 +0200
committer: GitHub <noreply@github.com> Tue May 12 15:35:09 2020 +0200
tree: 6c63331240c3356816d0d9899205e0ee8192e83b
parent: ec93b3359d207fef62378cbd15bb63c7acc33f66 [diff]
parent: 0341f0dfa156d3963e88b2cb9507013b0eef8668 [diff]
diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml
index b17643d..af3e610 100644
--- a/src/plugin/parse-tika/build.xml
+++ b/src/plugin/parse-tika/build.xml

@@ -36,6 +36,7 @@
       <include name="*.doc"/>
       <include name="*.gif"/>
       <include name="*.docx"/>
+      <include name="*.xlsx"/>
     </fileset>
   </copy>
   

diff --git a/src/plugin/parse-tika/sample/test.xlsx b/src/plugin/parse-tika/sample/test.xlsx
new file mode 100644
index 0000000..de33f28
--- /dev/null
+++ b/src/plugin/parse-tika/sample/test.xlsx
Binary files differ

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
index cecf251..79ed286 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java

@@ -16,59 +16,29 @@
  */
 package org.apache.nutch.parse.tika;
 
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.ProtocolException;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
-import java.io.File;
-
 /**
  * Unit tests for MSWordParser.
- * 
- * @author John Xing
  */
-public class TestEmbeddedDocuments {
+public class TestEmbeddedDocuments extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "test_recursive_embedded.docx" };
 
   private String expectedText = "When in the Course of human events";
 
-  private Configuration conf;
-
   @Before
   public void setUp() {
-    conf = NutchConfiguration.create();
-    conf.set("file.content.limit", "-1");
+    super.setUp();
     conf.setBoolean("tika.parse.embedded", true);
   }
 
-  public String getTextContent(String fileName) throws ProtocolException,
-      ParseException {
-    String urlString = "file:" + sampleDir + fileSeparator + fileName;
-    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Content content = protocol.getProtocolOutput(new Text(urlString),
-        new CrawlDatum()).getContent();
-    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-        .get(content.getUrl());
-    return parse.getText();
-  }
-
   @Test
   public void testIt() throws ProtocolException, ParseException {
     for (int i = 0; i < sampleFiles.length; i++) {

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
index 87b452c..94eec53 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java

@@ -26,7 +26,6 @@
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseException;
 import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.tika.TikaParser;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolException;
@@ -34,18 +33,9 @@
 import org.apache.nutch.util.NutchConfiguration;
 
 /**
- * 
- * @author mattmann / jnioche
- * 
- *         Test Suite for the RSS feeds with the {@link TikaParser}.
- * 
+ * Test Suite for the RSS feeds with the {@link TikaParser}.
  */
-public class TestFeedParser {
-
-  private String fileSeparator = System.getProperty("file.separator");
-
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
+public class TestFeedParser extends TikaParserTest {
 
   private String[] sampleFiles = { "rsstest.rss" };
 

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
index 4924511..781e891 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java

@@ -22,7 +22,6 @@
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.tika.TikaParser;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.Parser;

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
index 779278c..0f1505d 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java

@@ -33,11 +33,8 @@
 /**
  * Test extraction of image metadata
  */
-public class TestImageMetadata {
+public class TestImageMetadata extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   private String[] sampleFiles = { "nutch_logo_tm.gif", };
 

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
index 37c536c..c5062f6 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java

@@ -16,58 +16,24 @@
  */
 package org.apache.nutch.parse.tika;
 
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
 import java.io.File;
 
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.ProtocolException;
+import org.junit.Assert;
+import org.junit.Test;
+
 /**
  * Unit tests for MSWordParser.
- * 
- * @author John Xing
  */
-public class TestMSWordParser {
+public class TestMSWordParser extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "word97.doc" };
 
   private String expectedText = "This is a sample doc file prepared for nutch.";
 
-  private Configuration conf;
-
-  @Before
-  public void setUp() {
-    conf = NutchConfiguration.create();
-    conf.set("file.content.limit", "-1");
-  }
-
-  public String getTextContent(String fileName) throws ProtocolException,
-      ParseException {
-    String urlString = "file:" + sampleDir + fileSeparator + fileName;
-    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
-    Content content = protocol.getProtocolOutput(new Text(urlString),
-        new CrawlDatum()).getContent();
-    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-        .get(content.getUrl());
-    return parse.getText();
-  }
-
   @Test
   public void testIt() throws ProtocolException, ParseException {
     for (int i = 0; i < sampleFiles.length; i++) {

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
index 93c0a2c..41c47e9 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java

@@ -19,27 +19,16 @@
 import java.io.FileInputStream;
 import java.io.InputStreamReader;
 
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.protocol.*;
-import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.protocol.ProtocolException;
 import org.junit.Assert;
 import org.junit.Test;
 
 /**
  * Unit tests for OOParser.
- * 
- * @author Andrzej Bialecki
  */
-public class TestOOParser {
+public class TestOOParser extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
@@ -50,28 +39,16 @@
 
   @Test
   public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Content content;
-    Parse parse;
-    Configuration conf = NutchConfiguration.create();
-    Protocol protocol;
-    ProtocolFactory factory = new ProtocolFactory(conf);
 
     System.out.println("Expected : " + expectedText);
 
     for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
+ 
       if (sampleFiles[i].startsWith("ootest") == false)
         continue;
 
-      protocol = factory.getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+      String text = getTextContent(sampleFiles[i]).replaceAll("[ \t\r\n]+", " ")
+          .trim();
 
       // simply test for the presence of a text - the ordering of the elements
       // may differ from what was expected

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
index fff6e9a..784b55c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java

@@ -16,30 +16,16 @@
  */
 package org.apache.nutch.parse.tika;
 
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.ProtocolException;
 import org.junit.Assert;
 import org.junit.Test;
 
 /**
  * Unit tests for PdfParser.
- * 
- * @author John Xing
  */
-public class TestPdfParser {
+public class TestPdfParser extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
@@ -48,22 +34,8 @@
 
   @Test
   public void testIt() throws ProtocolException, ParseException {
-    String urlString;
-    Protocol protocol;
-    Content content;
-    Parse parse;
-
     for (int i = 0; i < sampleFiles.length; i++) {
-      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
-      Configuration conf = NutchConfiguration.create();
-      protocol = new ProtocolFactory(conf).getProtocol(urlString);
-      content = protocol.getProtocolOutput(new Text(urlString),
-          new CrawlDatum()).getContent();
-      parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
-          .get(content.getUrl());
-
-      int index = parse.getText().indexOf(expectedText);
+      int index = getTextContent(sampleFiles[i]).indexOf(expectedText);
       Assert.assertTrue(index > 0);
     }
   }

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index 115220b..4de9d85 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java

@@ -16,41 +16,29 @@
  */
 package org.apache.nutch.parse.tika;
 
-// Nutch imports
+import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.metadata.DublinCore;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolException;
 import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
 import org.junit.Assert;
-import org.junit.Ignore;
 import org.junit.Test;
 
 /**
- * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests).
- * 
- * @author Andy Hedges
+ * Unit tests for parsing RTF documents with the Tika parser plugin.
  */
-public class TestRTFParser {
+public class TestRTFParser extends TikaParserTest {
 
-  private String fileSeparator = System.getProperty("file.separator");
-  // This system property is defined in ./src/plugin/build-plugin.xml
-  private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
   // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String rtfFile = "test.rtf";
 
-  @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
   @Test
   public void testIt() throws ProtocolException, ParseException {
 
@@ -59,7 +47,6 @@
     Content content;
     Parse parse;
 
-    Configuration conf = NutchConfiguration.create();
     urlString = "file:" + sampleDir + fileSeparator + rtfFile;
     protocol = new ProtocolFactory(conf).getProtocol(urlString);
     content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
@@ -67,8 +54,7 @@
     parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(
         content.getUrl());
     String text = parse.getText();
-    Assert.assertEquals("The quick brown fox jumps over the lazy dog",
-        text.trim());
+    Assert.assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
 
     String title = parse.getData().getTitle();
     Metadata meta = parse.getData().getParseMeta();

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java
new file mode 100644
index 0000000..85427db
--- /dev/null
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java

@@ -0,0 +1,47 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.ProtocolException;
+import org.junit.Test;
+
+/**
+ * Unit test for parsing XLSX spreadsheets with the parse-tika plugin.
+ */
+public class TestXlsxParser extends TikaParserTest {
+
+  /**
+   * Parses the sample file test.xlsx and checks that the extracted text,
+   * after white space normalization, equals the expected cell content.
+   */
+  @Test
+  public void testIt() throws ProtocolException, ParseException, IOException {
+    String found = getTextContent("test.xlsx");
+    String expected = "test.txt This is a test for spreadsheets xlsx";
+    // text is distributed over columns and rows, need to normalize white space
+    found = found.replaceAll("\\s+", " ").trim();
+    // JUnit expects (expected, actual); the order determines how a failure
+    // message labels the two strings
+    assertEquals(expected, found);
+  }
+
+}

diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java
new file mode 100644
index 0000000..781debb
--- /dev/null
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java

@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Before;
+
+/**
+ * Common base class for the Tika parser tests: provides the configuration,
+ * the location of the sample files and a helper to extract text from them.
+ */
+public class TikaParserTest {
+
+  /** Value of the system property "file.separator". */
+  protected String fileSeparator = System.getProperty("file.separator");
+
+  /**
+   * Directory holding the sample files, taken from the system property
+   * "test.data" (set in src/plugin/build-plugin.xml), "." if not set. Sample
+   * files need to be listed in src/plugin/parse-tika/build.xml in order to
+   * be copied there.
+   */
+  protected String sampleDir = System.getProperty("test.data", ".");
+
+  /** Configuration used by the tests, created anew in {@link #setUp()}. */
+  protected Configuration conf;
+
+  /**
+   * Creates a new Nutch configuration and sets "file.content.limit" to -1.
+   */
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+  }
+
+  /**
+   * Retrieves a sample file through the protocol selected for its "file:"
+   * URL and parses the content with the "parse-tika" extension.
+   *
+   * @param fileName
+   *          name of a file located in {@link #sampleDir}
+   * @return the text of the parse stored under the URL of the content
+   * @throws ProtocolException
+   *           as declared; presumably raised by the protocol lookup
+   * @throws ParseException
+   *           as declared; presumably raised while parsing the content
+   */
+  public String getTextContent(String fileName)
+      throws ProtocolException, ParseException {
+    final String fileUrl = "file:" + sampleDir + fileSeparator + fileName;
+    ProtocolFactory protocols = new ProtocolFactory(conf);
+    Protocol fileProtocol = protocols.getProtocol(fileUrl);
+    Content fetched = fileProtocol
+        .getProtocolOutput(new Text(fileUrl), new CrawlDatum()).getContent();
+    ParseUtil parser = new ParseUtil(conf);
+    return parser.parseByExtensionId("parse-tika", fetched)
+        .get(fetched.getUrl()).getText();
+  }
+
+}
commit	e61a8a3b0af5540ffe23f862e2af69092114f506	[log] [tgz]
author	Sebastian Nagel <snagel@apache.org>	Tue May 12 15:35:09 2020 +0200
committer	GitHub <noreply@github.com>	Tue May 12 15:35:09 2020 +0200
tree	6c63331240c3356816d0d9899205e0ee8192e83b
parent	ec93b3359d207fef62378cbd15bb63c7acc33f66 [diff]
parent	0341f0dfa156d3963e88b2cb9507013b0eef8668 [diff]