Merge pull request #525 from sebastian-nagel/NUTCH-1945
NUTCH-1945 Test for XLSX parser
diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml
index b17643d..af3e610 100644
--- a/src/plugin/parse-tika/build.xml
+++ b/src/plugin/parse-tika/build.xml
@@ -36,6 +36,7 @@
<include name="*.doc"/>
<include name="*.gif"/>
<include name="*.docx"/>
+ <include name="*.xlsx"/>
</fileset>
</copy>
diff --git a/src/plugin/parse-tika/sample/test.xlsx b/src/plugin/parse-tika/sample/test.xlsx
new file mode 100644
index 0000000..de33f28
--- /dev/null
+++ b/src/plugin/parse-tika/sample/test.xlsx
Binary files differ
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
index cecf251..79ed286 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
@@ -16,59 +16,29 @@
*/
package org.apache.nutch.parse.tika;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.ProtocolException;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
-import java.io.File;
-
/**
* Unit tests for MSWordParser.
- *
- * @author John Xing
*/
-public class TestEmbeddedDocuments {
+public class TestEmbeddedDocuments extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "test_recursive_embedded.docx" };
private String expectedText = "When in the Course of human events";
- private Configuration conf;
-
@Before
public void setUp() {
- conf = NutchConfiguration.create();
- conf.set("file.content.limit", "-1");
+ super.setUp();
conf.setBoolean("tika.parse.embedded", true);
}
- public String getTextContent(String fileName) throws ProtocolException,
- ParseException {
- String urlString = "file:" + sampleDir + fileSeparator + fileName;
- Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
- Content content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
- return parse.getText();
- }
-
@Test
public void testIt() throws ProtocolException, ParseException {
for (int i = 0; i < sampleFiles.length; i++) {
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
index 87b452c..94eec53 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestFeedParser.java
@@ -26,7 +26,6 @@
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseException;
import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.tika.TikaParser;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolException;
@@ -34,18 +33,9 @@
import org.apache.nutch.util.NutchConfiguration;
/**
- *
- * @author mattmann / jnioche
- *
- * Test Suite for the RSS feeds with the {@link TikaParser}.
- *
+ * Test Suite for the RSS feeds with the {@link TikaParser}.
*/
-public class TestFeedParser {
-
- private String fileSeparator = System.getProperty("file.separator");
-
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
+public class TestFeedParser extends TikaParserTest {
private String[] sampleFiles = { "rsstest.rss" };
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
index 4924511..781e891 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestHtmlParser.java
@@ -22,7 +22,6 @@
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.parse.tika.TikaParser;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.Parser;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
index 779278c..0f1505d 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestImageMetadata.java
@@ -33,11 +33,8 @@
/**
* Test extraction of image metadata
*/
-public class TestImageMetadata {
+public class TestImageMetadata extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
private String[] sampleFiles = { "nutch_logo_tm.gif", };
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
index 37c536c..c5062f6 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
@@ -16,58 +16,24 @@
*/
package org.apache.nutch.parse.tika;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.junit.Assert;
-import org.junit.Before;
-import org.junit.Test;
-
import java.io.File;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.ProtocolException;
+import org.junit.Assert;
+import org.junit.Test;
+
/**
* Unit tests for MSWordParser.
- *
- * @author John Xing
*/
-public class TestMSWordParser {
+public class TestMSWordParser extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "word97.doc" };
private String expectedText = "This is a sample doc file prepared for nutch.";
- private Configuration conf;
-
- @Before
- public void setUp() {
- conf = NutchConfiguration.create();
- conf.set("file.content.limit", "-1");
- }
-
- public String getTextContent(String fileName) throws ProtocolException,
- ParseException {
- String urlString = "file:" + sampleDir + fileSeparator + fileName;
- Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
- Content content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
- return parse.getText();
- }
-
@Test
public void testIt() throws ProtocolException, ParseException {
for (int i = 0; i < sampleFiles.length; i++) {
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
index 93c0a2c..41c47e9 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
@@ -19,27 +19,16 @@
import java.io.FileInputStream;
import java.io.InputStreamReader;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.protocol.*;
-import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseException;
-import org.apache.nutch.parse.ParseUtil;
-import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.protocol.ProtocolException;
import org.junit.Assert;
import org.junit.Test;
/**
* Unit tests for OOParser.
- *
- * @author Andrzej Bialecki
*/
-public class TestOOParser {
+public class TestOOParser extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
@@ -50,28 +39,16 @@
@Test
public void testIt() throws ProtocolException, ParseException {
- String urlString;
- Content content;
- Parse parse;
- Configuration conf = NutchConfiguration.create();
- Protocol protocol;
- ProtocolFactory factory = new ProtocolFactory(conf);
System.out.println("Expected : " + expectedText);
for (int i = 0; i < sampleFiles.length; i++) {
- urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
+
if (sampleFiles[i].startsWith("ootest") == false)
continue;
- protocol = factory.getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
-
- String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();
+ String text = getTextContent(sampleFiles[i]).replaceAll("[ \t\r\n]+", " ")
+ .trim();
// simply test for the presence of a text - the ordering of the elements
// may differ from what was expected
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
index fff6e9a..784b55c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
@@ -16,30 +16,16 @@
*/
package org.apache.nutch.parse.tika;
-import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.util.NutchConfiguration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.ProtocolException;
import org.junit.Assert;
import org.junit.Test;
/**
* Unit tests for PdfParser.
- *
- * @author John Xing
*/
-public class TestPdfParser {
+public class TestPdfParser extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-tika/build.xml during plugin compilation.
private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
@@ -48,22 +34,8 @@
@Test
public void testIt() throws ProtocolException, ParseException {
- String urlString;
- Protocol protocol;
- Content content;
- Parse parse;
-
for (int i = 0; i < sampleFiles.length; i++) {
- urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
-
- Configuration conf = NutchConfiguration.create();
- protocol = new ProtocolFactory(conf).getProtocol(urlString);
- content = protocol.getProtocolOutput(new Text(urlString),
- new CrawlDatum()).getContent();
- parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
- .get(content.getUrl());
-
- int index = parse.getText().indexOf(expectedText);
+ int index = getTextContent(sampleFiles[i]).indexOf(expectedText);
Assert.assertTrue(index > 0);
}
}
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index 115220b..4de9d85 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
@@ -16,41 +16,29 @@
*/
package org.apache.nutch.parse.tika;
-// Nutch imports
+import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.DublinCore;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
-import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.ProtocolFactory;
-import org.apache.nutch.util.NutchConfiguration;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
import org.junit.Assert;
-import org.junit.Ignore;
import org.junit.Test;
/**
- * Unit tests for TestRTFParser. (Adapted from John Xing msword unit tests).
- *
- * @author Andy Hedges
+ * Unit tests for the RTF parser.
*/
-public class TestRTFParser {
+public class TestRTFParser extends TikaParserTest {
- private String fileSeparator = System.getProperty("file.separator");
- // This system property is defined in ./src/plugin/build-plugin.xml
- private String sampleDir = System.getProperty("test.data", ".");
// Make sure sample files are copied to "test.data" as specified in
// ./src/plugin/parse-tika/build.xml during plugin compilation.
private String rtfFile = "test.rtf";
- @Ignore("There seems to be an issue with line 71 e.g. text.trim()")
@Test
public void testIt() throws ProtocolException, ParseException {
@@ -59,7 +47,6 @@
Content content;
Parse parse;
- Configuration conf = NutchConfiguration.create();
urlString = "file:" + sampleDir + fileSeparator + rtfFile;
protocol = new ProtocolFactory(conf).getProtocol(urlString);
content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum())
@@ -67,8 +54,7 @@
parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content).get(
content.getUrl());
String text = parse.getText();
- Assert.assertEquals("The quick brown fox jumps over the lazy dog",
- text.trim());
+ Assert.assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
String title = parse.getData().getTitle();
Metadata meta = parse.getData().getParseMeta();
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java
new file mode 100644
index 0000000..85427db
--- /dev/null
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestXlsxParser.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.protocol.ProtocolException;
+import org.junit.Test;
+
+/** Verifies the text extracted by parse-tika from the sample spreadsheet test.xlsx. */
+public class TestXlsxParser extends TikaParserTest {
+  @Test
+  public void testIt() throws ProtocolException, ParseException, IOException {
+    String found = getTextContent("test.xlsx");
+    String expected = "test.txt This is a test for spreadsheets xlsx";
+    // text is distributed over columns and rows, need to normalize white space
+    found = found.replaceAll("\\s+", " ").trim();
+    // expected value goes first so JUnit's failure message labels both values correctly
+    assertEquals(expected, found);
+  }
+}
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java
new file mode 100644
index 0000000..781debb
--- /dev/null
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TikaParserTest.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Before;
+
+/**
+ * Base class to extend Tika parser tests from.
+ */
+public class TikaParserTest {
+ /** File separator taken from the "file.separator" system property; used to build sample file URLs. */
+ protected String fileSeparator = System.getProperty("file.separator");
+
+ /**
+ * Folder with test data, defined in src/plugin/build-plugin.xml. Make sure
+ * that all sample files are copied to "test.data", they must be listed in
+ * src/plugin/parse-tika/build.xml
+ */
+ protected String sampleDir = System.getProperty("test.data", ".");
+ /** Configuration used for protocol lookup and parsing; assigned in setUp() and visible to subclasses. */
+ protected Configuration conf;
+ /** Creates a new Nutch configuration and sets "file.content.limit" to "-1" (presumably no size limit - confirm). */
+ @Before
+ public void setUp() {
+ conf = NutchConfiguration.create();
+ conf.set("file.content.limit", "-1");
+ }
+ /** Fetches fileName from sampleDir via a "file:" URL, parses it with the "parse-tika" extension, returns its text. */
+ public String getTextContent(String fileName)
+ throws ProtocolException, ParseException {
+ String urlString = "file:" + sampleDir + fileSeparator + fileName;
+ Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+ Content content = protocol
+ .getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
+ Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+ .get(content.getUrl());
+ return parse.getText();
+ }
+
+}