NUTCH-2457 Embedded documents likely not correctly parsed by Tika
- add unit test for embedded documents
diff --git a/src/plugin/parse-tika/build.xml b/src/plugin/parse-tika/build.xml
index bda9e89..b17643d 100644
--- a/src/plugin/parse-tika/build.xml
+++ b/src/plugin/parse-tika/build.xml
@@ -35,6 +35,7 @@
       <include name="ootest.*"/>
       <include name="*.doc"/>
       <include name="*.gif"/>
+      <include name="*.docx"/>
     </fileset>
   </copy>
   
diff --git a/src/plugin/parse-tika/sample/test_recursive_embedded.docx b/src/plugin/parse-tika/sample/test_recursive_embedded.docx
new file mode 100644
index 0000000..cd562cb
--- /dev/null
+++ b/src/plugin/parse-tika/sample/test_recursive_embedded.docx
Binary files differ
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
new file mode 100644
index 0000000..f82dacd
--- /dev/null
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestEmbeddedDocuments.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse.tika;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.File;
+
+/**
+ * Unit tests for MSWordParser.
+ * 
+ * @author John Xing
+ */
+public class TestEmbeddedDocuments {
+
+  private String fileSeparator = System.getProperty("file.separator");
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
+  private String[] sampleFiles = { "test_recursive_embedded.docx" };
+
+  private String expectedText = "When in the Course of human events";
+
+  private Configuration conf;
+
+  @Before
+  public void setUp() {
+    conf = NutchConfiguration.create();
+    conf.set("file.content.limit", "-1");
+  }
+
+  public String getTextContent(String fileName) throws ProtocolException,
+      ParseException {
+    String urlString = "file:" + sampleDir + fileSeparator + fileName;
+    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+    Content content = protocol.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+    Parse parse = new ParseUtil(conf).parseByExtensionId("parse-tika", content)
+        .get(content.getUrl());
+    return parse.getText();
+  }
+
+  @Test
+  public void testIt() throws ProtocolException, ParseException {
+    for (int i = 0; i < sampleFiles.length; i++) {
+      String found = getTextContent(sampleFiles[i]);
+      Assert.assertTrue("text found : '" + found + "'",
+          found.contains(expectedText));
+    }
+  }
+
+  @Test
+  public void testOpeningDocs() throws ProtocolException, ParseException {
+    String[] filenames = new File(sampleDir).list();
+    for (int i = 0; i < filenames.length; i++) {
+      Assert.assertTrue("can't read content of " + filenames[i],
+          getTextContent(filenames[i]).length() > 0);
+    }
+  }
+}
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
index 7183ceb..37c536c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java
@@ -44,8 +44,7 @@
   // This system property is defined in ./src/plugin/build-plugin.xml
   private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-msword/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-msword/sample/README.txt for what they are.
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "word97.doc" };
 
   private String expectedText = "This is a sample doc file prepared for nutch.";
@@ -84,7 +83,7 @@
     for (int i = 0; i < filenames.length; i++) {
       if (filenames[i].endsWith(".doc") == false)
         continue;
-      Assert.assertTrue("cann't read content of " + filenames[i],
+      Assert.assertTrue("can't read content of " + filenames[i],
           getTextContent(filenames[i]).length() > 0);
     }
   }
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
index b0226d9..93c0a2c 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java
@@ -41,7 +41,7 @@
   // This system property is defined in ./src/plugin/build-plugin.xml
   private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-oo/build.xml during plugin compilation.
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };
 
   private String expectedText;
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
index 36b2ecf..fff6e9a 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java
@@ -41,8 +41,7 @@
   // This system property is defined in ./src/plugin/build-plugin.xml
   private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-pdf/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-pdf/sample/README.txt for what they are.
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String[] sampleFiles = { "pdftest.pdf", "encrypted.pdf" };
 
   private String expectedText = "A VERY SMALL PDF FILE";
diff --git a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
index b45a20f..115220b 100644
--- a/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
+++ b/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java
@@ -47,8 +47,7 @@
   // This system property is defined in ./src/plugin/build-plugin.xml
   private String sampleDir = System.getProperty("test.data", ".");
   // Make sure sample files are copied to "test.data" as specified in
-  // ./src/plugin/parse-rtf/build.xml during plugin compilation.
-  // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
   private String rtfFile = "test.rtf";
 
   @Ignore("There seems to be an issue with line 71 e.g. text.trim()")