TIKA-2310 -- epub parser should parse content files whose names end in .xml
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 557d183..a38d45e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -151,7 +151,8 @@
meta.parse(zip, new DefaultHandler(), metadata, context);
} else if (entry.getName().endsWith(".htm") ||
entry.getName().endsWith(".html") ||
- entry.getName().endsWith(".xhtml")) {
+ entry.getName().endsWith(".xhtml") ||
+ entry.getName().endsWith(".xml")) {
content.parse(zip, bodyHandler, metadata, context);
}
entry = zip.getNextZipEntry();
@@ -276,10 +277,20 @@
Set<String> processed = new HashSet<>();
for (String id : contentOrderScraper.contentItems) {
HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
- if (hRefMediaPair != null &&
- hRefMediaPair.href != null) {
+ if (hRefMediaPair != null && hRefMediaPair.href != null) {
+ //check the declared media type (or, if none is declared, the file
+ //extension) because the content parser expects (x)html/xml input.
+ boolean shouldParse = false;
String href = hRefMediaPair.href.toLowerCase(Locale.US);
- if (href.endsWith("htm") || href.endsWith("html")) {
+ if (hRefMediaPair.media != null) {
+ String mediaType = hRefMediaPair.media.toLowerCase(Locale.US);
+ if (mediaType.contains("html")) {
+ shouldParse = true;
+ }
+ } else if (href.endsWith("htm") || href.endsWith("html") || href.endsWith(".xml")) {
+ shouldParse = true;
+ }
+ if (shouldParse) {
zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
if (zae != null) {
try (InputStream is = zipFile.getInputStream(zae)) {
@@ -319,6 +330,8 @@
return false;
} else if (lc.contains("x-ibooks")) {
return false;
+ } else if (lc.equals("application/x-dtbncx+xml")) {
+ return false;
}
return true;
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
index b3d2401..a16732c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/epub/EpubParserTest.java
@@ -70,7 +70,8 @@
List<Metadata> metadataList = getRecursiveMetadata("testEPUB.epub");
//test attachments
- assertEquals(3, metadataList.size());
+ assertEquals(2, metadataList.size());
+ assertEquals("image/jpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE));
String xml = metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT);
int tocIndex = xml.indexOf("h3 class=\"toc_heading\">Table of Contents<");
int ch1 = xml.indexOf("<h1>Chapter 1");
@@ -106,4 +107,13 @@
int ch2 = xml.indexOf("<h1>Chapter 2");
assert(ch1 < ch2);
}
+
+ @Test
+ public void testContentsWXMLExtensions() throws Exception {
+ //TIKA-2310: body text must be extracted from an epub whose content entries end in .xml (presumably the case for this fixture -- verify)
+ List<Metadata> metadataList = getRecursiveMetadata("testEPUB_xml_ext.epub");
+ assertEquals(1, metadataList.size());
+ assertContains("It was a bright cold day in April",
+ metadataList.get(0).get(RecursiveParserWrapperHandler.TIKA_CONTENT));
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testEPUB_xml_ext.epub b/tika-parsers/src/test/resources/test-documents/testEPUB_xml_ext.epub
new file mode 100644
index 0000000..dca56d0
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testEPUB_xml_ext.epub
Binary files differ