blob: 006e72a457cd7dc78c30d46ed57ef3d1871db309 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.odf;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.junit.Test;
import org.xml.sax.ContentHandler;

import org.apache.tika.TikaTest;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.EmptyParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
public class ODFParserTest extends TikaTest {
/**
 * Supplies the parser implementations that the looping tests iterate over.
 * At present the array holds a single {@link OpenDocumentParser}; the array
 * form allows the same test body to be run against several implementations.
 *
 * @return the parsers to exercise
 */
private Parser[] getParsers() {
    Parser[] parsersUnderTest = {new OpenDocumentParser()};
    return parsersUnderTest;
}
/**
 * Extracts the body text of testODFwithOOo3.odt with every parser returned by
 * {@link #getParsers()} and checks the reported content type together with a
 * handful of phrases expected in the text.
 */
@Test
public void testOO3() throws Exception {
    String[] expectedPhrases = {
            "Tika is part of the Lucene project.",
            "Solr",
            "one embedded",
            "Rectangle Title",
            "a blue background and dark border"
    };
    for (Parser candidate : getParsers()) {
        try (InputStream stream = getResourceAsStream("/test-documents/testODFwithOOo3.odt")) {
            Metadata meta = new Metadata();
            ContentHandler bodyHandler = new BodyContentHandler();
            candidate.parse(stream, bodyHandler, meta, new ParseContext());

            assertEquals("application/vnd.oasis.opendocument.text",
                    meta.get(Metadata.CONTENT_TYPE));

            String text = bodyHandler.toString();
            for (String phrase : expectedPhrases) {
                assertContains(phrase, text);
            }
        }
    }
}
/**
 * Parses testOpenOffice2.odt with every parser returned by {@link #getParsers()}
 * and verifies the content type, core metadata, document statistics and a
 * sentence from the extracted body text.
 */
@Test
public void testOO2() throws Exception {
    for (Parser parser : getParsers()) {
        try (InputStream input = getResourceAsStream("/test-documents/testOpenOffice2.odt")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            parser.parse(input, handler, metadata, new ParseContext());

            assertEquals("application/vnd.oasis.opendocument.text",
                    metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("en-US", metadata.get(TikaCoreProperties.LANGUAGE));
            assertEquals("PT1M7S", metadata.get(OfficeOpenXMLExtended.TOTAL_TIME));
            assertEquals("NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
                    metadata.get("generator"));

            // Check date metadata
            assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
            assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));

            // Check the document statistics
            assertEquals("1", metadata.get(Office.PAGE_COUNT));
            assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
            assertEquals("14", metadata.get(Office.WORD_COUNT));
            assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
            assertEquals("0", metadata.get(Office.TABLE_COUNT));
            assertEquals("0", metadata.get(Office.OBJECT_COUNT));
            assertEquals("0", metadata.get(Office.IMAGE_COUNT));

            // Custom metadata tags present but without values
            assertNull(metadata.get("custom:Info 1"));
            assertNull(metadata.get("custom:Info 2"));
            assertNull(metadata.get("custom:Info 3"));
            assertNull(metadata.get("custom:Info 4"));

            // assertContains is the containment check the other tests in this class use
            String content = handler.toString();
            assertContains("This is a sample Open Office document," +
                    " written in NeoOffice 2.2.1 for the Mac.", content);
        }
    }
}
/**
 * Similar to {@link #testOO2()}, but using a different OO2 file
 * (a formula document) with different metadata in it: user-defined
 * properties are populated and no document statistics are present.
 */
@Test
public void testOO2Metadata() throws Exception {
    try (InputStream input = getResourceAsStream("/test-documents/testOpenOffice2.odf")) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        // Pass an explicit ParseContext: the three-argument parse() overload is deprecated
        new OpenDocumentParser().parse(input, handler, metadata, new ParseContext());

        assertEquals("application/vnd.oasis.opendocument.formula",
                metadata.get(Metadata.CONTENT_TYPE));
        assertNull(metadata.get(TikaCoreProperties.MODIFIED));
        assertEquals("2006-01-27T11:55:22", metadata.get(TikaCoreProperties.CREATED));
        assertEquals("The quick brown fox jumps over the lazy dog",
                metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Gym class featuring a brown fox and lazy dog",
                metadata.get(OfficeOpenXMLCore.SUBJECT));
        assertContains("Gym class featuring a brown fox and lazy dog",
                Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT)));
        assertEquals("PT0S", metadata.get(OfficeOpenXMLExtended.TOTAL_TIME));
        assertEquals("1", metadata.get("editing-cycles"));
        assertEquals("OpenOffice.org/2.2$Win32 OpenOffice.org_project/680m14$Build-9134",
                metadata.get("generator"));
        assertEquals("Pangram, fox, dog", metadata.get(TikaCoreProperties.SUBJECT));

        // User defined metadata
        assertEquals("Text 1", metadata.get("custom:Info 1"));
        assertEquals("2", metadata.get("custom:Info 2"));
        assertEquals("false", metadata.get("custom:Info 3"));
        assertEquals("true", metadata.get("custom:Info 4"));

        // No statistics present
        assertNull(metadata.get(Office.PAGE_COUNT));
        assertNull(metadata.get(Office.PARAGRAPH_COUNT));
        assertNull(metadata.get(Office.WORD_COUNT));
        assertNull(metadata.get(Office.CHARACTER_COUNT));
        assertNull(metadata.get(Office.TABLE_COUNT));
        assertNull(metadata.get(Office.OBJECT_COUNT));
        assertNull(metadata.get(Office.IMAGE_COUNT));
        assertNull(metadata.get("nbTab"));
        assertNull(metadata.get("nbObject"));
        assertNull(metadata.get("nbImg"));
        assertNull(metadata.get("nbPage"));
        assertNull(metadata.get("nbPara"));
        assertNull(metadata.get("nbWord"));
        assertNull(metadata.get("nbCharacter"));

        // Note - contents of maths files not currently supported
        String content = handler.toString().trim();
        assertEquals("", content);
    }
}
/**
 * Similar to {@link #testOO2()}, but using an OO3 file: checks the content
 * type, core and user-defined metadata, document statistics and the start
 * of the extracted body text.
 */
@Test
public void testOO3Metadata() throws Exception {
    try (InputStream input = getResourceAsStream("/test-documents/testODFwithOOo3.odt")) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        // Pass an explicit ParseContext: the three-argument parse() overload is deprecated
        new OpenDocumentParser().parse(input, handler, metadata, new ParseContext());

        assertEquals("application/vnd.oasis.opendocument.text",
                metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("2009-10-05T21:22:38", metadata.get(TikaCoreProperties.MODIFIED));
        assertEquals("2009-10-05T19:04:01", metadata.get(TikaCoreProperties.CREATED));
        assertEquals("Apache Tika", metadata.get(TikaCoreProperties.TITLE));
        assertEquals("Test document", metadata.get(OfficeOpenXMLCore.SUBJECT));
        assertContains("Test document",
                Arrays.asList(metadata.getValues(TikaCoreProperties.SUBJECT)));
        assertEquals("A rather complex document", metadata.get(TikaCoreProperties.DESCRIPTION));
        assertEquals("Bart Hanssens", metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("2", metadata.get("editing-cycles"));
        assertEquals("PT02H03M24S", metadata.get(OfficeOpenXMLExtended.TOTAL_TIME));
        assertEquals("OpenOffice.org/3.1$Unix OpenOffice.org_project/310m19$Build-9420",
                metadata.get("generator"));
        assertEquals("Apache, Lucene, Tika", metadata.get(TikaCoreProperties.SUBJECT));

        // User defined metadata
        assertEquals("Bart Hanssens", metadata.get("custom:Editor"));
        assertNull(metadata.get("custom:Info 2"));
        assertNull(metadata.get("custom:Info 3"));
        assertNull(metadata.get("custom:Info 4"));

        // Check the document statistics
        assertEquals("2", metadata.get(Office.PAGE_COUNT));
        assertEquals("13", metadata.get(Office.PARAGRAPH_COUNT));
        assertEquals("54", metadata.get(Office.WORD_COUNT));
        assertEquals("351", metadata.get(Office.CHARACTER_COUNT));
        assertEquals("0", metadata.get(Office.TABLE_COUNT));
        assertEquals("2", metadata.get(Office.OBJECT_COUNT));
        assertEquals("0", metadata.get(Office.IMAGE_COUNT));

        // assertContains is the containment check the other tests in this class use
        String content = handler.toString();
        assertContains("Apache Tika Tika is part of the Lucene project.", content);
    }
}
/**
 * Runs the auto-detecting parser over testMasterFooter.odp and checks that
 * the string "Master footer is here" appears in the extracted text.
 */
@Test
public void testODPMasterFooter() throws Exception {
    try (InputStream stream = getResourceAsStream("/test-documents/testMasterFooter.odp")) {
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        AUTO_DETECT_PARSER.parse(stream, bodyHandler, meta, new ParseContext());
        assertContains("Master footer is here", bodyHandler.toString());
    }
}
/**
 * Runs the auto-detecting parser over testFooter.odt and checks that three
 * expected strings, including the footer text, appear in the extracted text.
 */
@Test
public void testODTFooter() throws Exception {
    String[] expectedSnippets = {
            "Here is some text...",
            "Here is some text on page 2",
            "Here is footer text"
    };
    try (InputStream stream = getResourceAsStream("/test-documents/testFooter.odt")) {
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        AUTO_DETECT_PARSER.parse(stream, bodyHandler, meta, new ParseContext());

        String text = bodyHandler.toString();
        for (String snippet : expectedSnippets) {
            assertContains(snippet, text);
        }
    }
}
/**
 * Runs the auto-detecting parser over testFooter.ods and checks that the
 * string "Here is a footer in the center area" appears in the extracted text.
 */
@Test
public void testODSFooter() throws Exception {
    try (InputStream stream = getResourceAsStream("/test-documents/testFooter.ods")) {
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        AUTO_DETECT_PARSER.parse(stream, bodyHandler, meta, new ParseContext());
        assertContains("Here is a footer in the center area", bodyHandler.toString());
    }
}
/**
 * Parses testODFwithOOo3.odt through a {@link TikaInputStream} created from the
 * resource URL, first confirming the stream reports a backing file, and then
 * checks the content type and a sentence from the extracted text.
 */
@Test
public void testFromFile() throws Exception {
    try (TikaInputStream tis = TikaInputStream
            .get(getResourceAsUrl("/test-documents/testODFwithOOo3.odt"))) {
        // The stream must report a backing file for this test to be meaningful
        assertTrue(tis.hasFile());

        OpenDocumentParser parser = new OpenDocumentParser();
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        parser.parse(tis, handler, metadata, new ParseContext());

        assertEquals("application/vnd.oasis.opendocument.text",
                metadata.get(Metadata.CONTENT_TYPE));
        String content = handler.toString();
        assertContains("Tika is part of the Lucene project.", content);
    }
}
/**
 * Parses testNPEOpenDocument.odt through a {@link TikaInputStream} created from
 * the resource URL and checks the content type and a phrase from the text.
 */
@Test
public void testNPEFromFile() throws Exception {
    OpenDocumentParser odfParser = new OpenDocumentParser();
    try (TikaInputStream stream = TikaInputStream.get(
            getResourceAsUrl("/test-documents/testNPEOpenDocument.odt"))) {
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        odfParser.parse(stream, bodyHandler, meta, new ParseContext());

        assertEquals("application/vnd.oasis.opendocument.text",
                meta.get(Metadata.CONTENT_TYPE));
        assertContains("primero hay que generar un par de claves", bodyHandler.toString());
    }
}
/**
 * TIKA-1063: basic style support. Checks that the XHTML produced for
 * testStyles.odt contains the expected italic/bold/underline markup and
 * ordered/unordered list elements.
 */
@Test
public void testODTStyles() throws Exception {
    String[] expectedFragments = {
            "This <i>is</i> <b>just</b> a <u>test</u>",
            "<p>And <b>another <i>test</i> is</b> here.</p>",
            "<ol>\t<li><p>One</p>",
            "</ol>",
            "<ul>\t<li><p>First</p>",
            "</ul>"
    };
    String xhtml = getXML("testStyles.odt").xml;
    for (String fragment : expectedFragments) {
        assertContains(fragment, xhtml);
    }
}
/**
 * TIKA-1600: a null pointer must not break parsing. Parses
 * testODT-TIKA-6000.odt with a non-recursing parse context and checks the
 * content type and several phrases from the extracted text.
 */
@Test
public void testNullStylesInODTFooter() throws Exception {
    String[] expectedPhrases = {
            "Utilisation de ce document",
            "Copyright and License",
            "Changer la langue",
            "La page d’accueil permet de faire une recherche simple"
    };
    Parser odfParser = new OpenDocumentParser();
    try (InputStream stream = getResourceAsStream("/test-documents/testODT-TIKA-6000.odt")) {
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        odfParser.parse(stream, bodyHandler, meta, getNonRecursingParseContext());

        assertEquals("application/vnd.oasis.opendocument.text",
                meta.get(Metadata.CONTENT_TYPE));

        String text = bodyHandler.toString();
        for (String phrase : expectedPhrases) {
            assertContains(phrase, text);
        }
    }
}
/**
 * TIKA-1916: checks that "Test text" is present in the XHTML output for
 * testODTNoMeta.odt.
 */
@Test
public void testMissingMeta() throws Exception {
    assertContains("Test text", getXML("testODTNoMeta.odt").xml);
}
/**
 * TIKA-2242: checks that bold styling is emitted both when it is declared on
 * a text span and when it is declared on the paragraph itself.
 */
@Test
public void testParagraphLevelFontStyles() throws Exception {
    ParseContext nonRecursing = getNonRecursingParseContext();
    String xhtml = getXML("testODTStyles2.odt", nonRecursing).xml;

    // font-style properties set on a text span
    assertContains("<p><b>name</b>, advocaat", xhtml);

    // font-style properties set on the paragraph
    assertContains("<p><b>Publicatie Onbekwaamverklaring", xhtml);
}
/**
 * TIKA-2242: checks the markup emitted for an annotation nested inside a
 * paragraph in testODTStyles3.odt. A paragraph inside an annotation inside a
 * paragraph is not allowed in html:
 * {@code <p> <annotation> <p> this is an annotation </p> </annotation> </p>}
 */
@Test
public void testAnnotationsAndPDepthGt1() throws Exception {
    String expectedMarkup = "<p><b>WOUTERS Rolf</b><span class=\"annotation\"> "
            + "Beschermde persoon is overleden </annotation>";
    String xhtml = getXML("testODTStyles3.odt").xml;
    assertContains(expectedMarkup, xhtml);
}
/**
 * Checks that recursive parsing of testODTEmbedded.odt yields three
 * metadata objects.
 */
@Test
public void testEmbedded() throws Exception {
    List<Metadata> allMetadata = getRecursiveMetadata("testODTEmbedded.odt");
    int documentCount = allMetadata.size();
    assertEquals(3, documentCount);
}
/**
 * Checks that the XHTML output for testODTEmbeddedImageLink.odt contains an
 * anchor wrapping both the embedded image reference and its link text.
 */
@Test
public void testEmbeddedImageAndLink() throws Exception {
    String expectedAnchor = "<a href=\"https://tika.apache.org/\">"
            + "<img src=\"embedded:Pictures/10000201000001240000006457F5B1D1243E0671.png\" />"
            + "<span>Visit Tika</span></a>";
    String xhtml = getXML("testODTEmbeddedImageLink.odt").xml;
    assertContains(expectedAnchor, xhtml);
}
/**
 * Feeds testODTnotaZipFile.odt to the parser as a plain stream opened from
 * the resource URL; the test expects an {@link IOException}.
 */
@Test(expected = IOException.class)
public void testInvalidFromStream() throws Exception {
    try (InputStream rawStream =
                 getResourceAsUrl("/test-documents/testODTnotaZipFile.odt").openStream()) {
        OpenDocumentParser odfParser = new OpenDocumentParser();
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        odfParser.parse(rawStream, bodyHandler, meta, context);
    }
}
/**
 * Feeds testODTnotaZipFile.odt to the parser through a {@link TikaInputStream}
 * created from the resource URL; the test expects an {@link IOException}.
 */
@Test(expected = IOException.class)
public void testInvalidFromFile() throws Exception {
    try (TikaInputStream stream = TikaInputStream.get(
            getResourceAsUrl("/test-documents/testODTnotaZipFile.odt"))) {
        OpenDocumentParser odfParser = new OpenDocumentParser();
        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        odfParser.parse(stream, bodyHandler, meta, context);
    }
}
/**
 * Recursively parses the encrypted document testODTEncrypted.odt from a file
 * path; the test expects an {@link EncryptedDocumentException}.
 */
@Test(expected = EncryptedDocumentException.class)
public void testEncryptedODTFile() throws Exception {
    // The password for this document is "tika".
    String resourceName = "/test-documents/testODTEncrypted.odt";
    Path encryptedFile = Paths.get(ODFParserTest.class.getResource(resourceName).toURI());
    getRecursiveMetadata(encryptedFile, false);
}
/**
 * Recursively parses the encrypted document testODTEncrypted.odt from a
 * stream; the test expects a {@link TikaException}.
 *
 * Ideally this would be an EncryptedDocumentException, but the file can't be
 * read by Java's ZipInputStream or by commons compress unless descriptors
 * are enabled. See https://issues.apache.org/jira/browse/ODFTOOLKIT-402
 */
@Test(expected = TikaException.class)
public void testEncryptedODTStream() throws Exception {
    String resourceName = "/test-documents/testODTEncrypted.odt";
    try (InputStream stream = ODFParserTest.class.getResourceAsStream(resourceName)) {
        getRecursiveMetadata(stream, false);
    }
}
/**
 * Builds a fresh {@link ParseContext} in which an {@link EmptyParser} is
 * registered under {@code Parser.class}.
 *
 * @return the newly created context
 */
private ParseContext getNonRecursingParseContext() {
    Parser embeddedParser = new EmptyParser();
    ParseContext context = new ParseContext();
    context.set(Parser.class, embeddedParser);
    return context;
}
/**
 * Submits ten tasks to a ten-thread pool; each task recursively parses
 * testODTEmbedded.odt ten times and checks the number of metadata objects and
 * the embedded resource type of the second one. The test then waits for all
 * tasks, so a failure inside any task surfaces through {@code Future.get()}.
 */
@Test
public void testMultiThreaded() throws Exception {
    final int workerCount = 10;
    final int parsesPerWorker = 10;
    ExecutorService pool = Executors.newFixedThreadPool(workerCount);
    ExecutorCompletionService<Integer> completions = new ExecutorCompletionService<>(pool);

    int submitted = 0;
    while (submitted < workerCount) {
        completions.submit(() -> {
            int round = 0;
            while (round < parsesPerWorker) {
                List<Metadata> allMetadata = getRecursiveMetadata("testODTEmbedded.odt");
                assertEquals(3, allMetadata.size());
                Metadata firstEmbedded = allMetadata.get(1);
                assertEquals("THUMBNAIL",
                        firstEmbedded.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
                round++;
            }
            return 1;
        });
        submitted++;
    }

    try {
        for (int completed = 0; completed < workerCount; completed++) {
            Future<Integer> result = completions.take();
            result.get();
        }
    } finally {
        pool.shutdownNow();
    }
}
}