// blob: db09efe6277983cd1f0b6a81fc463d7fd4be5d1e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml.xwpf.ml2006;
import static org.junit.Assert.assertEquals;
import java.util.List;
import org.junit.AfterClass;
import org.junit.Test;
import org.apache.tika.MultiThreadedTikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.utils.XMLReaderUtils;
public class Word2006MLParserTest extends MultiThreadedTikaTest {
@AfterClass
public static void tearDown() throws TikaException {
XMLReaderUtils.setPoolSize(XMLReaderUtils.DEFAULT_POOL_SIZE);
}
@Test
public void basicTest() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("testWORD_2006ml.xml");
assertEquals(9, metadataList.size());
Metadata m = metadataList.get(0);
assertEquals("2016-11-29T17:54:00Z", m.get(TikaCoreProperties.CREATED));
assertEquals("2016-11-29T17:54:00Z", m.get(TikaCoreProperties.MODIFIED));
assertEquals("My Document Title", m.get(TikaCoreProperties.TITLE));
assertEquals("This is the Author", m.get(TikaCoreProperties.CREATOR));
assertEquals("2", m.get(OfficeOpenXMLCore.REVISION));
assertEquals("Allison, Timothy B.", m.get(TikaCoreProperties.MODIFIER));
assertEquals("0", m.get(OfficeOpenXMLExtended.DOC_SECURITY));
assertEquals("260", m.get(Office.WORD_COUNT));
assertEquals("3", m.get(Office.PARAGRAPH_COUNT));
assertEquals("1742", m.get(Office.CHARACTER_COUNT_WITH_SPACES));
assertEquals("12", m.get(Office.LINE_COUNT));
assertEquals("16.0000", m.get(OfficeOpenXMLExtended.APP_VERSION));
String content = m.get(TikaCoreProperties.TIKA_CONTENT);
assertContainsCount("This is the Author", content, 1);
assertContainsCount("This is an engaging title page", content, 1);
assertContains("My Document Title", content);
assertContains("My Document Subtitle", content);
assertContains("<p>\t<a href=\"#_Toc467647605\">Heading1\t3</a></p>", content);
//TODO: integrate numbering
assertContains("Really basic 2.", content);
assertContainsCount("This is a text box", content, 1);
// assertContains("<p>This is a hyperlink: <a href=\"http://tika.apache.org\">tika</a></p>", content);
// assertContains("<p>This is a link to a local file: <a href=\"file:///C:\\data\\test.png\">test.png</a></p>", content);
assertContains("<p>This is 10 spaces</p>", content);
//caption
assertContains("<p>\t<a href=\"#_Toc467647797\">Table 1: Table1 Caption\t2</a>", content);
//embedded table
//TODO: figure out how to handle embedded tables in html
assertContains("<td>Embedded table r1c1", content);
//shape
assertContainsCount("<p>This is text within a shape", content, 1);
//sdt rich text
assertContains("<p>Rich text content control", content);
//sdt simple text
assertContains("<p>Simple text content control", content);
//sdt repeating
assertContains("Repeating content", content);
//sdt dropdown
//TODO: get options for dropdown
assertContains("Drop down1", content);
//sdt date
assertContains("<p>11/16/2016</p>", content);
//test that <tab/> works
assertContains("tab\ttab", content);
assertContainsCount("serious word art", content, 1);
assertContainsCount("Wordartr1c1", content, 1);
//glossary document contents
assertContains("Click or tap to enter a date", content);
//basic formatting
assertContains("<p>The <i>quick</i> brown <b>fox </b>j<i>um</i><b><i>ped</i></b> over",
content);
//TODO: add chart parsing
// assertContains("This is the chart", content);
assertContains("This is a comment", content);
assertContains("This is an endnote", content);
assertContains("this is the footnote", content);
assertContains("First page header", content);
assertContains("Even page header", content);
assertContains("Odd page header", content);
assertContains("First page footer", content);
assertContains("Even page footer", content);
assertContains("Odd page footer", content);
//test default ignores deleted
assertNotContained("frog", content);
assertContains("Mattmann", content);
//test default -- do not include moveFrom
assertContainsCount("Second paragraph", content, 1);
//TODO: figure out how to get this
//assertContains("This is the chart title", content);
}
@Test
public void testSkipDeletedAndMoveFrom() throws Exception {
ParseContext pc = new ParseContext();
OfficeParserConfig officeParserConfig = new OfficeParserConfig();
officeParserConfig.setIncludeDeletedContent(true);
officeParserConfig.setIncludeMoveFromContent(true);
pc.set(OfficeParserConfig.class, officeParserConfig);
XMLResult r = getXML("testWORD_2006ml.xml", pc);
assertContains("frog", r.xml);
assertContainsCount("Second paragraph", r.xml, 2);
}
@Test(timeout = 60000)
public void testMultiThreaded() throws Exception {
XMLReaderUtils.setPoolSize(4);
int numThreads = XMLReaderUtils.getPoolSize() * 2;
ParseContext[] contexts = new ParseContext[numThreads];
for (int i = 0; i < contexts.length; i++) {
contexts[i] = new ParseContext();
}
testMultiThreaded(AUTO_DETECT_PARSER, contexts, numThreads, 2,
pathname -> pathname.getName().equals("testWORD_2006ml.xml"));
}
}