/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */

package org.apache.poi.hwpf.extractor;

import static org.apache.poi.POITestCase.assertContains;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.poi.POIDataSamples;
import org.apache.poi.POITextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.HWPFTestDataSamples;
import org.apache.poi.hwpf.OldWordFileFormatException;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.StringUtil;
import org.junit.Test;

| /** |
| * Test the different routes to extracting text |
| */ |
| public final class TestWordExtractor { |
| |
| private static POIDataSamples docTests = POIDataSamples.getDocumentInstance(); |
| |
| public static void assertEqualsTrim( String expected, String actual ) |
| { |
| String newExpected = expected.replaceAll( "\r\n", "\n" ) |
| .replaceAll( "\r", "\n" ).trim(); |
| String newActual = actual.replaceAll( "\r\n", "\n" ) |
| .replaceAll( "\r", "\n" ).trim(); |
| assertEquals( newExpected, newActual ); |
| } |
| |
| private static void assertExtractedContains(String[] extracted, String needle) { |
| String endnote = StringUtil.join(extracted, ""); |
| assertContains(endnote, needle); |
| } |
| |
| private final String[] p_text1 = new String[] { |
| "This is a simple word document\r\n", |
| "\r\n", |
| "It has a number of paragraphs in it\r\n", |
| "\r\n", |
| "Some of them even feature bold, italic and underlined text\r\n", |
| "\r\n", |
| "\r\n", |
| "This bit is in a different font and size\r\n", |
| "\r\n", |
| "\r\n", |
| "This bit features some red text.\r\n", |
| "\r\n", |
| "\r\n", |
| "It is otherwise very very boring.\r\n" |
| }; |
| |
| // Build splat'd out text version |
| private final String p_text1_block = StringUtil.join(p_text1, ""); |
| |
| /** |
| * Test paragraph based extraction |
| */ |
| @Test |
| public void testExtractFromParagraphs() throws IOException { |
| WordExtractor extractor = openExtractor("test2.doc"); |
| String[] text = extractor.getParagraphText(); |
| |
| assertEquals(p_text1.length, text.length); |
| for (int i = 0; i < p_text1.length; i++) { |
| assertEquals(p_text1[i], text[i]); |
| } |
| extractor.close(); |
| |
| // Lots of paragraphs with only a few lines in them |
| WordExtractor extractor2 = openExtractor("test.doc"); |
| assertEquals(24, extractor2.getParagraphText().length); |
| assertEquals("as d\r\n", extractor2.getParagraphText()[16]); |
| assertEquals("as d\r\n", extractor2.getParagraphText()[17]); |
| assertEquals("as d\r\n", extractor2.getParagraphText()[18]); |
| extractor2.close(); |
| } |
| |
| /** |
| * Test the paragraph -> flat extraction |
| */ |
| @Test |
| public void testGetText() throws IOException { |
| WordExtractor extractor = openExtractor("test2.doc"); |
| assertEqualsTrim(p_text1_block, extractor.getText()); |
| |
| // For the 2nd, should give similar answers for |
| // the two methods, differing only in line endings |
| |
| // nope, they must have different results, because of garbage |
| // assertEquals( |
| // extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""), |
| // extractor2.getText().replaceAll("[\\r\\n]", "")); |
| extractor.close(); |
| } |
| |
| /** |
| * Test textPieces based extraction |
| */ |
| @Test |
| public void testExtractFromTextPieces() throws IOException { |
| WordExtractor extractor = openExtractor("test2.doc"); |
| String text = extractor.getTextFromPieces(); |
| assertEquals(p_text1_block, text); |
| extractor.close(); |
| } |
| |
| |
| /** |
| * Test that we can get data from two different embedded word documents |
| */ |
| @Test |
| public void testExtractFromEmbeded() throws IOException { |
| InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls"); |
| POIFSFileSystem fs = new POIFSFileSystem(is); |
| is.close(); |
| |
| DirectoryNode dirA = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B7"); |
| DirectoryNode dirB = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B2"); |
| |
| // Should have WordDocument and 1Table |
| assertNotNull(dirA.getEntry("1Table")); |
| assertNotNull(dirA.getEntry("WordDocument")); |
| |
| assertNotNull(dirB.getEntry("1Table")); |
| assertNotNull(dirB.getEntry("WordDocument")); |
| |
| // Check each in turn |
| HWPFDocument docA = new HWPFDocument(dirA); |
| WordExtractor extractorA = new WordExtractor(docA); |
| |
| assertNotNull(extractorA.getText()); |
| assertTrue(extractorA.getText().length() > 20); |
| assertEqualsTrim("I am a sample document\r\nNot much on me\r\nI am document 1\r\n", extractorA.getText()); |
| assertEquals("Sample Doc 1", extractorA.getSummaryInformation().getTitle()); |
| assertEquals("Sample Test", extractorA.getSummaryInformation().getSubject()); |
| |
| HWPFDocument docB = new HWPFDocument(dirB); |
| WordExtractor extractorB = new WordExtractor(docB); |
| |
| assertNotNull(extractorB.getText()); |
| assertTrue(extractorB.getText().length() > 20); |
| assertEqualsTrim("I am another sample document\r\nNot much on me\r\nI am document 2\r\n", extractorB.getText()); |
| assertEquals("Sample Doc 2", extractorB.getSummaryInformation().getTitle()); |
| assertEquals("Another Sample Test", extractorB.getSummaryInformation().getSubject()); |
| |
| extractorA.close(); |
| docA.close(); |
| |
| extractorB.close(); |
| docB.close(); |
| |
| fs.close(); |
| } |
| |
| @Test |
| public void testWithHeader() throws IOException { |
| // Non-unicode |
| HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("ThreeColHeadFoot.doc"); |
| WordExtractor extractor1 = new WordExtractor(doc1); |
| |
| assertEquals("First header column!\tMid header Right header!\n", extractor1.getHeaderText()); |
| assertContains(extractor1.getText(), "First header column!"); |
| extractor1.close(); |
| doc1.close(); |
| |
| // Unicode |
| HWPFDocument doc2 = HWPFTestDataSamples.openSampleFile("HeaderFooterUnicode.doc"); |
| WordExtractor extractor2 = new WordExtractor(doc2); |
| |
| assertEquals("This is a simple header, with a \u20ac euro symbol in it.\n\n", extractor2.getHeaderText()); |
| assertContains(extractor2.getText(), "This is a simple header"); |
| extractor2.close(); |
| doc2.close(); |
| } |
| |
| @Test |
| public void testWithFooter() throws IOException { |
| // Non-unicode |
| HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("ThreeColHeadFoot.doc"); |
| WordExtractor extractor1 = new WordExtractor(doc1); |
| |
| assertEquals("Footer Left\tFooter Middle Footer Right\n", extractor1.getFooterText()); |
| assertContains(extractor1.getText(), "Footer Left"); |
| extractor1.close(); |
| doc1.close(); |
| |
| // Unicode |
| HWPFDocument doc2 = HWPFTestDataSamples.openSampleFile("HeaderFooterUnicode.doc"); |
| WordExtractor extractor2 = new WordExtractor(doc2); |
| |
| assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\n", extractor2.getFooterText()); |
| assertContains(extractor2.getText(), "The footer, with"); |
| extractor2.close(); |
| doc2.close(); |
| } |
| |
| @Test |
| public void testFootnote() throws IOException { |
| HWPFDocument doc = HWPFTestDataSamples.openSampleFile("footnote.doc"); |
| WordExtractor extractor = new WordExtractor(doc); |
| |
| assertExtractedContains(extractor.getFootnoteText(), "TestFootnote"); |
| assertEquals(0x00, doc.getRange().getSection(0).getFootnoteNumberingFormat()); // msonfcArabic |
| assertEquals(0x00, doc.getRange().getSection(0).getFootnoteRestartQualifier()); // rncCont |
| assertEquals(0, doc.getRange().getSection(0).getFootnoteNumberingOffset()); |
| assertEquals(1, doc.getFootnotes().getNotesCount()); |
| extractor.close(); |
| doc.close(); |
| } |
| |
| @Test |
| public void testEndnote() throws IOException { |
| HWPFDocument doc = HWPFTestDataSamples.openSampleFile("footnote.doc"); |
| WordExtractor extractor = new WordExtractor(doc); |
| |
| assertExtractedContains(extractor.getEndnoteText(), "TestEndnote"); |
| assertEquals(0x02, doc.getRange().getSection(0).getEndnoteNumberingFormat()); // msonfcLCRoman |
| assertEquals(0x00, doc.getRange().getSection(0).getEndnoteRestartQualifier()); // rncCont |
| assertEquals(0, doc.getRange().getSection(0).getEndnoteNumberingOffset()); |
| assertEquals(1, doc.getEndnotes().getNotesCount()); |
| extractor.close(); |
| doc.close(); |
| } |
| |
| @Test |
| public void testComments() throws IOException { |
| WordExtractor extractor = openExtractor("footnote.doc"); |
| assertExtractedContains(extractor.getCommentsText(), "TestComment"); |
| extractor.close(); |
| } |
| |
| @Test(expected=OldWordFileFormatException.class) |
| public void testWord95_WordExtractor() throws Exception { |
| // Too old for the default |
| openExtractor("Word95.doc").close(); |
| } |
| |
| @Test |
| public void testWord95() throws Exception { |
| // Can work with the special one |
| InputStream is = docTests.openResourceAsStream("Word95.doc"); |
| Word6Extractor w6e = new Word6Extractor(is); |
| is.close(); |
| |
| String text = w6e.getText(); |
| |
| assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); |
| assertTrue(text.contains("Paragraph 2")); |
| assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it")); |
| assertTrue(text.contains("Last (4th) paragraph")); |
| |
| String[] tp = w6e.getParagraphText(); |
| assertEquals(7, tp.length); |
| assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]); |
| assertEquals("\r\n", tp[1]); |
| assertEquals("Paragraph 2\r\n", tp[2]); |
| assertEquals("\r\n", tp[3]); |
| assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]); |
| assertEquals("\r\n", tp[5]); |
| assertEquals("Last (4th) paragraph.\r\n", tp[6]); |
| w6e.close(); |
| } |
| |
| @Test(expected=OldWordFileFormatException.class) |
| public void testWord6_WordExtractor() throws IOException { |
| // Too old for the default |
| openExtractor("Word6.doc").close(); |
| } |
| |
| @Test |
| public void testWord6() throws Exception { |
| InputStream is = docTests.openResourceAsStream("Word6.doc"); |
| Word6Extractor w6e = new Word6Extractor(is); |
| is.close(); |
| String text = w6e.getText(); |
| |
| assertTrue(text.contains("The quick brown fox jumps over the lazy dog")); |
| |
| String[] tp = w6e.getParagraphText(); |
| assertEquals(1, tp.length); |
| assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]); |
| w6e.close(); |
| } |
| |
| @Test |
| public void testFastSaved() throws Exception { |
| WordExtractor extractor = openExtractor("rasp.doc"); |
| |
| String text = extractor.getText(); |
| assertTrue(text.contains("\u0425\u0425\u0425\u0425\u0425")); |
| assertTrue(text.contains("\u0423\u0423\u0423\u0423\u0423")); |
| |
| extractor.close(); |
| } |
| |
| @Test |
| public void testFirstParagraphFix() throws Exception { |
| WordExtractor extractor = openExtractor("Bug48075.doc"); |
| |
| String text = extractor.getText(); |
| |
| assertTrue(text.startsWith("\u041f\u0440\u0438\u043b\u043e\u0436\u0435\u043d\u0438\u0435")); |
| extractor.close(); |
| } |
| |
| /** |
| * Tests that we can work with both {@link POIFSFileSystem} |
| * and {@link NPOIFSFileSystem} |
| */ |
| @Test |
| public void testDifferentPOIFS() throws Exception { |
| // Open the two filesystems |
| File file = docTests.getFile("test2.doc"); |
| InputStream is = new FileInputStream(file); |
| OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is); |
| is.close(); |
| NPOIFSFileSystem npoifs = new NPOIFSFileSystem(file); |
| |
| DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() }; |
| |
| // Open directly |
| for(DirectoryNode dir : files) { |
| @SuppressWarnings("resource") |
| WordExtractor extractor = new WordExtractor(dir); |
| assertEqualsTrim(p_text1_block, extractor.getText()); |
| // extractor.close(); |
| } |
| |
| // Open via a HWPFDocument |
| for(DirectoryNode dir : files) { |
| HWPFDocument doc = new HWPFDocument(dir); |
| WordExtractor extractor = new WordExtractor(doc); |
| assertEqualsTrim(p_text1_block, extractor.getText()); |
| extractor.close(); |
| } |
| |
| npoifs.close(); |
| } |
| |
| /** |
| * [RESOLVED FIXED] Bug 51686 - Update to POI 3.8 beta 4 causes |
| * ConcurrentModificationException in Tika's OfficeParser |
| */ |
| @Test |
| public void testBug51686() throws IOException { |
| InputStream is = docTests.openResourceAsStream( "Bug51686.doc" ); |
| POIFSFileSystem fs = new POIFSFileSystem(is); |
| is.close(); |
| |
| String text = null; |
| |
| for (Entry entry : fs.getRoot()) { |
| if ("WordDocument".equals(entry.getName())) { |
| WordExtractor ex = new WordExtractor(fs); |
| try { |
| text = ex.getText(); |
| } finally { |
| ex.close(); |
| } |
| } |
| } |
| |
| assertNotNull(text); |
| fs.close(); |
| } |
| |
| @Test |
| public void testExtractorFromWord6Extractor() throws Exception { |
| InputStream is = POIDataSamples.getHPSFInstance().openResourceAsStream("TestMickey.doc"); |
| POIFSFileSystem fs = new POIFSFileSystem(is); |
| is.close(); |
| Word6Extractor wExt = new Word6Extractor(fs); |
| try { |
| POITextExtractor ext = wExt.getMetadataTextExtractor(); |
| try { |
| // Now overall |
| String text = ext.getText(); |
| assertContains(text, "TEMPLATE = Normal"); |
| assertContains(text, "SUBJECT = sample subject"); |
| assertContains(text, "MANAGER = sample manager"); |
| assertContains(text, "COMPANY = sample company"); |
| } finally { |
| ext.close(); |
| } |
| } finally { |
| wExt.close(); |
| fs.close(); |
| } |
| } |
| |
| private WordExtractor openExtractor(String fileName) throws IOException { |
| InputStream is = docTests.openResourceAsStream(fileName); |
| try { |
| return new WordExtractor(is); |
| } finally { |
| is.close(); |
| } |
| |
| } |
| } |