trunk/src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java - poi - Git at Google

 /* ====================================================================
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
    this work for additional information regarding copyright ownership.
    The ASF licenses this file to You under the Apache License, Version 2.0
    (the "License"); you may not use this file except in compliance with
    the License.  You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
 ==================================================================== */

 package org.apache.poi.hwpf.extractor;

 import static org.apache.poi.POITestCase.assertContains;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;

 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;

 import org.apache.poi.POIDataSamples;
 import org.apache.poi.POITextExtractor;
 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.hwpf.HWPFTestDataSamples;
 import org.apache.poi.hwpf.OldWordFileFormatException;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.util.StringUtil;
 import org.junit.Test;

 /**
  * Test the different routes to extracting text
  */
 public final class TestWordExtractor {

     private static POIDataSamples docTests = POIDataSamples.getDocumentInstance();

     public static void assertEqualsTrim( String expected, String actual )
     {
         String newExpected = expected.replaceAll( "\r\n", "\n" )
                 .replaceAll( "\r", "\n" ).trim();
         String newActual = actual.replaceAll( "\r\n", "\n" )
                 .replaceAll( "\r", "\n" ).trim();
         assertEquals( newExpected, newActual );
     }

     private static void assertExtractedContains(String[] extracted, String needle) {
         String endnote = StringUtil.join(extracted, "");
         assertContains(endnote, needle);
     }

 	private final String[] p_text1 = new String[] {
 			"This is a simple word document\r\n",
 			"\r\n",
 			"It has a number of paragraphs in it\r\n",
 			"\r\n",
 			"Some of them even feature bold, italic and underlined text\r\n",
 			"\r\n",
 			"\r\n",
 			"This bit is in a different font and size\r\n",
 			"\r\n",
 			"\r\n",
 			"This bit features some red text.\r\n",
 			"\r\n",
 			"\r\n",
 			"It is otherwise very very boring.\r\n"
 	};

     // Build splat'd out text version
 	private final String p_text1_block = StringUtil.join(p_text1, "");

 	/**
 	 * Test paragraph based extraction
 	 */
 	@Test
 	public void testExtractFromParagraphs() throws IOException {
         WordExtractor extractor = openExtractor("test2.doc");
 		String[] text = extractor.getParagraphText();

 		assertEquals(p_text1.length, text.length);
 		for (int i = 0; i < p_text1.length; i++) {
 			assertEquals(p_text1[i], text[i]);
 		}
         extractor.close();

 		// Lots of paragraphs with only a few lines in them
         WordExtractor extractor2 = openExtractor("test.doc");
 		assertEquals(24, extractor2.getParagraphText().length);
 		assertEquals("as d\r\n", extractor2.getParagraphText()[16]);
 		assertEquals("as d\r\n", extractor2.getParagraphText()[17]);
 		assertEquals("as d\r\n", extractor2.getParagraphText()[18]);
 		extractor2.close();
 	}

 	/**
 	 * Test the paragraph -> flat extraction
 	 */
     @Test
 	public void testGetText() throws IOException {
         WordExtractor extractor = openExtractor("test2.doc");
         assertEqualsTrim(p_text1_block, extractor.getText());

         // For the 2nd, should give similar answers for
         // the two methods, differing only in line endings

         // nope, they must have different results, because of garbage
         // assertEquals(
         // extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
         // extractor2.getText().replaceAll("[\\r\\n]", ""));
 		extractor.close();
     }

 	/**
 	 * Test textPieces based extraction
 	 */
     @Test
 	public void testExtractFromTextPieces() throws IOException {
         WordExtractor extractor = openExtractor("test2.doc");
 		String text = extractor.getTextFromPieces();
 		assertEquals(p_text1_block, text);
 		extractor.close();
 	}


 	/**
 	 * Test that we can get data from two different embedded word documents
 	 */
     @Test
 	public void testExtractFromEmbeded() throws IOException {
 	    InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
 		POIFSFileSystem fs = new POIFSFileSystem(is);
 		is.close();

 		DirectoryNode dirA = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B7");
 		DirectoryNode dirB = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B2");

 		// Should have WordDocument and 1Table
 		assertNotNull(dirA.getEntry("1Table"));
 		assertNotNull(dirA.getEntry("WordDocument"));

 		assertNotNull(dirB.getEntry("1Table"));
 		assertNotNull(dirB.getEntry("WordDocument"));

 		// Check each in turn
         HWPFDocument docA = new HWPFDocument(dirA);
 		WordExtractor extractorA = new WordExtractor(docA);

 		assertNotNull(extractorA.getText());
 		assertTrue(extractorA.getText().length() > 20);
 		assertEqualsTrim("I am a sample document\r\nNot much on me\r\nI am document 1\r\n", extractorA.getText());
 		assertEquals("Sample Doc 1", extractorA.getSummaryInformation().getTitle());
 		assertEquals("Sample Test", extractorA.getSummaryInformation().getSubject());

 		HWPFDocument docB = new HWPFDocument(dirB);
 		WordExtractor extractorB = new WordExtractor(docB);

 		assertNotNull(extractorB.getText());
 		assertTrue(extractorB.getText().length() > 20);
 		assertEqualsTrim("I am another sample document\r\nNot much on me\r\nI am document 2\r\n", extractorB.getText());
 		assertEquals("Sample Doc 2", extractorB.getSummaryInformation().getTitle());
 		assertEquals("Another Sample Test", extractorB.getSummaryInformation().getSubject());

 		extractorA.close();
 		docA.close();

 		extractorB.close();
 		docB.close();

 		fs.close();
 	}

     @Test
 	public void testWithHeader() throws IOException {
 		// Non-unicode
 		HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("ThreeColHeadFoot.doc");
 		WordExtractor extractor1 = new WordExtractor(doc1);

 		assertEquals("First header column!\tMid header Right header!\n", extractor1.getHeaderText());
 		assertContains(extractor1.getText(), "First header column!");
 		extractor1.close();
 		doc1.close();

 		// Unicode
 		HWPFDocument doc2 = HWPFTestDataSamples.openSampleFile("HeaderFooterUnicode.doc");
 		WordExtractor extractor2 = new WordExtractor(doc2);

 		assertEquals("This is a simple header, with a \u20ac euro symbol in it.\n\n", extractor2.getHeaderText());
 		assertContains(extractor2.getText(), "This is a simple header");
 		extractor2.close();
 		doc2.close();
 	}

     @Test
 	public void testWithFooter() throws IOException {
 		// Non-unicode
 		HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("ThreeColHeadFoot.doc");
 		WordExtractor extractor1 = new WordExtractor(doc1);

 		assertEquals("Footer Left\tFooter Middle Footer Right\n", extractor1.getFooterText());
 		assertContains(extractor1.getText(), "Footer Left");
         extractor1.close();
         doc1.close();

 		// Unicode
 		HWPFDocument doc2 = HWPFTestDataSamples.openSampleFile("HeaderFooterUnicode.doc");
 		WordExtractor extractor2 = new WordExtractor(doc2);

 		assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\n", extractor2.getFooterText());
 		assertContains(extractor2.getText(), "The footer, with");
         extractor2.close();
         doc2.close();
 	}

     @Test
 	public void testFootnote() throws IOException {
 		HWPFDocument doc = HWPFTestDataSamples.openSampleFile("footnote.doc");
 		WordExtractor extractor = new WordExtractor(doc);

 		assertExtractedContains(extractor.getFootnoteText(), "TestFootnote");
 		assertEquals(0x00, doc.getRange().getSection(0).getFootnoteNumberingFormat()); // msonfcArabic
 		assertEquals(0x00, doc.getRange().getSection(0).getFootnoteRestartQualifier()); // rncCont
 		assertEquals(0, doc.getRange().getSection(0).getFootnoteNumberingOffset());
 		assertEquals(1, doc.getFootnotes().getNotesCount());
 		extractor.close();
 		doc.close();
 	}

     @Test
 	public void testEndnote() throws IOException {
 		HWPFDocument doc = HWPFTestDataSamples.openSampleFile("footnote.doc");
 		WordExtractor extractor = new WordExtractor(doc);

 		assertExtractedContains(extractor.getEndnoteText(), "TestEndnote");
 		assertEquals(0x02, doc.getRange().getSection(0).getEndnoteNumberingFormat()); // msonfcLCRoman
 		assertEquals(0x00, doc.getRange().getSection(0).getEndnoteRestartQualifier()); // rncCont
 		assertEquals(0, doc.getRange().getSection(0).getEndnoteNumberingOffset());
 		assertEquals(1, doc.getEndnotes().getNotesCount());
 		extractor.close();
 		doc.close();
 	}

     @Test
 	public void testComments() throws IOException {
 		WordExtractor extractor = openExtractor("footnote.doc");
 		assertExtractedContains(extractor.getCommentsText(), "TestComment");
 		extractor.close();
 	}

     @Test(expected=OldWordFileFormatException.class)
     public void testWord95_WordExtractor() throws Exception {
         // Too old for the default
         openExtractor("Word95.doc").close();
     }

     @Test
 	public void testWord95() throws Exception {
 		// Can work with the special one
         InputStream is = docTests.openResourceAsStream("Word95.doc");
 	    Word6Extractor w6e = new Word6Extractor(is);
 	    is.close();

 		String text = w6e.getText();

 		assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
         assertTrue(text.contains("Paragraph 2"));
         assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
         assertTrue(text.contains("Last (4th) paragraph"));

         String[] tp = w6e.getParagraphText();
         assertEquals(7, tp.length);
         assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
         assertEquals("\r\n", tp[1]);
         assertEquals("Paragraph 2\r\n", tp[2]);
         assertEquals("\r\n", tp[3]);
         assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]);
         assertEquals("\r\n", tp[5]);
         assertEquals("Last (4th) paragraph.\r\n", tp[6]);
         w6e.close();
 	}

     @Test(expected=OldWordFileFormatException.class)
     public void testWord6_WordExtractor() throws IOException {
         // Too old for the default
         openExtractor("Word6.doc").close();
     }

     @Test
 	public void testWord6() throws Exception {
         InputStream is = docTests.openResourceAsStream("Word6.doc");
         Word6Extractor w6e = new Word6Extractor(is);
         is.close();
         String text = w6e.getText();

         assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));

         String[] tp = w6e.getParagraphText();
         assertEquals(1, tp.length);
         assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
         w6e.close();
 	}

     @Test
     public void testFastSaved() throws Exception {
         WordExtractor extractor = openExtractor("rasp.doc");

         String text = extractor.getText();
         assertTrue(text.contains("\u0425\u0425\u0425\u0425\u0425"));
         assertTrue(text.contains("\u0423\u0423\u0423\u0423\u0423"));

         extractor.close();
     }

     @Test
     public void testFirstParagraphFix() throws Exception {
         WordExtractor extractor = openExtractor("Bug48075.doc");

         String text = extractor.getText();

         assertTrue(text.startsWith("\u041f\u0440\u0438\u043b\u043e\u0436\u0435\u043d\u0438\u0435"));
         extractor.close();
     }

     /**
      * Tests that we can work with both {@link POIFSFileSystem}
      *  and {@link NPOIFSFileSystem}
      */
     @Test
     public void testDifferentPOIFS() throws Exception {
        // Open the two filesystems
        File file = docTests.getFile("test2.doc");
        InputStream is = new FileInputStream(file);
        OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is);
        is.close();
        NPOIFSFileSystem npoifs = new NPOIFSFileSystem(file);

        DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };

        // Open directly
        for(DirectoryNode dir : files) {
           @SuppressWarnings("resource")
           WordExtractor extractor = new WordExtractor(dir);
           assertEqualsTrim(p_text1_block, extractor.getText());
           // extractor.close();
        }

        // Open via a HWPFDocument
        for(DirectoryNode dir : files) {
           HWPFDocument doc = new HWPFDocument(dir);
           WordExtractor extractor = new WordExtractor(doc);
           assertEqualsTrim(p_text1_block, extractor.getText());
           extractor.close();
        }

        npoifs.close();
     }

     /**
      * [RESOLVED FIXED] Bug 51686 - Update to POI 3.8 beta 4 causes
      * ConcurrentModificationException in Tika's OfficeParser
      */
     @Test
     public void testBug51686() throws IOException {
         InputStream is = docTests.openResourceAsStream( "Bug51686.doc" );
         POIFSFileSystem fs = new POIFSFileSystem(is);
         is.close();

         String text = null;

         for (Entry entry : fs.getRoot()) {
             if ("WordDocument".equals(entry.getName())) {
                 WordExtractor ex = new WordExtractor(fs);
                 try {
                     text = ex.getText();
                 } finally {
                     ex.close();
                 }
             }
         }

         assertNotNull(text);
         fs.close();
     }

     @Test
     public void testExtractorFromWord6Extractor() throws Exception {
         InputStream is = POIDataSamples.getHPSFInstance().openResourceAsStream("TestMickey.doc");
         POIFSFileSystem fs = new POIFSFileSystem(is);
         is.close();
         Word6Extractor wExt = new Word6Extractor(fs);
         try {
             POITextExtractor ext = wExt.getMetadataTextExtractor();
             try {
                 // Now overall
                 String text = ext.getText();
                 assertContains(text, "TEMPLATE = Normal");
                 assertContains(text, "SUBJECT = sample subject");
                 assertContains(text, "MANAGER = sample manager");
                 assertContains(text, "COMPANY = sample company");
             } finally {
                 ext.close();
             }
         } finally {
             wExt.close();
             fs.close();
         }
     }

     private WordExtractor openExtractor(String fileName) throws IOException {
         InputStream is = docTests.openResourceAsStream(fileName);
         try {
             return new WordExtractor(is);
         } finally {
             is.close();
         }

     }
 }
	/* ====================================================================
	Licensed to the Apache Software Foundation (ASF) under one or more
	contributor license agreements. See the NOTICE file distributed with
	this work for additional information regarding copyright ownership.
	The ASF licenses this file to You under the Apache License, Version 2.0
	(the "License"); you may not use this file except in compliance with
	the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==================================================================== */

	package org.apache.poi.hwpf.extractor;

	import static org.apache.poi.POITestCase.assertContains;
	import static org.junit.Assert.assertEquals;
	import static org.junit.Assert.assertNotNull;
	import static org.junit.Assert.assertTrue;

	import java.io.File;
	import java.io.FileInputStream;
	import java.io.IOException;
	import java.io.InputStream;

	import org.apache.poi.POIDataSamples;
	import org.apache.poi.POITextExtractor;
	import org.apache.poi.hwpf.HWPFDocument;
	import org.apache.poi.hwpf.HWPFTestDataSamples;
	import org.apache.poi.hwpf.OldWordFileFormatException;
	import org.apache.poi.poifs.filesystem.DirectoryNode;
	import org.apache.poi.poifs.filesystem.Entry;
	import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
	import org.apache.poi.poifs.filesystem.OPOIFSFileSystem;
	import org.apache.poi.poifs.filesystem.POIFSFileSystem;
	import org.apache.poi.util.StringUtil;
	import org.junit.Test;

	/**
	* Test the different routes to extracting text
	*/
	public final class TestWordExtractor {

	private static POIDataSamples docTests = POIDataSamples.getDocumentInstance();

	public static void assertEqualsTrim( String expected, String actual )
	{
	String newExpected = expected.replaceAll( "\r\n", "\n" )
	.replaceAll( "\r", "\n" ).trim();
	String newActual = actual.replaceAll( "\r\n", "\n" )
	.replaceAll( "\r", "\n" ).trim();
	assertEquals( newExpected, newActual );
	}

	private static void assertExtractedContains(String[] extracted, String needle) {
	String endnote = StringUtil.join(extracted, "");
	assertContains(endnote, needle);
	}

	private final String[] p_text1 = new String[] {
	"This is a simple word document\r\n",
	"\r\n",
	"It has a number of paragraphs in it\r\n",
	"\r\n",
	"Some of them even feature bold, italic and underlined text\r\n",
	"\r\n",
	"\r\n",
	"This bit is in a different font and size\r\n",
	"\r\n",
	"\r\n",
	"This bit features some red text.\r\n",
	"\r\n",
	"\r\n",
	"It is otherwise very very boring.\r\n"
	};

	// Build splat'd out text version
	private final String p_text1_block = StringUtil.join(p_text1, "");

	/**
	* Test paragraph based extraction
	*/
	@Test
	public void testExtractFromParagraphs() throws IOException {
	WordExtractor extractor = openExtractor("test2.doc");
	String[] text = extractor.getParagraphText();

	assertEquals(p_text1.length, text.length);
	for (int i = 0; i < p_text1.length; i++) {
	assertEquals(p_text1[i], text[i]);
	}
	extractor.close();

	// Lots of paragraphs with only a few lines in them
	WordExtractor extractor2 = openExtractor("test.doc");
	assertEquals(24, extractor2.getParagraphText().length);
	assertEquals("as d\r\n", extractor2.getParagraphText()[16]);
	assertEquals("as d\r\n", extractor2.getParagraphText()[17]);
	assertEquals("as d\r\n", extractor2.getParagraphText()[18]);
	extractor2.close();
	}

	/**
	* Test the paragraph -> flat extraction
	*/
	@Test
	public void testGetText() throws IOException {
	WordExtractor extractor = openExtractor("test2.doc");
	assertEqualsTrim(p_text1_block, extractor.getText());

	// For the 2nd, should give similar answers for
	// the two methods, differing only in line endings

	// nope, they must have different results, because of garbage
	// assertEquals(
	// extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
	// extractor2.getText().replaceAll("[\\r\\n]", ""));
	extractor.close();
	}

	/**
	* Test textPieces based extraction
	*/
	@Test
	public void testExtractFromTextPieces() throws IOException {
	WordExtractor extractor = openExtractor("test2.doc");
	String text = extractor.getTextFromPieces();
	assertEquals(p_text1_block, text);
	extractor.close();
	}


	/**
	* Test that we can get data from two different embedded word documents
	*/
	@Test
	public void testExtractFromEmbeded() throws IOException {
	InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
	POIFSFileSystem fs = new POIFSFileSystem(is);
	is.close();

	DirectoryNode dirA = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B7");
	DirectoryNode dirB = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B2");

	// Should have WordDocument and 1Table
	assertNotNull(dirA.getEntry("1Table"));
	assertNotNull(dirA.getEntry("WordDocument"));

	assertNotNull(dirB.getEntry("1Table"));
	assertNotNull(dirB.getEntry("WordDocument"));

	// Check each in turn
	HWPFDocument docA = new HWPFDocument(dirA);
	WordExtractor extractorA = new WordExtractor(docA);

	assertNotNull(extractorA.getText());
	assertTrue(extractorA.getText().length() > 20);
	assertEqualsTrim("I am a sample document\r\nNot much on me\r\nI am document 1\r\n", extractorA.getText());
	assertEquals("Sample Doc 1", extractorA.getSummaryInformation().getTitle());
	assertEquals("Sample Test", extractorA.getSummaryInformation().getSubject());

	HWPFDocument docB = new HWPFDocument(dirB);
	WordExtractor extractorB = new WordExtractor(docB);

	assertNotNull(extractorB.getText());
	assertTrue(extractorB.getText().length() > 20);
	assertEqualsTrim("I am another sample document\r\nNot much on me\r\nI am document 2\r\n", extractorB.getText());
	assertEquals("Sample Doc 2", extractorB.getSummaryInformation().getTitle());
	assertEquals("Another Sample Test", extractorB.getSummaryInformation().getSubject());

	extractorA.close();
	docA.close();

	extractorB.close();
	docB.close();

	fs.close();
	}

	@Test
	public void testWithHeader() throws IOException {
	// Non-unicode
	HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("ThreeColHeadFoot.doc");
	WordExtractor extractor1 = new WordExtractor(doc1);

	assertEquals("First header column!\tMid header Right header!\n", extractor1.getHeaderText());
	assertContains(extractor1.getText(), "First header column!");
	extractor1.close();
	doc1.close();

	// Unicode
	HWPFDocument doc2 = HWPFTestDataSamples.openSampleFile("HeaderFooterUnicode.doc");
	WordExtractor extractor2 = new WordExtractor(doc2);

	assertEquals("This is a simple header, with a \u20ac euro symbol in it.\n\n", extractor2.getHeaderText());
	assertContains(extractor2.getText(), "This is a simple header");
	extractor2.close();
	doc2.close();
	}

	@Test
	public void testWithFooter() throws IOException {
	// Non-unicode
	HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("ThreeColHeadFoot.doc");
	WordExtractor extractor1 = new WordExtractor(doc1);

	assertEquals("Footer Left\tFooter Middle Footer Right\n", extractor1.getFooterText());
	assertContains(extractor1.getText(), "Footer Left");
	extractor1.close();
	doc1.close();

	// Unicode
	HWPFDocument doc2 = HWPFTestDataSamples.openSampleFile("HeaderFooterUnicode.doc");
	WordExtractor extractor2 = new WordExtractor(doc2);

	assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\n", extractor2.getFooterText());
	assertContains(extractor2.getText(), "The footer, with");
	extractor2.close();
	doc2.close();
	}

	@Test
	public void testFootnote() throws IOException {
	HWPFDocument doc = HWPFTestDataSamples.openSampleFile("footnote.doc");
	WordExtractor extractor = new WordExtractor(doc);

	assertExtractedContains(extractor.getFootnoteText(), "TestFootnote");
	assertEquals(0x00, doc.getRange().getSection(0).getFootnoteNumberingFormat()); // msonfcArabic
	assertEquals(0x00, doc.getRange().getSection(0).getFootnoteRestartQualifier()); // rncCont
	assertEquals(0, doc.getRange().getSection(0).getFootnoteNumberingOffset());
	assertEquals(1, doc.getFootnotes().getNotesCount());
	extractor.close();
	doc.close();
	}

	@Test
	public void testEndnote() throws IOException {
	HWPFDocument doc = HWPFTestDataSamples.openSampleFile("footnote.doc");
	WordExtractor extractor = new WordExtractor(doc);

	assertExtractedContains(extractor.getEndnoteText(), "TestEndnote");
	assertEquals(0x02, doc.getRange().getSection(0).getEndnoteNumberingFormat()); // msonfcLCRoman
	assertEquals(0x00, doc.getRange().getSection(0).getEndnoteRestartQualifier()); // rncCont
	assertEquals(0, doc.getRange().getSection(0).getEndnoteNumberingOffset());
	assertEquals(1, doc.getEndnotes().getNotesCount());
	extractor.close();
	doc.close();
	}

	@Test
	public void testComments() throws IOException {
	WordExtractor extractor = openExtractor("footnote.doc");
	assertExtractedContains(extractor.getCommentsText(), "TestComment");
	extractor.close();
	}

	@Test(expected=OldWordFileFormatException.class)
	public void testWord95_WordExtractor() throws Exception {
	// Too old for the default
	openExtractor("Word95.doc").close();
	}

	@Test
	public void testWord95() throws Exception {
	// Can work with the special one
	InputStream is = docTests.openResourceAsStream("Word95.doc");
	Word6Extractor w6e = new Word6Extractor(is);
	is.close();

	String text = w6e.getText();

	assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));
	assertTrue(text.contains("Paragraph 2"));
	assertTrue(text.contains("Paragraph 3. Has some RED text and some BLUE BOLD text in it"));
	assertTrue(text.contains("Last (4th) paragraph"));

	String[] tp = w6e.getParagraphText();
	assertEquals(7, tp.length);
	assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
	assertEquals("\r\n", tp[1]);
	assertEquals("Paragraph 2\r\n", tp[2]);
	assertEquals("\r\n", tp[3]);
	assertEquals("Paragraph 3. Has some RED text and some BLUE BOLD text in it.\r\n", tp[4]);
	assertEquals("\r\n", tp[5]);
	assertEquals("Last (4th) paragraph.\r\n", tp[6]);
	w6e.close();
	}

	@Test(expected=OldWordFileFormatException.class)
	public void testWord6_WordExtractor() throws IOException {
	// Too old for the default
	openExtractor("Word6.doc").close();
	}

	@Test
	public void testWord6() throws Exception {
	InputStream is = docTests.openResourceAsStream("Word6.doc");
	Word6Extractor w6e = new Word6Extractor(is);
	is.close();
	String text = w6e.getText();

	assertTrue(text.contains("The quick brown fox jumps over the lazy dog"));

	String[] tp = w6e.getParagraphText();
	assertEquals(1, tp.length);
	assertEquals("The quick brown fox jumps over the lazy dog\r\n", tp[0]);
	w6e.close();
	}

	@Test
	public void testFastSaved() throws Exception {
	WordExtractor extractor = openExtractor("rasp.doc");

	String text = extractor.getText();
	assertTrue(text.contains("\u0425\u0425\u0425\u0425\u0425"));
	assertTrue(text.contains("\u0423\u0423\u0423\u0423\u0423"));

	extractor.close();
	}

	@Test
	public void testFirstParagraphFix() throws Exception {
	WordExtractor extractor = openExtractor("Bug48075.doc");

	String text = extractor.getText();

	assertTrue(text.startsWith("\u041f\u0440\u0438\u043b\u043e\u0436\u0435\u043d\u0438\u0435"));
	extractor.close();
	}

	/**
	* Tests that we can work with both {@link POIFSFileSystem}
	* and {@link NPOIFSFileSystem}
	*/
	@Test
	public void testDifferentPOIFS() throws Exception {
	// Open the two filesystems
	File file = docTests.getFile("test2.doc");
	InputStream is = new FileInputStream(file);
	OPOIFSFileSystem opoifs = new OPOIFSFileSystem(is);
	is.close();
	NPOIFSFileSystem npoifs = new NPOIFSFileSystem(file);

	DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };

	// Open directly
	for(DirectoryNode dir : files) {
	@SuppressWarnings("resource")
	WordExtractor extractor = new WordExtractor(dir);
	assertEqualsTrim(p_text1_block, extractor.getText());
	// extractor.close();
	}

	// Open via a HWPFDocument
	for(DirectoryNode dir : files) {
	HWPFDocument doc = new HWPFDocument(dir);
	WordExtractor extractor = new WordExtractor(doc);
	assertEqualsTrim(p_text1_block, extractor.getText());
	extractor.close();
	}

	npoifs.close();
	}

	/**
	* [RESOLVED FIXED] Bug 51686 - Update to POI 3.8 beta 4 causes
	* ConcurrentModificationException in Tika's OfficeParser
	*/
	@Test
	public void testBug51686() throws IOException {
	InputStream is = docTests.openResourceAsStream( "Bug51686.doc" );
	POIFSFileSystem fs = new POIFSFileSystem(is);
	is.close();

	String text = null;

	for (Entry entry : fs.getRoot()) {
	if ("WordDocument".equals(entry.getName())) {
	WordExtractor ex = new WordExtractor(fs);
	try {
	text = ex.getText();
	} finally {
	ex.close();
	}
	}
	}

	assertNotNull(text);
	fs.close();
	}

	@Test
	public void testExtractorFromWord6Extractor() throws Exception {
	InputStream is = POIDataSamples.getHPSFInstance().openResourceAsStream("TestMickey.doc");
	POIFSFileSystem fs = new POIFSFileSystem(is);
	is.close();
	Word6Extractor wExt = new Word6Extractor(fs);
	try {
	POITextExtractor ext = wExt.getMetadataTextExtractor();
	try {
	// Now overall
	String text = ext.getText();
	assertContains(text, "TEMPLATE = Normal");
	assertContains(text, "SUBJECT = sample subject");
	assertContains(text, "MANAGER = sample manager");
	assertContains(text, "COMPANY = sample company");
	} finally {
	ext.close();
	}
	} finally {
	wExt.close();
	fs.close();
	}
	}

	private WordExtractor openExtractor(String fileName) throws IOException {
	InputStream is = docTests.openResourceAsStream(fileName);
	try {
	return new WordExtractor(is);
	} finally {
	is.close();
	}

	}
	}