src/scratchpad/testcases/org/apache/poi/hwpf/extractor/TestWordExtractor.java - poi - Git at Google

 /*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 package org.apache.poi.hwpf.extractor;

 import java.io.File;
 import java.io.FileInputStream;

 import junit.framework.TestCase;

 import org.apache.poi.hwpf.HWPFDocument;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;

 /**
  * Test the different routes to extracting text
  *
  * @author Nick Burch (nick at torchbox dot com)
  */
 public class TestWordExtractor extends TestCase {
 	private String[] p_text1 = new String[] {
 			"This is a simple word document\r\n",
 			"\r\n",
 			"It has a number of paragraphs in it\r\n",
 			"\r\n",
 			"Some of them even feature bold, italic and underlined text\r\n",
 			"\r\n",
 			"\r\n",
 			"This bit is in a different font and size\r\n",
 			"\r\n",
 			"\r\n",
 			"This bit features some red text.\r\n",
 			"\r\n",
 			"\r\n",
 			"It is otherwise very very boring.\r\n"
 	};
 	private String p_text1_block = new String();

 	// Well behaved document
 	private WordExtractor extractor;
 	// Corrupted document - can't do paragraph based stuff
 	private WordExtractor extractor2;
 	// A word doc embeded in an excel file
 	private String filename3;

 	// With header and footer
 	private String filename4;
 	// With unicode header and footer
 	private String filename5;

     protected void setUp() throws Exception {
 		String dirname = System.getProperty("HWPF.testdata.path");
 		String pdirname = System.getProperty("POIFS.testdata.path");

 		String filename = dirname + "/test2.doc";
 		String filename2 = dirname + "/test.doc";
 		filename3 = pdirname + "/excel_with_embeded.xls";
 		filename4 = dirname + "/ThreeColHeadFoot.doc";
 		filename5 = dirname + "/HeaderFooterUnicode.doc";

 		extractor = new WordExtractor(new FileInputStream(filename));
 		extractor2 = new WordExtractor(new FileInputStream(filename2));

 		// Build splat'd out text version
 		for(int i=0; i<p_text1.length; i++) {
 			p_text1_block += p_text1[i];
 		}
     }

     /**
      * Test paragraph based extraction
      */
     public void testExtractFromParagraphs() {
     	String[] text = extractor.getParagraphText();

     	assertEquals(p_text1.length, text.length);
     	for(int i=0; i<p_text1.length; i++) {
     		assertEquals(p_text1[i], text[i]);
     	}

     	// On second one, should fall back
     	assertEquals(1, extractor2.getParagraphText().length);
     }

     /**
      * Test the paragraph -> flat extraction
      */
     public void testGetText() {
     	assertEquals(p_text1_block, extractor.getText());

     	// On second one, should fall back to text piece
     	assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
     }

     /**
      * Test textPieces based extraction
      */
     public void testExtractFromTextPieces() throws Exception {
     	String text = extractor.getTextFromPieces();
     	assertEquals(p_text1_block, text);
     }


     /**
      * Test that we can get data from two different
      *  embeded word documents
      * @throws Exception
      */
     public void testExtractFromEmbeded() throws Exception {
     	POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(filename3));
     	HWPFDocument doc;
     	WordExtractor extractor3;

     	DirectoryNode dirA = (DirectoryNode)
 			fs.getRoot().getEntry("MBD0000A3B7");
     	DirectoryNode dirB = (DirectoryNode)
     		fs.getRoot().getEntry("MBD0000A3B2");

     	// Should have WordDocument and 1Table
     	assertNotNull(dirA.getEntry("1Table"));
     	assertNotNull(dirA.getEntry("WordDocument"));

     	assertNotNull(dirB.getEntry("1Table"));
     	assertNotNull(dirB.getEntry("WordDocument"));

     	// Check each in turn
     	doc = new HWPFDocument(dirA, fs);
     	extractor3 = new WordExtractor(doc);

     	assertNotNull(extractor3.getText());
     	assertTrue(extractor3.getText().length() > 20);
     	assertEquals("I am a sample document\r\nNot much on me\r\nI am document 1\r\n",
     			extractor3.getText());
     	assertEquals("Sample Doc 1", extractor3.getSummaryInformation().getTitle());
     	assertEquals("Sample Test", extractor3.getSummaryInformation().getSubject());


     	doc = new HWPFDocument(dirB, fs);
     	extractor3 = new WordExtractor(doc);

     	assertNotNull(extractor3.getText());
     	assertTrue(extractor3.getText().length() > 20);
     	assertEquals("I am another sample document\r\nNot much on me\r\nI am document 2\r\n",
     			extractor3.getText());
     	assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle());
     	assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject());
     }

     public void testWithHeader() throws Exception {
     	// Non-unicode
     	HWPFDocument doc = new HWPFDocument(
     			new FileInputStream(filename4)
     	);
     	extractor = new WordExtractor(doc);

     	assertEquals(
     			"First header column!\tMid header Right header!\n",
     			extractor.getHeaderText()
     	);

     	String text = extractor.getText();
     	assertTrue(
     			text.indexOf("First header column!") > -1
     	);


     	// Unicode
     	doc = new HWPFDocument(
     			new FileInputStream(filename5)
     	);
     	extractor = new WordExtractor(doc);

     	assertEquals(
     			"This is a simple header, with a \u20ac euro symbol in it.\n\n",
     			extractor.getHeaderText()
     	);
     	text = extractor.getText();
     	assertTrue(
     			text.indexOf("This is a simple header") > -1
     	);
     }

     public void testWithFooter() throws Exception {
     	// Non-unicode
     	HWPFDocument doc = new HWPFDocument(
     			new FileInputStream(filename4)
     	);
     	extractor = new WordExtractor(doc);

     	assertEquals(
     			"Footer Left\tFooter Middle Footer Right\n",
     			extractor.getFooterText()
     	);

     	String text = extractor.getText();
     	assertTrue(
     			text.indexOf("Footer Left") > -1
     	);


     	// Unicode
     	doc = new HWPFDocument(
     			new FileInputStream(filename5)
     	);
     	extractor = new WordExtractor(doc);

     	assertEquals(
     			"The footer, with Moli\u00e8re, has Unicode in it.\n",
     			extractor.getFooterText()
     	);
     	text = extractor.getText();
     	assertTrue(
     			text.indexOf("The footer, with") > -1
     	);
     }

     /**
      * Tests that we can work with both {@link POIFSFileSystem}
      *  and {@link NPOIFSFileSystem}
      */
     public void testDifferentPOIFS() throws Exception {
 		 String dirname = System.getProperty("HWPF.testdata.path");
        File f = new File(dirname, "test2.doc");

        // Open the two filesystems
        DirectoryNode[] files = new DirectoryNode[2];
        files[0] = (new POIFSFileSystem(new FileInputStream(f))).getRoot();
        files[1] = (new NPOIFSFileSystem(f)).getRoot();

        // Open directly
        for(DirectoryNode dir : files) {
           WordExtractor extractor = new WordExtractor(dir);
           assertEquals(p_text1_block, extractor.getText());
        }

        // Open via a HWPFDocument
        for(DirectoryNode dir : files) {
           HWPFDocument doc = new HWPFDocument(dir);
           WordExtractor extractor = new WordExtractor(doc);
           assertEquals(p_text1_block, extractor.getText());
        }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.poi.hwpf.extractor;

	import java.io.File;
	import java.io.FileInputStream;

	import junit.framework.TestCase;

	import org.apache.poi.hwpf.HWPFDocument;
	import org.apache.poi.poifs.filesystem.DirectoryNode;
	import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
	import org.apache.poi.poifs.filesystem.POIFSFileSystem;

	/**
	* Test the different routes to extracting text
	*
	* @author Nick Burch (nick at torchbox dot com)
	*/
	public class TestWordExtractor extends TestCase {
	private String[] p_text1 = new String[] {
	"This is a simple word document\r\n",
	"\r\n",
	"It has a number of paragraphs in it\r\n",
	"\r\n",
	"Some of them even feature bold, italic and underlined text\r\n",
	"\r\n",
	"\r\n",
	"This bit is in a different font and size\r\n",
	"\r\n",
	"\r\n",
	"This bit features some red text.\r\n",
	"\r\n",
	"\r\n",
	"It is otherwise very very boring.\r\n"
	};
	private String p_text1_block = new String();

	// Well behaved document
	private WordExtractor extractor;
	// Corrupted document - can't do paragraph based stuff
	private WordExtractor extractor2;
	// A word doc embeded in an excel file
	private String filename3;

	// With header and footer
	private String filename4;
	// With unicode header and footer
	private String filename5;

	protected void setUp() throws Exception {
	String dirname = System.getProperty("HWPF.testdata.path");
	String pdirname = System.getProperty("POIFS.testdata.path");

	String filename = dirname + "/test2.doc";
	String filename2 = dirname + "/test.doc";
	filename3 = pdirname + "/excel_with_embeded.xls";
	filename4 = dirname + "/ThreeColHeadFoot.doc";
	filename5 = dirname + "/HeaderFooterUnicode.doc";

	extractor = new WordExtractor(new FileInputStream(filename));
	extractor2 = new WordExtractor(new FileInputStream(filename2));

	// Build splat'd out text version
	for(int i=0; i<p_text1.length; i++) {
	p_text1_block += p_text1[i];
	}
	}

	/**
	* Test paragraph based extraction
	*/
	public void testExtractFromParagraphs() {
	String[] text = extractor.getParagraphText();

	assertEquals(p_text1.length, text.length);
	for(int i=0; i<p_text1.length; i++) {
	assertEquals(p_text1[i], text[i]);
	}

	// On second one, should fall back
	assertEquals(1, extractor2.getParagraphText().length);
	}

	/**
	* Test the paragraph -> flat extraction
	*/
	public void testGetText() {
	assertEquals(p_text1_block, extractor.getText());

	// On second one, should fall back to text piece
	assertEquals(extractor2.getTextFromPieces(), extractor2.getText());
	}

	/**
	* Test textPieces based extraction
	*/
	public void testExtractFromTextPieces() throws Exception {
	String text = extractor.getTextFromPieces();
	assertEquals(p_text1_block, text);
	}


	/**
	* Test that we can get data from two different
	* embeded word documents
	* @throws Exception
	*/
	public void testExtractFromEmbeded() throws Exception {
	POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(filename3));
	HWPFDocument doc;
	WordExtractor extractor3;

	DirectoryNode dirA = (DirectoryNode)
	fs.getRoot().getEntry("MBD0000A3B7");
	DirectoryNode dirB = (DirectoryNode)
	fs.getRoot().getEntry("MBD0000A3B2");

	// Should have WordDocument and 1Table
	assertNotNull(dirA.getEntry("1Table"));
	assertNotNull(dirA.getEntry("WordDocument"));

	assertNotNull(dirB.getEntry("1Table"));
	assertNotNull(dirB.getEntry("WordDocument"));

	// Check each in turn
	doc = new HWPFDocument(dirA, fs);
	extractor3 = new WordExtractor(doc);

	assertNotNull(extractor3.getText());
	assertTrue(extractor3.getText().length() > 20);
	assertEquals("I am a sample document\r\nNot much on me\r\nI am document 1\r\n",
	extractor3.getText());
	assertEquals("Sample Doc 1", extractor3.getSummaryInformation().getTitle());
	assertEquals("Sample Test", extractor3.getSummaryInformation().getSubject());


	doc = new HWPFDocument(dirB, fs);
	extractor3 = new WordExtractor(doc);

	assertNotNull(extractor3.getText());
	assertTrue(extractor3.getText().length() > 20);
	assertEquals("I am another sample document\r\nNot much on me\r\nI am document 2\r\n",
	extractor3.getText());
	assertEquals("Sample Doc 2", extractor3.getSummaryInformation().getTitle());
	assertEquals("Another Sample Test", extractor3.getSummaryInformation().getSubject());
	}

	public void testWithHeader() throws Exception {
	// Non-unicode
	HWPFDocument doc = new HWPFDocument(
	new FileInputStream(filename4)
	);
	extractor = new WordExtractor(doc);

	assertEquals(
	"First header column!\tMid header Right header!\n",
	extractor.getHeaderText()
	);

	String text = extractor.getText();
	assertTrue(
	text.indexOf("First header column!") > -1
	);


	// Unicode
	doc = new HWPFDocument(
	new FileInputStream(filename5)
	);
	extractor = new WordExtractor(doc);

	assertEquals(
	"This is a simple header, with a \u20ac euro symbol in it.\n\n",
	extractor.getHeaderText()
	);
	text = extractor.getText();
	assertTrue(
	text.indexOf("This is a simple header") > -1
	);
	}

	public void testWithFooter() throws Exception {
	// Non-unicode
	HWPFDocument doc = new HWPFDocument(
	new FileInputStream(filename4)
	);
	extractor = new WordExtractor(doc);

	assertEquals(
	"Footer Left\tFooter Middle Footer Right\n",
	extractor.getFooterText()
	);

	String text = extractor.getText();
	assertTrue(
	text.indexOf("Footer Left") > -1
	);


	// Unicode
	doc = new HWPFDocument(
	new FileInputStream(filename5)
	);
	extractor = new WordExtractor(doc);

	assertEquals(
	"The footer, with Moli\u00e8re, has Unicode in it.\n",
	extractor.getFooterText()
	);
	text = extractor.getText();
	assertTrue(
	text.indexOf("The footer, with") > -1
	);
	}

	/**
	* Tests that we can work with both {@link POIFSFileSystem}
	* and {@link NPOIFSFileSystem}
	*/
	public void testDifferentPOIFS() throws Exception {
	String dirname = System.getProperty("HWPF.testdata.path");
	File f = new File(dirname, "test2.doc");

	// Open the two filesystems
	DirectoryNode[] files = new DirectoryNode[2];
	files[0] = (new POIFSFileSystem(new FileInputStream(f))).getRoot();
	files[1] = (new NPOIFSFileSystem(f)).getRoot();

	// Open directly
	for(DirectoryNode dir : files) {
	WordExtractor extractor = new WordExtractor(dir);
	assertEquals(p_text1_block, extractor.getText());
	}

	// Open via a HWPFDocument
	for(DirectoryNode dir : files) {
	HWPFDocument doc = new HWPFDocument(dir);
	WordExtractor extractor = new WordExtractor(doc);
	assertEquals(p_text1_block, extractor.getText());
	}
	}
	}