blob: 64981e94e733e68b39fdffb55652a26d1c2716dd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.poi.hwpf.extractor;
import java.io.IOException;
import java.io.InputStream;
import java.io.FileInputStream;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.TextPiece;
import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
 * Class to extract the text from a Word Document.
 *
 * You should use either getParagraphText() or getText() unless
 * you have a strong reason otherwise.
 *
 * @author Nick Burch (nick at torchbox dot com)
 */
public class WordExtractor extends POIOLE2TextExtractor {
    /**
     * Filesystem the document was opened from. Only set by the
     * {@link #WordExtractor(POIFSFileSystem)} constructor, and not
     * read anywhere in this class.
     */
    private POIFSFileSystem fs;

    /** The document that all of the text is extracted from. */
    private final HWPFDocument doc;

    /**
     * Create a new Word Extractor
     * @param is InputStream containing the word file
     * @throws IOException if the stream cannot be turned into a document
     */
    public WordExtractor(InputStream is) throws IOException {
        this( HWPFDocument.verifyAndBuildPOIFS(is) );
    }

    /**
     * Create a new Word Extractor
     * @param fs POIFSFileSystem containing the word file
     * @throws IOException if the document cannot be built from the filesystem
     */
    public WordExtractor(POIFSFileSystem fs) throws IOException {
        this(new HWPFDocument(fs));
        this.fs = fs;
    }

    /**
     * Create a new Word Extractor
     * @param dir DirectoryNode containing the word file
     * @throws IOException if the document cannot be built from the directory
     */
    public WordExtractor(DirectoryNode dir) throws IOException {
        this(new HWPFDocument(dir));
        // No POIFSFileSystem is handed to this constructor, so the fs field
        // is left unset here (the former "this.fs = fs" merely assigned the
        // field to itself).
    }

    /**
     * Create a new Word Extractor
     * @param doc The HWPFDocument to extract from
     * @throws IOException declared for callers; nothing in this constructor
     *  body throws it directly
     */
    public WordExtractor(HWPFDocument doc) throws IOException {
        super(doc);
        this.doc = doc;
    }

    /**
     * Command line extractor, so people will stop moaning that
     * they can't just run this.
     *
     * Prints the result of {@link #getText()} for the file named by the
     * first argument to standard output. With no arguments, prints a usage
     * message to standard error and exits with status 1.
     *
     * @param args command line arguments; only the first one is used
     * @throws IOException if the file cannot be opened or processed
     */
    public static void main(String[] args) throws IOException {
        if(args.length == 0) {
            System.err.println("Use:");
            System.err.println(" java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
            System.exit(1);
        }

        // Process the first argument as a file
        FileInputStream fin = new FileInputStream(args[0]);
        try {
            WordExtractor extractor = new WordExtractor(fin);
            System.out.println(extractor.getText());
        } finally {
            // Always release the file handle, even if extraction fails
            fin.close();
        }
    }

    /**
     * Get the text from the word file, as an array with one String
     * per paragraph.
     *
     * Any paragraph ending in "\r" gets a "\n" appended. If walking the
     * paragraphs fails for any reason, the result is instead a one element
     * array holding the output of {@link #getTextFromPieces()}.
     *
     * @return the paragraph texts, or a single element fallback array
     */
    public String[] getParagraphText() {
        String[] ret;

        // Extract using the model code
        try {
            Range r = doc.getRange();

            ret = new String[r.numParagraphs()];
            for(int i=0; i<ret.length; i++) {
                Paragraph p = r.getParagraph(i);
                ret[i] = p.text();

                // Fix the line ending
                if(ret[i].endsWith("\r")) {
                    ret[i] = ret[i] + "\n";
                }
            }
        } catch(Exception e) {
            // Something's up with turning the text pieces into paragraphs
            // Fall back to ripping out the text pieces
            // (deliberately broad catch: this is a best-effort fallback)
            ret = new String[1];
            ret[0] = getTextFromPieces();
        }

        return ret;
    }

    /**
     * Add the header/footer text, if it's not empty.
     *
     * Every "\r" is turned into "\n" first. Text not ending in a newline
     * gets one added; text ending in two newlines has the last one
     * dropped; anything else is appended as it is.
     *
     * @param text the header or footer text, may be null or empty
     * @param out the buffer to append to
     */
    private void appendHeaderFooter(String text, StringBuffer out) {
        if(text == null || text.length() == 0) {
            return;
        }

        text = text.replace('\r', '\n');
        if(! text.endsWith("\n")) {
            out.append(text);
            out.append('\n');
            return;
        }
        if(text.endsWith("\n\n")) {
            // Trim a single trailing newline from the doubled ending
            out.append(text.substring(0, text.length()-1));
            return;
        }
        out.append(text);
    }

    /**
     * Grab the text from the headers.
     *
     * The first, even and odd headers are appended in that order, each
     * one being skipped if it is null or empty.
     *
     * @return the combined header text, empty if there is none
     */
    public String getHeaderText() {
        HeaderStories hs = new HeaderStories(doc);

        StringBuffer ret = new StringBuffer();
        // appendHeaderFooter ignores null / empty text, so no checks here
        appendHeaderFooter(hs.getFirstHeader(), ret);
        appendHeaderFooter(hs.getEvenHeader(), ret);
        appendHeaderFooter(hs.getOddHeader(), ret);

        return ret.toString();
    }

    /**
     * Grab the text from the footers.
     *
     * The first, even and odd footers are appended in that order, each
     * one being skipped if it is null or empty.
     *
     * @return the combined footer text, empty if there is none
     */
    public String getFooterText() {
        HeaderStories hs = new HeaderStories(doc);

        StringBuffer ret = new StringBuffer();
        // appendHeaderFooter ignores null / empty text, so no checks here
        appendHeaderFooter(hs.getFirstFooter(), ret);
        appendHeaderFooter(hs.getEvenFooter(), ret);
        appendHeaderFooter(hs.getOddFooter(), ret);

        return ret.toString();
    }

    /**
     * Grab the text out of the text pieces. Might also include various
     * bits of crud, but will work in cases where the text piece -> paragraph
     * mapping is broken. Fast too.
     *
     * Each piece's raw bytes are decoded as UTF-16LE if the piece reports
     * itself as unicode, and as Cp1252 otherwise.
     *
     * @return the text of all the pieces joined together, with some of
     *  the line endings fixed up
     */
    public String getTextFromPieces() {
        StringBuffer textBuf = new StringBuffer();

        Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
        while (textPieces.hasNext()) {
            TextPiece piece = (TextPiece) textPieces.next();

            String encoding = "Cp1252";
            if (piece.isUnicode()) {
                encoding = "UTF-16LE";
            }
            try {
                String text = new String(piece.getRawBytes(), encoding);
                textBuf.append(text);
            } catch(UnsupportedEncodingException e) {
                // Keep the original exception attached as the cause
                InternalError err = new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
                err.initCause(e);
                throw err;
            }
        }

        String text = textBuf.toString();

        // Fix line endings (Note - won't get all of them)
        text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
        text = text.replaceAll("\r\r", "\r\n\r\n");

        if(text.endsWith("\r")) {
            text += "\n";
        }

        return text;
    }

    /**
     * Grab the text, based on the paragraphs. Shouldn't include any crud,
     * but slightly slower than getTextFromPieces().
     *
     * @return the header text, then every paragraph, then the footer text
     */
    public String getText() {
        StringBuffer ret = new StringBuffer();

        ret.append(getHeaderText());

        String[] text = getParagraphText();
        for(int i=0; i<text.length; i++) {
            ret.append(text[i]);
        }

        ret.append(getFooterText());

        return ret.toString();
    }

    /**
     * Removes any fields (eg macros, page markers etc)
     * from the string.
     *
     * Simply delegates to Range.stripFields(String).
     *
     * @param text the text to strip the fields from
     * @return whatever Range.stripFields returns for the text
     */
    public static String stripFields(String text) {
        return Range.stripFields(text);
    }
}