blob: 04fe38ca3829000e5f590df8bf3faef46fbfdd9d [file] [log] [blame]
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi.hwpf;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import org.apache.poi.POIDocument;
import org.apache.poi.hwpf.model.CHPBinTable;
import org.apache.poi.hwpf.model.FileInformationBlock;
import org.apache.poi.hwpf.model.FontTable;
import org.apache.poi.hwpf.model.ListTables;
import org.apache.poi.hwpf.model.PAPBinTable;
import org.apache.poi.hwpf.model.SectionTable;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.model.TextPieceTable;
import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
import org.apache.poi.hwpf.usermodel.ObjectsPool;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.Internal;
/**
 * This class holds much of the core of a Word document, but
 *  without some of the table structure information.
 * You generally want to work with one of
 *  {@link HWPFDocument} or {@link HWPFOldDocument}
 */
public abstract class HWPFDocumentCore extends POIDocument
{
    protected static final String STREAM_OBJECT_POOL = "ObjectPool";
    protected static final String STREAM_WORD_DOCUMENT = "WordDocument";

    /** Holds OLE2 objects */
    protected ObjectPoolImpl _objectPool;

    /** The FIB */
    protected FileInformationBlock _fib;

    /** Holds styles for this document.*/
    protected StyleSheet _ss;

    /** Contains formatting properties for text*/
    protected CHPBinTable _cbt;

    /** Contains formatting properties for paragraphs*/
    protected PAPBinTable _pbt;

    /** Contains formatting properties for sections.*/
    protected SectionTable _st;

    /** Holds fonts for this document.*/
    protected FontTable _ft;

    /** Hold list tables */
    protected ListTables _lt;

    /** main document stream buffer*/
    protected byte[] _mainStream;

    /**
     * Creates an empty instance that is not backed by any directory;
     *  none of the document fields are populated by this constructor.
     */
    protected HWPFDocumentCore()
    {
        super((DirectoryNode)null);
    }

    /**
     * Reads from the given stream until either the buffer is full or the
     *  end of the stream is reached. A single call to
     *  {@link InputStream#read(byte[])} is allowed to return fewer bytes
     *  than requested, so the read is repeated as needed.
     *
     * @param in the stream to read from
     * @param buffer the buffer to fill, starting at index 0
     * @return the number of bytes actually stored in the buffer, which is
     *         less than <code>buffer.length</code> only if the stream ended
     * @throws IOException if the underlying stream fails
     */
    private static int fillBuffer(InputStream in, byte[] buffer) throws IOException {
        int total = 0;
        while (total < buffer.length) {
            int got = in.read(buffer, total, buffer.length - total);
            if (got < 0) {
                // End of stream: report what we have rather than failing
                break;
            }
            total += got;
        }
        return total;
    }

    /**
     * Takes an InputStream, verifies that it's not RTF or PDF, builds a
     *  POIFSFileSystem from it, and returns that.
     *
     * @param istream the stream to inspect and then read the filesystem from
     * @return a POIFSFileSystem built from the (complete) contents of the stream
     * @throws IllegalArgumentException if the stream starts with the RTF
     *         (<code>{\rtf</code>) or PDF (<code>%PDF</code>) signature
     * @throws IOException if reading from the stream fails
     */
    public static POIFSFileSystem verifyAndBuildPOIFS(InputStream istream) throws IOException {
        // Open a PushbackInputStream, so we can peek at the first few bytes
        PushbackInputStream pis = new PushbackInputStream(istream,6);
        byte[] first6 = new byte[6];
        // The stream may deliver fewer than 6 bytes; remember how many we
        //  really got so that only those are pushed back afterwards
        int peeked = fillBuffer(pis, first6);

        // Does it start with {\rtf ? If so, it's really RTF
        if(first6[0] == '{' && first6[1] == '\\' && first6[2] == 'r'
            && first6[3] == 't' && first6[4] == 'f') {
            throw new IllegalArgumentException("The document is really a RTF file");
        } else if(first6[0] == '%' && first6[1] == 'P' && first6[2] == 'D' && first6[3] == 'F' ) {
            throw new IllegalArgumentException("The document is really a PDF file");
        }

        // OK, so it's neither RTF nor PDF
        // Open a POIFSFileSystem on the (pushed back) stream.
        // Only unread the bytes that were actually read, so that no zero
        //  padding is injected in front of the real data.
        pis.unread(first6, 0, peeked);
        return new POIFSFileSystem(pis);
    }

    /**
     * This constructor loads a Word document from an InputStream.
     *
     * @param istream The InputStream that contains the Word document.
     * @throws IOException If there is an unexpected IOException from the passed
     *         in InputStream.
     */
    public HWPFDocumentCore(InputStream istream) throws IOException
    {
        //do Ole stuff
        this( verifyAndBuildPOIFS(istream) );
    }

    /**
     * This constructor loads a Word document from a POIFSFileSystem
     *
     * @param pfilesystem The POIFSFileSystem that contains the Word document.
     * @throws IOException If there is an unexpected IOException from the passed
     *         in POIFSFileSystem.
     */
    public HWPFDocumentCore(POIFSFileSystem pfilesystem) throws IOException
    {
        this(pfilesystem.getRoot());
    }

    /**
     * This constructor loads a Word document from a specific point
     *  in a POIFSFileSystem, probably not the default.
     * Used typically to open embedded documents.
     *
     * @param directory The DirectoryNode that contains the Word document.
     * @throws IOException If there is an unexpected IOException from the passed
     *         in POIFSFileSystem.
     */
    public HWPFDocumentCore(DirectoryNode directory) throws IOException {
        // Sort out the hpsf properties
        super(directory);

        // read in the main stream.
        DocumentEntry documentProps = (DocumentEntry)
            directory.getEntry(STREAM_WORD_DOCUMENT);
        _mainStream = new byte[documentProps.getSize()];

        // Read until the buffer is full (a single read() may return early),
        //  and always release the document stream afterwards
        InputStream wordStream = directory.createDocumentInputStream(STREAM_WORD_DOCUMENT);
        try {
            fillBuffer(wordStream, _mainStream);
        } finally {
            wordStream.close();
        }

        // Create our FIB from the main stream
        _fib = new FileInformationBlock(_mainStream);

        // The object pool directory is optional; a missing entry is passed
        //  on to ObjectPoolImpl as null
        DirectoryEntry objectPoolEntry;
        try {
            objectPoolEntry = (DirectoryEntry) directory
                    .getEntry(STREAM_OBJECT_POOL);
        } catch (FileNotFoundException exc) {
            objectPoolEntry = null;
        }
        _objectPool = new ObjectPoolImpl(objectPoolEntry);
    }

    /**
     * Returns the range which covers the whole of the document, but excludes
     *  any headers and footers.
     */
    public abstract Range getRange();

    /**
     * Returns the range that covers all text in the file, including main text,
     *  footnotes, headers and comments
     */
    public abstract Range getOverallRange();

    /**
     * Returns document text, i.e. text information from all text pieces,
     *  including OLE descriptions and field codes
     */
    public String getDocumentText() {
        return getText().toString();
    }

    /**
     * Internal method to access document text
     */
    @Internal
    public abstract StringBuilder getText();

    /**
     * Returns the table holding the formatting properties for text.
     * The backing field is not assigned by the constructors of this class.
     */
    public CHPBinTable getCharacterTable()
    {
        return _cbt;
    }

    /**
     * Returns the table holding the formatting properties for paragraphs.
     * The backing field is not assigned by the constructors of this class.
     */
    public PAPBinTable getParagraphTable()
    {
        return _pbt;
    }

    /**
     * Returns the table holding the formatting properties for sections.
     * The backing field is not assigned by the constructors of this class.
     */
    public SectionTable getSectionTable()
    {
        return _st;
    }

    /**
     * Returns the styles for this document.
     * The backing field is not assigned by the constructors of this class.
     */
    public StyleSheet getStyleSheet()
    {
        return _ss;
    }

    /**
     * Returns the list tables for this document.
     * The backing field is not assigned by the constructors of this class.
     */
    public ListTables getListTables()
    {
        return _lt;
    }

    /**
     * Returns the fonts for this document.
     * The backing field is not assigned by the constructors of this class.
     */
    public FontTable getFontTable()
    {
        return _ft;
    }

    /**
     * Returns the FIB (File Information Block) that was built from the
     *  main document stream.
     */
    public FileInformationBlock getFileInformationBlock()
    {
        return _fib;
    }

    /**
     * Returns the pool that holds the OLE2 objects of this document.
     */
    public ObjectsPool getObjectsPool()
    {
        return _objectPool;
    }

    /**
     * Returns the text piece table of this document.
     */
    public abstract TextPieceTable getTextTable();
}