blob: 51128d9b7cfdebd37a6eeec8e92df438f3bf59c3 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.rtf;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
import java.nio.Buffer;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.util.Calendar;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Map;
import java.util.Stack;
import java.util.TimeZone;
import org.apache.commons.io.IOUtils;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.CharsetUtils;
/* Tokenizes and performs a "shallow" parse of the RTF
* document, just enough to properly decode the text.
*
* TODO: we should cutover to a "real" tokenizer (eg JFlex);
* it should give better perf, by replacing the excessive
* "else if" string compares with FSA traversal. */
final class TextExtractor {
// Single space character.
private static final char SPACE = ' ';
// XHTML element names emitted for paragraphs and lists:
private static final String P = "p";
private static final String LI = "li";
private static final String OL = "ol";
private static final String UL = "ul";
// ASCII is declared before the other charset constants because
// getCharset(String) returns it as the fallback whenever a lookup
// throws IllegalArgumentException.
private static final Charset ASCII = Charset.forName("US-ASCII");
// Charsets referenced by the fcharset / ansicpg tables below. Each is
// resolved once, at class initialization, through getCharset(String).
// NOTE(review): several constants differ from another one only in the
// letter case of the name passed to getCharset (e.g. CP12582 uses
// "CP1258" while CP1258 uses "cp1258"); they presumably resolve to the
// same Charset -- confirm against CharsetUtils.forName.
private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
private static final Charset MAC_ROMAN = getCharset("MacRoman");
private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
private static final Charset WINDOWS_57011 = getCharset("windows-57011");
private static final Charset WINDOWS_57010 = getCharset("windows-57010");
private static final Charset WINDOWS_57009 = getCharset("windows-57009");
private static final Charset WINDOWS_57008 = getCharset("windows-57008");
private static final Charset WINDOWS_57007 = getCharset("windows-57007");
private static final Charset WINDOWS_57006 = getCharset("windows-57006");
private static final Charset WINDOWS_57005 = getCharset("windows-57005");
private static final Charset WINDOWS_57004 = getCharset("windows-57004");
private static final Charset WINDOWS_57003 = getCharset("windows-57003");
private static final Charset X_ISCII91 = getCharset("x-ISCII91");
private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
private static final Charset X_JOHAB = getCharset("x-Johab");
private static final Charset CP12582 = getCharset("CP1258");
private static final Charset CP12572 = getCharset("CP1257");
private static final Charset CP12562 = getCharset("CP1256");
private static final Charset CP12552 = getCharset("CP1255");
private static final Charset CP12542 = getCharset("CP1254");
private static final Charset CP12532 = getCharset("CP1253");
private static final Charset CP1252 = getCharset("CP1252");
private static final Charset CP12512 = getCharset("CP1251");
private static final Charset CP12502 = getCharset("CP1250");
private static final Charset CP950 = getCharset("CP950");
private static final Charset CP949 = getCharset("CP949");
private static final Charset MS9362 = getCharset("MS936");
private static final Charset MS8742 = getCharset("MS874");
private static final Charset CP866 = getCharset("CP866");
private static final Charset CP865 = getCharset("CP865");
private static final Charset CP864 = getCharset("CP864");
private static final Charset CP863 = getCharset("CP863");
private static final Charset CP862 = getCharset("CP862");
private static final Charset CP860 = getCharset("CP860");
private static final Charset CP852 = getCharset("CP852");
private static final Charset CP8502 = getCharset("CP850");
private static final Charset CP819 = getCharset("CP819");
private static final Charset WINDOWS_720 = getCharset("windows-720");
private static final Charset WINDOWS_711 = getCharset("windows-711");
private static final Charset WINDOWS_710 = getCharset("windows-710");
private static final Charset WINDOWS_709 = getCharset("windows-709");
private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
private static final Charset CP4372 = getCharset("CP437");
private static final Charset CP850 = getCharset("cp850");
private static final Charset CP437 = getCharset("cp437");
private static final Charset MS874 = getCharset("ms874");
private static final Charset CP1257 = getCharset("cp1257");
private static final Charset CP1256 = getCharset("cp1256");
private static final Charset CP1255 = getCharset("cp1255");
private static final Charset CP1258 = getCharset("cp1258");
private static final Charset CP1254 = getCharset("cp1254");
private static final Charset CP1253 = getCharset("cp1253");
private static final Charset MS950 = getCharset("ms950");
private static final Charset MS936 = getCharset("ms936");
private static final Charset MS1361 = getCharset("ms1361");
private static final Charset MS932 = getCharset("MS932");
private static final Charset CP1251 = getCharset("cp1251");
private static final Charset CP1250 = getCharset("cp1250");
private static final Charset MAC_THAI = getCharset("MacThai");
private static final Charset MAC_TURKISH = getCharset("MacTurkish");
private static final Charset MAC_GREEK = getCharset("MacGreek");
private static final Charset MAC_ARABIC = getCharset("MacArabic");
private static final Charset MAC_HEBREW = getCharset("MacHebrew");
private static final Charset JOHAB = getCharset("johab");
private static final Charset BIG5 = getCharset("Big5");
private static final Charset GB2312 = getCharset("GB2312");
private static final Charset MS949 = getCharset("ms949");
// The RTF doc has a "font table" that assigns ords
// (f0, f1, f2, etc.) to fonts and charsets, using the
// \fcharsetN control word. This mapping maps from the
// N to corresponding Java charset:
private static final Map<Integer, Charset> FCHARSET_MAP = new HashMap<>();
// The RTF may specify the \ansicpgN charset in the
// header; this maps the N to the corresponding Java
// character set:
private static final Map<Integer, Charset> ANSICPG_MAP = new HashMap<>();
static {
FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
// charset 1 is Default
// charset 2 is Symbol
FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
FCHARSET_MAP.put(79, MS949); // Mac Hangul
FCHARSET_MAP.put(80, GB2312); // Mac GB2312
FCHARSET_MAP.put(81, BIG5); // Mac Big5
FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
FCHARSET_MAP.put(88, CP1250); // Mac East Europe
FCHARSET_MAP.put(89, CP1251); // Mac Russian
FCHARSET_MAP.put(128, MS932); // Shift JIS
FCHARSET_MAP.put(129, MS949); // Hangul
FCHARSET_MAP.put(130, MS1361); // Johab
FCHARSET_MAP.put(134, MS936); // GB2312
FCHARSET_MAP.put(136, MS950); // Big5
FCHARSET_MAP.put(161, CP1253); // Greek
FCHARSET_MAP.put(162, CP1254); // Turkish
FCHARSET_MAP.put(163, CP1258); // Vietnamese
FCHARSET_MAP.put(177, CP1255); // Hebrew
FCHARSET_MAP.put(178, CP1256); // Arabic
// FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
// FCHARSET_MAP.put( 180, "" ); // Arabic user
// FCHARSET_MAP.put( 181, "" ); // Hebrew user
FCHARSET_MAP.put(186, CP1257); // Baltic
FCHARSET_MAP.put(204, CP1251); // Russian
FCHARSET_MAP.put(222, MS874); // Thai
FCHARSET_MAP.put(238, CP1250); // Eastern European
FCHARSET_MAP.put(254, CP437); // PC 437
FCHARSET_MAP.put(255, CP850); // OEM
}
static {
    // Fill the table that translates the N of the header's ansicpgN
    // control word into the Java charset used as the document-wide
    // default. Each code page is registered under its own number.
    ANSICPG_MAP.put(437, CP4372); // US IBM
    ANSICPG_MAP.put(708, ISO_8859_6); // Arabic (ASMO 708)
    ANSICPG_MAP.put(709, WINDOWS_709); // Arabic (ASMO 449+, BCON V4)
    ANSICPG_MAP.put(710, WINDOWS_710); // Arabic (transparent Arabic)
    // 711 and 720 were previously registered under key 710, which
    // overwrote the 710 entry and left 711/720 without any mapping.
    ANSICPG_MAP.put(711, WINDOWS_711); // Arabic (Nafitha Enhanced)
    ANSICPG_MAP.put(720, WINDOWS_720); // Arabic (transparent ASMO)
    ANSICPG_MAP.put(819, CP819); // Windows 3.1 (US & Western Europe)
    ANSICPG_MAP.put(850, CP8502); // IBM Multilingual
    ANSICPG_MAP.put(852, CP852); // Eastern European
    ANSICPG_MAP.put(860, CP860); // Portuguese
    ANSICPG_MAP.put(862, CP862); // Hebrew
    ANSICPG_MAP.put(863, CP863); // French Canadian
    ANSICPG_MAP.put(864, CP864); // Arabic
    ANSICPG_MAP.put(865, CP865); // Norwegian
    ANSICPG_MAP.put(866, CP866); // Soviet Union
    ANSICPG_MAP.put(874, MS8742); // Thai
    ANSICPG_MAP.put(932, MS932); // Japanese
    ANSICPG_MAP.put(936, MS9362); // Simplified Chinese
    ANSICPG_MAP.put(949, CP949); // Korean
    ANSICPG_MAP.put(950, CP950); // Traditional Chinese
    ANSICPG_MAP.put(1250, CP12502); // Eastern European
    ANSICPG_MAP.put(1251, CP12512); // Cyrillic
    ANSICPG_MAP.put(1252, CP1252); // Western European
    ANSICPG_MAP.put(1253, CP12532); // Greek
    ANSICPG_MAP.put(1254, CP12542); // Turkish
    ANSICPG_MAP.put(1255, CP12552); // Hebrew
    ANSICPG_MAP.put(1256, CP12562); // Arabic
    ANSICPG_MAP.put(1257, CP12572); // Baltic
    ANSICPG_MAP.put(1258, CP12582); // Vietnamese
    ANSICPG_MAP.put(1361, X_JOHAB); // Johab
    ANSICPG_MAP.put(10000, MAC_ROMAN); // Mac Roman
    ANSICPG_MAP.put(10001, SHIFT_JIS); // Mac Japan
    ANSICPG_MAP.put(10004, MAC_ARABIC); // Mac Arabic
    ANSICPG_MAP.put(10005, MAC_HEBREW); // Mac Hebrew
    ANSICPG_MAP.put(10006, MAC_GREEK); // Mac Greek
    ANSICPG_MAP.put(10007, MAC_CYRILLIC); // Mac Cyrillic
    ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE); // MAC Latin2
    ANSICPG_MAP.put(10081, MAC_TURKISH); // Mac Turkish
    ANSICPG_MAP.put(57002, X_ISCII91); // Devanagari
    // TODO: in theory these other charsets are simple
    // shifts off of Devanagari, so we could impl that
    // here:
    ANSICPG_MAP.put(57003, WINDOWS_57003); // Bengali
    ANSICPG_MAP.put(57004, WINDOWS_57004); // Tamil
    ANSICPG_MAP.put(57005, WINDOWS_57005); // Telugu
    ANSICPG_MAP.put(57006, WINDOWS_57006); // Assamese
    ANSICPG_MAP.put(57007, WINDOWS_57007); // Oriya
    ANSICPG_MAP.put(57008, WINDOWS_57008); // Kannada
    ANSICPG_MAP.put(57009, WINDOWS_57009); // Malayalam
    ANSICPG_MAP.put(57010, WINDOWS_57010); // Gujarati
    ANSICPG_MAP.put(57011, WINDOWS_57011); // Punjabi
}
// Used when we decode bytes -> chars using CharsetDecoder:
private final char[] outputArray = new char[128];
// View over outputArray; typed as Buffer and cast to CharBuffer at the
// decode/flush call sites in pushBytes():
private final Buffer outputCharBuffer = CharBuffer.wrap(outputArray);
// Holds the font table from this RTF doc, mapping
// the font number (from \fN control word) to the
// corresponding charset:
private final Map<Integer, Charset> fontToCharset = new HashMap<>();
// Group stack: when we open a new group, we push
// the previous group state onto the stack; when we
// close the group, we restore it
private final LinkedList<GroupState> groupStates = new LinkedList<>();
// Receives output chars while inHeader is true or fieldState == 1
// (see addOutputChar and pushBytes) instead of the content handler:
private final StringBuilder pendingBuffer = new StringBuilder();
// Destination for body text and markup:
private final XHTMLContentHandler out;
// Destination for document properties (page count, word count, ...):
private final Metadata metadata;
// Receives embedded object / picture data and their metadata chars:
private final RTFEmbObjHandler embObjHandler;
// How many next ansi chars we should skip; this
// is 0 except when we are still in the "ansi
// shadow" after seeing a unicode escape, at which
// point it's set to the last ucN skip we had seen:
int ansiSkip = 0;
// NOTE(review): not referenced anywhere in the visible part of this
// file -- confirm whether it is still needed.
private int written = 0;
// Hold pending bytes (encoded in the current charset)
// for text output:
private byte[] pendingBytes = new byte[16];
private int pendingByteCount;
// View over pendingBytes; re-wrapped whenever pendingBytes is regrown:
private Buffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
// Holds pending chars for text output
private char[] pendingChars = new char[10];
private int pendingCharCount;
// Holds chars for a still-being-tokenized control word
private byte[] pendingControl = new byte[10];
private int pendingControlCount;
// Reused when possible:
private CharsetDecoder decoder;
// Charset that `decoder` was created for (null until first use):
private Charset lastCharset;
// Document-wide charset; changed by header control words such as
// ansi, mac, pc, pca and ansicpgN:
private Charset globalCharset = WINDOWS_1252;
// Font number from deffN, or -1 if none was seen:
private int globalDefaultFont = -1;
// Font number of the font-table entry currently being defined:
private int curFontID = -1;
// Current group state; in theory this initial
// GroupState is unused because the RTF doc should
// immediately open the top group (start with {):
private GroupState groupState = new GroupState();
// True until the first body-level font change is seen after the
// font table (see processControlWord(int, PushbackInputStream)):
private boolean inHeader = true;
//0 not yet in font table, 1 in font table, 2 have processed font table
private int fontTableState = 0;
//depth at which the font table started
private int fontTableDepth;
// Non null if we are processing metadata (title,
// keywords, etc.) inside the info group:
private Property nextMetaData;
// True while a <p> or <li> opened by lazyStartParagraph is open:
private boolean inParagraph;
// Non-zero if we are processing inside a field destination:
private int fieldState;
// Non-zero list index
private int pendingListEnd;
// List definitions keyed by list id, for the listtable and
// listoverridetable destinations respectively:
private Map<Integer, ListDescriptor> listTable = new HashMap<>();
private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<>();
// Whichever of the two tables above is currently being populated:
private Map<Integer, ListDescriptor> currentListTable;
// List descriptor currently being populated, if any:
private ListDescriptor currentList;
// Index into currentList.numberType for the level being defined;
// -1 when no level is active:
private int listTableLevel = -1;
// When true, no ul/ol/li markup is emitted (see inList, startList, endList):
private boolean ignoreListMarkup;
// Non-null if we've seen the url for a HYPERLINK but not yet
// its text:
private String pendingURL;
// Used to process the sub-groups inside the upr
// group:
private int uprState = -1;
// Used when extracting CREATION date:
private int year, month, day, hour, minute;
//This keeps track of the following elements as they are
//written to the handler: p, li, ol, ul
//This tries to prevent malformed tag orders in the RTF
//e.g. <p></li></ol></p>
//from generating malformed xml tags. (TIKA-2899)
//This may conceal problems with our parser.
//TODO:
// 1) do we need to add all elements, a, b, i, etc.
// 2) are we doing the right thing by ignoring an element
// if its match doesn't pop off the stack...or should
// we pop all at the first failure.
private Stack<String> paragraphStack = new Stack<>();
//this is an arbitrary limit on the size of the stack
//to defend against DoS with memory consumption
private int maxStackSize = 1000;
/**
 * Creates an extractor bound to the given output sinks.
 *
 * @param out           handler that receives the extracted XHTML events
 * @param metadata      receives document properties found while parsing
 * @param embObjHandler receives embedded object and picture data
 */
public TextExtractor(XHTMLContentHandler out, Metadata metadata,
                     RTFEmbObjHandler embObjHandler) {
    this.out = out;
    this.embObjHandler = embObjHandler;
    this.metadata = metadata;
}
/**
 * Looks up a charset by name, substituting US-ASCII when the lookup
 * fails with an {@link IllegalArgumentException}.
 *
 * @param name charset name to resolve
 * @return the resolved charset, or {@code ASCII} as the fallback
 */
private static Charset getCharset(String name) {
    Charset resolved;
    try {
        resolved = CharsetUtils.forName(name);
    } catch (IllegalArgumentException e) {
        // Lookup failed for this name: fall back to ASCII
        resolved = ASCII;
    }
    return resolved;
}
/**
 * Tells whether {@code ch} is an ASCII hexadecimal digit
 * (0-9, a-f or A-F).
 *
 * @param ch character code to test; any int is accepted
 * @return true if {@code ch} is a hex digit
 */
protected static boolean isHexChar(int ch) {
    if (ch >= '0' && ch <= '9') {
        return true;
    }
    if (ch >= 'a' && ch <= 'f') {
        return true;
    }
    return ch >= 'A' && ch <= 'F';
}
/**
 * Tells whether {@code ch} is an ASCII letter (a-z or A-Z).
 *
 * @param ch character code to test; any int is accepted
 * @return true if {@code ch} is an ASCII letter
 */
private static boolean isAlpha(int ch) {
    final boolean lowerCase = 'a' <= ch && ch <= 'z';
    final boolean upperCase = 'A' <= ch && ch <= 'Z';
    return lowerCase || upperCase;
}
/**
 * Tells whether {@code ch} is an ASCII decimal digit (0-9).
 *
 * @param ch character code to test; any int is accepted
 * @return true if {@code ch} is a decimal digit
 */
private static boolean isDigit(int ch) {
    if (ch < '0') {
        return false;
    }
    return ch <= '9';
}
/**
 * Converts a hexadecimal digit to its numeric value.
 *
 * <p>Callers are expected to have validated {@code ch} (for example
 * with {@link #isHexChar(int)}); the letter ranges checked here span
 * the whole alphabet, so letters beyond f/F yield values above 15.
 *
 * @param ch character code of the digit
 * @return 0-9 for decimal digits, 10 plus the alphabet offset for letters
 */
protected static int hexValue(int ch) {
    if ('0' <= ch && ch <= '9') {
        return ch - '0';
    }
    if ('a' <= ch && ch <= 'z') {
        return (ch - 'a') + 10;
    }
    assert ch >= 'A' && ch <= 'Z';
    return (ch - 'A') + 10;
}
/**
 * Reports whether list markup is currently being suppressed.
 *
 * @return the value last passed to {@link #setIgnoreListMarkup(boolean)},
 *         or false if it was never called
 */
public boolean isIgnoringLists() {
    return this.ignoreListMarkup;
}
/**
 * Enables or disables suppression of list markup.
 *
 * @param ignore true to stop emitting list elements
 */
public void setIgnoreListMarkup(boolean ignore) {
    ignoreListMarkup = ignore;
}
/**
 * Flushes whichever text buffer currently holds data: the pending
 * bytes if there are any, otherwise the pending chars.
 */
private void pushText() throws IOException, SAXException, TikaException {
    if (pendingByteCount == 0) {
        pushChars();
        return;
    }
    // Bytes are pending, so no chars should be pending alongside them
    assert pendingCharCount == 0;
    pushBytes();
}
/**
 * Buffers one byte (a unit in the current charset) for output.
 *
 * <p>Pending chars are flushed first. While inside a picture group
 * ({@code pictDepth > 0}) the byte is forwarded to the embedded-object
 * handler as a metadata char rather than being buffered.
 *
 * @param b byte value, expected to be in the range 0..255
 */
private void addOutputByte(int b) throws IOException, SAXException, TikaException {
    assert b >= 0 && b < 256 : "byte value out of range: " + b;
    if (pendingCharCount != 0) {
        pushChars();
    }
    if (groupState.pictDepth > 0) {
        embObjHandler.writeMetadataChar((char) b);
        return;
    }
    if (pendingByteCount == pendingBytes.length) {
        // Buffer is full: grow it by 25% and re-wrap the byte buffer view
        final int grownLength = (int) (pendingBytes.length * 1.25);
        final byte[] grown = new byte[grownLength];
        System.arraycopy(pendingBytes, 0, grown, 0, pendingByteCount);
        pendingBytes = grown;
        pendingByteBuffer = ByteBuffer.wrap(grown);
    }
    pendingBytes[pendingByteCount] = (byte) b;
    pendingByteCount++;
}
/**
 * Appends one letter to the control word currently being tokenized.
 *
 * @param b letter to append; asserted to be an ASCII letter
 */
private void addControl(int b) {
    assert isAlpha(b);
    final int used = pendingControlCount;
    if (used == pendingControl.length) {
        // Buffer is full: grow it by 25%
        final byte[] grown = new byte[(int) (pendingControl.length * 1.25)];
        System.arraycopy(pendingControl, 0, grown, 0, used);
        pendingControl = grown;
    }
    pendingControl[used] = (byte) b;
    pendingControlCount = used + 1;
}
/**
 * Buffers one UTF-16 code unit for output.
 *
 * <p>Pending bytes are flushed first. The char then goes to
 * {@code pendingBuffer} while in the header or when
 * {@code fieldState == 1}, to the embedded-object handler while the
 * group's {@code sn} or {@code sv} flag is set, and otherwise into the
 * pending-chars buffer.
 *
 * @param ch code unit to buffer
 */
private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
    if (pendingByteCount != 0) {
        pushBytes();
    }
    if (inHeader || fieldState == 1) {
        pendingBuffer.append(ch);
        return;
    }
    if (groupState.sn || groupState.sv) {
        embObjHandler.writeMetadataChar(ch);
        return;
    }
    if (pendingCharCount == pendingChars.length) {
        // Buffer is full: grow it by 25%
        final char[] grown = new char[(int) (pendingChars.length * 1.25)];
        System.arraycopy(pendingChars, 0, grown, 0, pendingCharCount);
        pendingChars = grown;
    }
    pendingChars[pendingCharCount] = ch;
    pendingCharCount++;
}
/**
 * Shallow-parses the entire document, writing output to
 * {@code this.out} and {@code this.metadata}.
 *
 * <p>Debugging hint: to trace the raw input, wrap {@code in} in a
 * FilterInputStream whose read methods echo every byte to System.out
 * before delegating to this method.
 *
 * @param in stream positioned at the start of the RTF document
 */
public void extract(InputStream in) throws IOException, SAXException, TikaException {
    // The tokenizer needs to push bytes back; give it a 2-byte pushback buffer
    final PushbackInputStream pushback = new PushbackInputStream(in, 2);
    extract(pushback);
}
/**
 * Main tokenizer loop: reads the stream byte by byte and dispatches on
 * backslash (control token), opening brace (group start), closing
 * brace (group end), or plain data. Parsing stops at end of stream or
 * as soon as a closing brace leaves the group stack empty. Afterwards
 * the last paragraph and any still-open tracked tags are closed and
 * the document is ended.
 *
 * @param in pushback-capable stream over the RTF document
 */
private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
    out.startDocument();
    int b;
    while ((b = in.read()) != -1) {
        if (b == '\\') {
            parseControlToken(in);
            continue;
        }
        if (b == '{') {
            pushText();
            processGroupStart(in);
            continue;
        }
        if (b == '}') {
            pushText();
            processGroupEnd();
            if (groupStates.isEmpty()) {
                // parsed document closing brace
                break;
            }
            continue;
        }
        if (groupState.objdata || groupState.pictDepth == 1) {
            // Raw data of an embedded object or picture
            embObjHandler.writeHexChar(b);
            continue;
        }
        if (b == '\r' || b == '\n') {
            // Linefeed and carriage return are not significant
            continue;
        }
        final boolean wanted = !groupState.ignore || nextMetaData != null
                || groupState.sn || groupState.sv;
        if (!wanted) {
            continue;
        }
        if (ansiSkip != 0) {
            // Still in the shadow of a unicode escape: drop this byte
            ansiSkip--;
        } else {
            addOutputByte(b);
        }
    }
    endParagraph(false);
    //close out whatever tags were left
    while (!paragraphStack.isEmpty()) {
        end(paragraphStack.pop());
    }
    out.endDocument();
}
/**
 * Handles whatever follows a backslash: a hex escape (apostrophe), a
 * control word (letter), an escaped literal (brace, backslash, CR or
 * LF) or a control symbol (any other byte). End of stream directly
 * after the backslash is ignored.
 *
 * @param in stream positioned just after the backslash
 */
private void parseControlToken(PushbackInputStream in)
        throws IOException, SAXException, TikaException {
    final int next = in.read();
    if (next == -1) {
        // Nothing follows the backslash
        return;
    }
    if (next == '\'') {
        // escaped hex char
        parseHexChar(in);
        return;
    }
    if (isAlpha(next)) {
        // control word
        parseControlWord((char) next, in);
        return;
    }
    switch (next) {
        case '{':
        case '}':
        case '\\':
        case '\r':
        case '\n':
            // escaped char
            addOutputByte(next);
            break;
        default:
            // control symbol, eg \* or \~
            processControlSymbol((char) next);
            break;
    }
}
/**
 * Parses the two hex digits of a hex escape (backslash-apostrophe
 * followed by two hex digits) and buffers the resulting byte, unless
 * it falls inside the ansi shadow of a preceding unicode escape.
 *
 * <p>A malformed escape is ignored: the first non-hex byte is pushed
 * back so the main loop can process it. End of stream is never pushed
 * back, because {@code PushbackInputStream.unread(-1)} would insert a
 * spurious 0xFF byte that the tokenizer would then treat as data.
 *
 * @param in stream positioned just after the apostrophe
 */
private void parseHexChar(PushbackInputStream in)
        throws IOException, SAXException, TikaException {
    int hex1 = in.read();
    if (!isHexChar(hex1)) {
        // DOC ERROR (malformed hex escape): ignore
        if (hex1 != -1) {
            in.unread(hex1);
        }
        return;
    }
    int hex2 = in.read();
    if (!isHexChar(hex2)) {
        // TODO: log a warning here, somehow?
        // DOC ERROR (malformed hex escape):
        // ignore
        if (hex2 != -1) {
            in.unread(hex2);
        }
        return;
    }
    if (ansiSkip != 0) {
        // Skip this ansi char since we are
        // still in the shadow of a unicode
        // escape:
        ansiSkip--;
    } else {
        // Unescape:
        addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
    }
}
/**
 * Tokenizes one control word: its letters, an optional minus sign and
 * an optional run of decimal digits forming the parameter. Dispatches
 * to the parameterized or the parameterless handler, then clears the
 * pending control-word buffer.
 *
 * <p>A single space after the control word is consumed as its
 * delimiter. Any other byte is pushed back for the main loop, except
 * end of stream: {@code PushbackInputStream.unread(-1)} would insert a
 * spurious 0xFF byte that would later be emitted as text.
 *
 * <p>NOTE(review): the parameter is accumulated in an {@code int}, so
 * very long digit runs overflow silently.
 *
 * @param firstChar first letter of the control word (already read)
 * @param in        stream positioned just after {@code firstChar}
 */
private void parseControlWord(int firstChar, PushbackInputStream in)
        throws IOException, SAXException, TikaException {
    addControl(firstChar);
    int b = in.read();
    while (isAlpha(b)) {
        addControl(b);
        b = in.read();
    }
    boolean hasParam = false;
    boolean negParam = false;
    if (b == '-') {
        negParam = true;
        hasParam = true;
        b = in.read();
    }
    int param = 0;
    while (isDigit(b)) {
        param *= 10;
        param += (b - '0');
        hasParam = true;
        b = in.read();
    }
    // space is consumed as part of the
    // control word, but is not added to the
    // control word
    if (b != ' ' && b != -1) {
        in.unread(b);
    }
    if (hasParam) {
        if (negParam) {
            param = -param;
        }
        processControlWord(param, in);
    } else {
        processControlWord();
    }
    pendingControlCount = 0;
}
/**
 * Opens a paragraph (or list item) if none is open yet.
 *
 * <p>Does nothing when {@code inParagraph} is set or a {@code p} is
 * already on the tag stack. Otherwise it closes open italic/bold
 * elements, finishes or starts list containers as needed, opens
 * {@code li} or {@code p}, and re-opens bold/italic inside it.
 */
private void lazyStartParagraph() throws IOException, SAXException, TikaException {
    final boolean alreadyOpen = inParagraph || paragraphStack.contains(P);
    if (alreadyOpen) {
        return;
    }
    // Ensure </i></b> order
    if (groupState.italic) {
        end("i");
    }
    if (groupState.bold) {
        end("b");
    }
    // A list that was marked for closing and differs from the current one ends here
    if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
        endList(pendingListEnd);
        pendingListEnd = 0;
    }
    if (inList() && pendingListEnd != groupState.list) {
        startList(groupState.list);
    }
    final String blockTag = inList() ? LI : P;
    start(blockTag);
    pushParagraphTag(blockTag);
    // Ensure <b><i> order
    if (groupState.bold) {
        start("b");
    }
    if (groupState.italic) {
        start("i");
    }
    inParagraph = true;
}
/**
 * Records an opened tag on the paragraph stack, unless the stack has
 * already reached {@code maxStackSize}, in which case the tag is
 * silently dropped.
 *
 * @param tag element name that was just opened
 */
private void pushParagraphTag(String tag) {
    if (paragraphStack.size() >= maxStackSize) {
        //ignore. Something is very, very wrong...
        return;
    }
    paragraphStack.push(tag);
}
/**
 * Ends the current paragraph or list item.
 *
 * <p>Pending text is flushed first; if no paragraph is open, one is
 * started so that consecutive paragraph breaks are kept. Italic and
 * bold are closed, then the tag on top of the stack is closed if it is
 * the expected one ({@code li} inside a list, {@code p} otherwise). On
 * a mismatch every tracked tag is closed.
 *
 * @param preserveStyles when true, active bold/italic are kept and
 *                       re-opened inside a fresh {@code p}; when false,
 *                       they are cleared and a pending list end is
 *                       carried out
 */
private void endParagraph(boolean preserveStyles)
        throws IOException, SAXException, TikaException {
    pushText();
    //maintain consecutive new lines
    if (!inParagraph) {
        lazyStartParagraph();
    }
    if (inParagraph || !paragraphStack.isEmpty()) {
        if (groupState.italic) {
            end("i");
            groupState.italic = preserveStyles;
        }
        if (groupState.bold) {
            end("b");
            groupState.bold = preserveStyles;
        }
        // Inside a list we expect to close an li, otherwise a p
        final String expected = inList() ? LI : P;
        boolean misaligned = false;
        if (!paragraphStack.isEmpty()) {
            final String top = paragraphStack.pop();
            if (expected.equals(top)) {
                end(expected);
            } else {
                // Not the tag we expected: put it back and flag the mismatch
                pushParagraphTag(top);
                misaligned = true;
            }
        }
        //if there was a failure in tag alignment,
        //dump all tags and start fresh.
        if (misaligned) {
            while (!paragraphStack.isEmpty()) {
                end(paragraphStack.pop());
            }
        }
        final boolean reopen = preserveStyles && (groupState.bold || groupState.italic);
        if (reopen) {
            start(P);
            pushParagraphTag(P);
            if (groupState.bold) {
                start("b");
            }
            if (groupState.italic) {
                start("i");
            }
        }
        inParagraph = reopen;
    }
    // Ensure closing the list at document end
    if (pendingListEnd != 0 && !preserveStyles) {
        endList(pendingListEnd);
        pendingListEnd = 0;
    }
}
/**
 * Sends the pending UTF-16 code units to the content handler, opening
 * a paragraph first if necessary. Does nothing when no chars are
 * pending.
 */
private void pushChars() throws IOException, SAXException, TikaException {
    if (pendingCharCount == 0) {
        return;
    }
    lazyStartParagraph();
    out.characters(pendingChars, 0, pendingCharCount);
    pendingCharCount = 0;
}
/**
 * Decodes the bytes buffered in {@code pendingBytes} into UTF-16 code
 * units using the current charset. The decoded chars go to
 * {@code pendingBuffer} while in the header or when
 * {@code fieldState == 1}, and to the content handler otherwise.
 *
 * <p>Nothing is decoded when the group is ignored and no metadata
 * property is being collected; the byte count is reset in every case.
 */
private void pushBytes() throws IOException, SAXException, TikaException {
    final boolean shouldDecode =
            pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null);
    if (shouldDecode) {
        final CharsetDecoder dec = getDecoder();
        pendingByteBuffer.limit(pendingByteCount);
        assert pendingByteBuffer.position() == 0;
        assert outputCharBuffer.position() == 0;

        // Decode phase. endOfInput is true because, when we are called,
        // we should have seen a complete sequence of characters for
        // this charset.
        CoderResult status;
        do {
            status = dec.decode((ByteBuffer) pendingByteBuffer,
                    (CharBuffer) outputCharBuffer, true);
            final int produced = outputCharBuffer.position();
            if (produced > 0) {
                if (inHeader || fieldState == 1) {
                    pendingBuffer.append(outputArray, 0, produced);
                } else {
                    lazyStartParagraph();
                    out.characters(outputArray, 0, produced);
                }
                outputCharBuffer.position(0);
            }
        } while (status != CoderResult.UNDERFLOW);

        // Flush phase: drain whatever the decoder still holds internally
        do {
            status = dec.flush((CharBuffer) outputCharBuffer);
            final int produced = outputCharBuffer.position();
            if (produced > 0) {
                if (inHeader || fieldState == 1) {
                    pendingBuffer.append(outputArray, 0, produced);
                } else {
                    lazyStartParagraph();
                    out.characters(outputArray, 0, produced);
                }
                outputCharBuffer.position(0);
            }
        } while (status != CoderResult.UNDERFLOW);

        // Reset for next decode
        dec.reset();
        pendingByteBuffer.position(0);
    }
    pendingByteCount = 0;
}
/**
 * Compares the control word currently held in {@code pendingControl}
 * with {@code s}.
 *
 * @param s expected control word; must consist of ASCII letters only
 * @return true if the pending control word is exactly {@code s}
 */
private boolean equals(String s) {
    final int expectedLength = s.length();
    if (expectedLength != pendingControlCount) {
        return false;
    }
    int idx = 0;
    while (idx < expectedLength) {
        final char expected = s.charAt(idx);
        assert isAlpha(expected);
        if (pendingControl[idx] != (byte) expected) {
            return false;
        }
        idx++;
    }
    return true;
}
/**
 * Handles a control symbol (a backslash followed by a non-letter).
 * Tilde, hyphen and underscore are translated to their unicode
 * equivalents; every other symbol, including the asterisk that marks
 * an ignorable destination, produces no output here.
 *
 * @param ch the symbol that followed the backslash
 */
private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
    final char replacement;
    switch (ch) {
        case '~':
            // Non-breaking space -> unicode NON-BREAKING SPACE
            replacement = '\u00a0';
            break;
        case '-':
            // Optional hyphen -> unicode SOFT HYPHEN
            replacement = '\u00ad';
            break;
        case '_':
            // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
            replacement = '\u2011';
            break;
        default:
            // Includes '*': ignorable destinations (control words
            // defined after the 1987 RTF spec) are already handled by
            // processGroupStart()
            return;
    }
    addOutputChar(replacement);
}
/**
 * Returns a decoder for the charset currently in use. The decoder from
 * the previous call is reused when the charset has not changed; a new
 * one is configured to replace malformed and unmappable input.
 *
 * @return decoder for the current charset
 * @throws TikaException if the current charset cannot be determined
 */
private CharsetDecoder getDecoder() throws TikaException {
    final Charset current = getCharset();
    // Charset.equals(null) is false, so this also covers the first call
    if (!current.equals(lastCharset)) {
        decoder = current.newDecoder()
                .onMalformedInput(CodingErrorAction.REPLACE)
                .onUnmappableCharacter(CodingErrorAction.REPLACE);
        lastCharset = current;
    }
    return decoder;
}
/**
 * Determines the charset currently in use, in order of precedence:
 * the charset of the font selected in the current group, the charset
 * of the global default font (only once the header is finished), and
 * finally the global charset.
 *
 * @return the charset to decode pending bytes with
 * @throws TikaException if none of the above yields a charset
 */
private Charset getCharset() throws TikaException {
    // A specific font (fN) was set: use its charset
    final Charset fromFont = groupState.fontCharset;
    if (fromFont != null) {
        return fromFont;
    }
    // Global default font (defN) was set and we are past the header
    if (!inHeader && globalDefaultFont != -1) {
        final Charset fromDefaultFont = fontToCharset.get(globalDefaultFont);
        if (fromDefaultFont != null) {
            return fromDefaultFont;
        }
    }
    // Otherwise the global charset
    if (globalCharset != null) {
        return globalCharset;
    }
    throw new TikaException("unable to determine charset");
}
/**
 * Handles a control word that carries a numeric parameter.
 *
 * <p>In the header this records the code page, default font, document
 * statistics, date parts, font-table entries and list-table entries,
 * and detects the end of the header. In the body it handles bold/italic
 * off, font changes and list selection. Unicode escapes, the unicode
 * skip count and binary data are handled in both places.
 *
 * <p>The unicode skip count is clamped to zero or more: callers test
 * {@code ansiSkip != 0} and decrement, so a negative count from a
 * malformed document would make every later ANSI byte be dropped.
 *
 * <p>NOTE(review): for "b" and "i" any parameter is treated as "off";
 * a non-zero parameter only trips the assertion.
 *
 * @param param the (possibly negative) numeric parameter
 * @param in    stream positioned after the control word; read only for
 *              binary data
 */
private void processControlWord(int param, PushbackInputStream in)
        throws IOException, SAXException, TikaException {
    // TODO: afN? (associated font number)
    // TODO: do these alter text output...?
    /*
    } else if (equals("stshfdbch")) {
    // font to be used by default in
    // style sheet for East Asian chars
    // arg N is font table entry
    } else if (equals("stshfloch")) {
    // font to be used by default in
    // style sheet for ASCII chars
    // arg N is font table entry
    } else if (equals("stshfhich")) {
    // font to be used by default in
    // style sheet for High Ansi chars
    // arg N is font table entry
    } else if (equals("stshfbi")) {
    // style sheet for Complex Scripts (BIDI) chars
    // arg N is font table entry
    */
    // TODO: inefficient that we check equals N times;
    // we'd get better perf w/ real lexer (eg
    // JFlex), which uses single-pass FSM to do cmp:
    if (inHeader) {
        if (equals("ansicpg")) {
            // ANSI codepage
            Charset cs = ANSICPG_MAP.get(param);
            if (cs != null) {
                globalCharset = cs;
            }
        } else if (equals("deff")) {
            // Default font
            globalDefaultFont = param;
        } else if (equals("nofpages")) {
            metadata.add(Office.PAGE_COUNT, Integer.toString(param));
        } else if (equals("nofwords")) {
            metadata.add(Office.WORD_COUNT, Integer.toString(param));
        } else if (equals("nofchars")) {
            metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
        } else if (equals("yr")) {
            year = param;
        } else if (equals("mo")) {
            month = param;
        } else if (equals("dy")) {
            day = param;
        } else if (equals("hr")) {
            hour = param;
        } else if (equals("min")) {
            minute = param;
        }
        if (fontTableState == 1) {
            // Still inside font table -- record the
            // mappings of fN to the fcharset:
            if (groupState.depth < fontTableDepth) {
                fontTableState = 2;
            } else {
                if (equals("f")) {
                    // Start new font definition
                    curFontID = param;
                } else if (equals("fcharset")) {
                    Charset cs = FCHARSET_MAP.get(param);
                    if (cs != null) {
                        fontToCharset.put(curFontID, cs);
                    }
                }
            }
        }
        //if you've already seen the font table,
        //you aren't in another header item (e.g. styles)
        //and you see an fX, you're out of the header
        if (fontTableState == 2 && !groupState.ignore && equals("f")) {
            inHeader = false;
        }
        if (currentList != null) {
            if (equals("listid")) {
                currentList.id = param;
                currentListTable.put(currentList.id, currentList);
            } else if (equals("listtemplateid")) {
                currentList.templateID = param;
            } else if (equals("levelnfc") || equals("levelnfcn")) {
                //sanity check to make sure list information isn't corrupt
                if (listTableLevel > -1 && listTableLevel < currentList.numberType.length) {
                    currentList.numberType[listTableLevel] = param;
                }
            }
        }
    } else {
        // In document
        if (equals("b")) {
            // b0
            assert param == 0;
            if (groupState.bold) {
                pushText();
                // Close italic around the bold end tag to keep nesting valid
                if (groupState.italic) {
                    end("i");
                }
                end("b");
                if (groupState.italic) {
                    start("i");
                }
                groupState.bold = false;
            }
        } else if (equals("i")) {
            // i0
            assert param == 0;
            if (groupState.italic) {
                pushText();
                end("i");
                groupState.italic = false;
            }
        } else if (equals("f")) {
            // Change current font
            Charset fontCharset = fontToCharset.get(param);
            // Push any buffered text before changing
            // font:
            pushText();
            if (fontCharset != null) {
                groupState.fontCharset = fontCharset;
            } else {
                // DOC ERROR: font change referenced a
                // non-table'd font number
                // TODO: log a warning? Throw an exc?
                groupState.fontCharset = null;
            }
        } else if (equals("ls")) {
            groupState.list = param;
        } else if (equals("lslvl")) {
            groupState.listLevel = param;
        }
    }
    // Process unicode escape. This can appear in doc
    // or in header, since the metadata (info) fields
    // in the header can be unicode escaped as well:
    if (equals("u")) {
        // Unicode escape
        if (!groupState.ignore || groupState.sv || groupState.sn) {
            final char utf16CodeUnit = (char) (param & 0xffff);
            addOutputChar(utf16CodeUnit);
        }
        // After seeing a unicode escape we must
        // skip the next ucSkip ansi chars (the
        // "unicode shadow")
        ansiSkip = groupState.ucSkip;
    } else if (equals("uc")) {
        // Change unicode shadow length. A negative count is invalid and
        // would never count down to zero, so treat it as "skip nothing".
        groupState.ucSkip = Math.max(0, param);
    } else if (equals("bin")) {
        if (param >= 0) {
            if (groupState.pictDepth == 1) {
                try {
                    embObjHandler.writeBytes(in, param);
                } catch (IOException | TikaException e) {
                    // Record the failure and drop the partial embedded object
                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                    embObjHandler.reset();
                }
            } else {
                // Binary data outside a picture: skip over it
                IOUtils.skipFully(in, param);
            }
        } else {
            // log some warning?
        }
    }
}
/**
 * Tells whether list markup should be emitted for the current group:
 * list markup is not being ignored and the group has a non-zero list
 * id.
 */
private boolean inList() {
    if (ignoreListMarkup) {
        return false;
    }
    return groupState.list != 0;
}
/**
 * Defers closing of the current list: remembers its id as the list
 * pending to end and clears the group's list id. Deferring allows
 * list items of the same list to be merged within one enclosing list
 * tag (ie. either <code>"ul"</code>, or <code>"ol"</code>).
 */
private void pendingListEnd() {
    final int closingListId = groupState.list;
    groupState.list = 0;
    pendingListEnd = closingListId;
}
/**
 * Emits the end tag of a list unless list markup is ignored. Uses
 * {@link #isUnorderedList(int)} to choose between <code>"ul"</code>
 * and <code>"ol"</code>. The top of the tag stack is popped, and the
 * end tag is only emitted if that popped entry is the expected list
 * tag; with an empty stack nothing happens.
 *
 * @param listID The ID of the list.
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
private void endList(int listID) throws IOException, SAXException, TikaException {
    if (ignoreListMarkup) {
        return;
    }
    final String listTag = isUnorderedList(listID) ? UL : OL;
    if (paragraphStack.isEmpty()) {
        //stack was empty, the list was never started
        return;
    }
    final String top = paragraphStack.pop();
    if (listTag.equals(top)) {
        end(listTag);
    }
}
/**
 * Writes the opening tag of a list and records it as the current paragraph tag, unless
 * list markup is being ignored. The tag name is chosen through
 * {@link #isUnorderedList(int)} for the given <code>listID</code>.
 *
 * @param listID The ID of the list.
 * @throws IOException
 * @throws SAXException
 * @throws TikaException
 */
private void startList(int listID) throws IOException, SAXException, TikaException {
    if (ignoreListMarkup) {
        return;
    }
    final String listTag;
    if (isUnorderedList(listID)) {
        listTag = UL;
    } else {
        listTag = OL;
    }
    start(listTag);
    pushParagraphTag(listTag);
}
/**
 * Decides whether the list with the given ID is unordered at the current group's list
 * level.
 *
 * @param listID The ID of the list to look up in the list table.
 * @return {@code true} when no descriptor is registered for <code>listID</code>;
 *         otherwise whatever the descriptor reports for the current list level
 */
private boolean isUnorderedList(int listID) {
    final ListDescriptor descriptor = listTable.get(listID);
    // Lists without a table entry are treated as unordered.
    return descriptor == null || descriptor.isUnordered(groupState.listLevel);
}
/**
 * Closes the element named <code>tag</code> on the output handler.
 *
 * @param tag name of the element to end
 */
private void end(final String tag) throws IOException, SAXException, TikaException {
    out.endElement(tag);
}
/**
 * Opens the element named <code>tag</code> on the output handler.
 *
 * @param tag name of the element to start
 */
private void start(final String tag) throws IOException, SAXException, TikaException {
    out.startElement(tag);
}
/**
 * Handles a control word that carries no numeric parameter. The name of the pending
 * control word is tested through the equals(String) helper.
 * <p>
 * Processing happens in two stages:
 * <ol>
 * <li>While <code>inHeader</code> is set, the word is checked against header keywords
 * (global charset, skipped tables, list tables, metadata fields, font table tracking,
 * list table entries) and against the words that end the header. Otherwise only the
 * bold and italic switches are handled.</li>
 * <li>Regardless of <code>inHeader</code>, the word then runs through one long
 * if/else chain covering style resets, paragraph and cell ends, annotations, shapes,
 * embedded objects and pictures, special characters, the upr destination and
 * hyperlink fields.</li>
 * </ol>
 * Note that a word which clears <code>inHeader</code> in stage 1 (for example
 * <code>par</code>) is still processed by stage 2 in the same call.
 *
 * @throws IOException declared by helpers invoked here, such as end and start
 * @throws SAXException declared by helpers invoked here, such as end and start
 * @throws TikaException declared by helpers invoked here, such as end and start
 */
private void processControlWord() throws IOException, SAXException, TikaException {
if (inHeader) {
// Character set keywords select the document-wide charset.
if (equals("ansi")) {
globalCharset = WINDOWS_1252;
} else if (equals("pca")) {
globalCharset = CP850;
} else if (equals("pc")) {
globalCharset = CP437;
} else if (equals("mac")) {
globalCharset = MAC_ROMAN;
}
// Color, style and font tables are marked as ignored; the two list tables
// select which table subsequent list entries are associated with.
if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
groupState.ignore = true;
} else if (equals("listtable")) {
currentListTable = listTable;
} else if (equals("listoverridetable")) {
currentListTable = listOverrideTable;
}
// Metadata keywords are only recognized while uprState is -1. The matching
// property is remembered in nextMetaData.
if (uprState == -1) {
// TODO: we can also parse \creatim, \revtim,
// \printim, \version, etc.
if (equals("author")) {
nextMetaData = TikaCoreProperties.CREATOR;
} else if (equals("title")) {
nextMetaData = TikaCoreProperties.TITLE;
} else if (equals("subject")) {
nextMetaData = OfficeOpenXMLCore.SUBJECT;
} else if (equals("keywords")) {
nextMetaData = Office.KEYWORDS;
} else if (equals("category")) {
nextMetaData = OfficeOpenXMLCore.CATEGORY;
} else if (equals("comment")) {
nextMetaData = TikaCoreProperties.COMMENTS;
} else if (equals("company")) {
nextMetaData = OfficeOpenXMLExtended.COMPANY;
} else if (equals("manager")) {
nextMetaData = OfficeOpenXMLExtended.MANAGER;
} else if (equals("template")) {
nextMetaData = OfficeOpenXMLExtended.TEMPLATE;
} else if (equals("creatim")) {
nextMetaData = TikaCoreProperties.CREATED;
}
}
// Font table tracking: state 0 = not seen yet, 1 = inside the table,
// 2 = entered once the group depth drops below the depth recorded at "fonttbl".
if (fontTableState == 0) {
// Didn't see font table yet
if (equals("fonttbl")) {
fontTableState = 1;
fontTableDepth = groupState.depth;
}
} else if (fontTableState == 1) {
// Inside font table
if (groupState.depth < fontTableDepth) {
fontTableState = 2;
}
}
// List table handling
if (currentListTable != null) {
if (equals("list") || equals("listoverride")) {
// A new entry starts; the level counter restarts at -1 so that the
// first "listlevel" brings it to 0.
currentList = new ListDescriptor();
listTableLevel = -1;
} else if (currentList != null) {
if (equals("liststylename")) {
currentList.isStyle = true;
} else if (equals("listlevel")) {
listTableLevel++;
}
}
}
// Any of these words, seen in a group that is not ignored, ends the header.
if (!groupState.ignore &&
(equals("par") || equals("pard") || equals("sect") || equals("sectd") ||
equals("plain") || equals("ltrch") || equals("rtlch") ||
equals("htmlrtf") || equals("line"))) {
inHeader = false;
}
} else {
// Outside the header: switch bold / italic on (only if not already on).
if (equals("b")) {
if (!groupState.bold) {
pushText();
lazyStartParagraph();
if (groupState.italic) {
// Make sure nesting is always <b><i>
end("i");
}
groupState.bold = true;
start("b");
if (groupState.italic) {
start("i");
}
}
} else if (equals("i")) {
if (!groupState.italic) {
pushText();
lazyStartParagraph();
groupState.italic = true;
start("i");
}
}
}
// Snapshot of the ignore flag, taken before any branch below may change it.
final boolean ignored = groupState.ignore;
if (equals("pard")) {
// Reset styles
pushText();
// Italic is closed before bold, matching the <b><i> nesting used above.
if (groupState.italic) {
end("i");
groupState.italic = false;
}
if (groupState.bold) {
end("b");
groupState.bold = false;
}
if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
pendingListEnd();
}
} else if (equals("plain")) {
if (groupState.italic || groupState.bold) {
// Reset styles
pushText();
if (groupState.italic) {
end("i");
groupState.italic = false;
}
if (groupState.bold) {
end("b");
groupState.bold = false;
}
}
} else if (equals("par")) {
if (!ignored) {
endParagraph(true);
if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
pendingListEnd();
}
}
} else if (equals("shptxt")) {
pushText();
// Text inside a shape
groupState.ignore = false;
} else if (equals("chatn")) {
// The annotation-related words below each emit a separating space, push the
// buffered text and stop ignoring the current group.
addOutputChar(SPACE);
pushText();
// Annotation ID
groupState.ignore = false;
} else if (equals("atnid")) {
addOutputChar(SPACE);
pushText();
// Annotation ID
groupState.ignore = false;
} else if (equals("atnauthor")) {
addOutputChar(SPACE);
pushText();
// Annotation author
groupState.ignore = false;
} else if (equals("annotation")) {
// The flag is read again when the group ends.
groupState.annotation = true;
pushText();
// Annotation
groupState.ignore = false;
} else if (equals("listtext")) {
groupState.ignore = true;
} else if (equals("cell")) {
// TODO: we should produce a table output here?
//addOutputChar(' ');
endParagraph(true);
} else if (equals("sp")) {
groupState.sp = true;
} else if (equals("sn")) {
embObjHandler.startSN();
groupState.sn = true;
} else if (equals("sv")) {
embObjHandler.startSV();
groupState.sv = true;
} else if (equals("object")) {
pushText();
embObjHandler.setInObject(true);
groupState.object = true;
} else if (equals("objdata")) {
groupState.objdata = true;
embObjHandler.startObjData();
} else if (equals("pict")) {
pushText();
// TODO: create img tag? but can that support
// embedded image data?
groupState.pictDepth = 1;
embObjHandler.startPict();
} else if (equals("line")) {
// Break-like words: emitted as a newline, space or tab unless the group
// was ignored when this call started.
if (!ignored) {
addOutputChar('\n');
}
} else if (equals("column")) {
if (!ignored) {
addOutputChar(' ');
}
} else if (equals("page")) {
if (!ignored) {
addOutputChar('\n');
}
} else if (equals("softline")) {
if (!ignored) {
addOutputChar('\n');
}
} else if (equals("softcolumn")) {
if (!ignored) {
addOutputChar(' ');
}
} else if (equals("softpage")) {
if (!ignored) {
addOutputChar('\n');
}
} else if (equals("tab")) {
if (!ignored) {
addOutputChar('\t');
}
} else if (equals("upr")) {
// Start of a upr destination; the next group start moves uprState from 0 to 1.
uprState = 0;
} else if (equals("ud") && uprState == 1) {
uprState = -1;
// 2nd group inside the upr destination, which
// contains the unicode encoding of the text, so
// we want to keep that:
groupState.ignore = false;
} else if (equals("bullet")) {
// Named special characters: each maps to one Unicode character.
if (!ignored) {
// unicode BULLET
addOutputChar('\u2022');
}
} else if (equals("endash")) {
if (!ignored) {
// unicode EN DASH
addOutputChar('\u2013');
}
} else if (equals("emdash")) {
if (!ignored) {
// unicode EM DASH
addOutputChar('\u2014');
}
} else if (equals("enspace")) {
if (!ignored) {
// unicode EN SPACE
addOutputChar('\u2002');
}
} else if (equals("qmspace")) {
if (!ignored) {
// quarter em space -> unicode FOUR-PER-EM SPACE
addOutputChar('\u2005');
}
} else if (equals("emspace")) {
if (!ignored) {
// unicode EM SPACE
addOutputChar('\u2003');
}
} else if (equals("lquote")) {
if (!ignored) {
// unicode LEFT SINGLE QUOTATION MARK
addOutputChar('\u2018');
}
} else if (equals("rquote")) {
if (!ignored) {
// unicode RIGHT SINGLE QUOTATION MARK
addOutputChar('\u2019');
}
} else if (equals("ldblquote")) {
if (!ignored) {
// unicode LEFT DOUBLE QUOTATION MARK
addOutputChar('\u201C');
}
} else if (equals("rdblquote")) {
if (!ignored) {
// unicode RIGHT DOUBLE QUOTATION MARK
addOutputChar('\u201D');
}
} else if (equals("fldinst")) {
// Field instruction: its text is collected (group un-ignored) and is
// evaluated when the group ends (fieldState 1).
fieldState = 1;
groupState.ignore = false;
} else if (equals("fldrslt") && fieldState == 2) {
// Field result following a recognized HYPERLINK instruction (fieldState 2):
// open the anchor now; fieldState 3 marks it as open so a later group end
// closes it.
assert pendingURL != null;
lazyStartParagraph();
out.startElement("a", "href", pendingURL);
pendingURL = null;
fieldState = 3;
groupState.ignore = false;
}
}
/**
 * Opens a new group: the current {@code groupState} is saved on {@code groupStates} and
 * processing continues with a fresh state created by {@code new GroupState(groupState)}.
 * <p>
 * Up to two bytes are then peeked to detect an ignorable group, i.e. one whose content
 * starts with a backslash followed by an asterisk. Every byte that was actually read is
 * pushed back, so the caller sees the stream unchanged.
 *
 * @param in stream to peek at; it must allow at least two bytes of pushback
 * @throws IOException if reading from or pushing back to {@code in} fails
 */
private void processGroupStart(PushbackInputStream in) throws IOException {
    // A pending "unicode shadow" skip does not carry over into the new group.
    ansiSkip = 0;

    // Push current groupState onto the stack
    groupStates.add(groupState);

    // Make new GroupState
    groupState = new GroupState(groupState);
    assert groupStates.size() == groupState.depth :
            "size=" + groupStates.size() + " depth=" + groupState.depth;

    // First group after an upr control word (uprState 0): mark it ignored and
    // advance the state so that only this one group is affected.
    if (uprState == 0) {
        uprState = 1;
        groupState.ignore = true;
    }

    // Check for ignorable groups. Note that
    // sometimes we un-ignore within this group, eg
    // when handling upr escape.
    final int b2 = in.read();
    if (b2 == '\\') {
        final int b3 = in.read();
        if (b3 == '*') {
            groupState.ignore = true;
        }
        // Only push back real bytes: unread(-1) would store the byte 0xFF, turning
        // end-of-stream into a bogus data byte for the next read.
        if (b3 != -1) {
            in.unread(b3);
        }
    }
    if (b2 != -1) {
        in.unread(b2);
    }
}
/**
 * Closes the current group and restores the enclosing group's state.
 * <p>
 * In order, this method:
 * <ol>
 * <li>while in the header, stores a pending metadata value (the creation date, or the
 * text collected in {@code pendingBuffer}) and clears the buffer;</li>
 * <li>notifies the embedded object handler about finished object data, picture
 * properties or pictures;</li>
 * <li>emits a trailing space after an annotation group and leaves "in object" mode for
 * an object group;</li>
 * <li>pops the saved group state, closing or re-opening the bold and italic elements so
 * that the output matches the restored state;</li>
 * <li>advances the hyperlink field state machine: a collected HYPERLINK instruction
 * becomes {@code pendingURL}, and an open anchor is closed.</li>
 * </ol>
 *
 * @throws IOException declared by helpers invoked here, such as end and start
 * @throws SAXException declared by helpers invoked here, such as end and start
 * @throws TikaException declared by helpers invoked here, such as end and start
 */
private void processGroupEnd() throws IOException, SAXException, TikaException {
    if (inHeader) {
        if (nextMetaData != null) {
            if (nextMetaData == TikaCoreProperties.CREATED) {
                Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT);
                cal.set(year, month - 1, day, hour, minute, 0);
                // getInstance() starts from the current time and set(...) above keeps
                // fields it does not name, so zero the milliseconds explicitly;
                // otherwise the stored date would carry arbitrary wall-clock millis.
                cal.set(Calendar.MILLISECOND, 0);
                metadata.set(nextMetaData, cal.getTime());
            } else if (nextMetaData.isMultiValuePermitted()) {
                metadata.add(nextMetaData, pendingBuffer.toString());
            } else {
                metadata.set(nextMetaData, pendingBuffer.toString());
            }
            nextMetaData = null;
        }
        pendingBuffer.setLength(0);
    }

    // NOTE(review): with assertions enabled this appears to fire for a surplus closing
    // brace before the robustness check further down can apply - confirm intent.
    assert groupState.depth > 0;
    ansiSkip = 0;

    if (groupState.objdata) {
        // Failures while handling embedded object data are recorded in the metadata
        // rather than aborting the parse.
        try {
            embObjHandler.handleCompletedObject();
        } catch (TikaException | IOException e) {
            EmbeddedDocumentUtil.recordException(e, metadata);
        }
        groupState.objdata = false;
    } else if (groupState.pictDepth > 0) {
        // Inside a picture: only the first matching flag is handled.
        if (groupState.sn) {
            embObjHandler.endSN();
        } else if (groupState.sv) {
            embObjHandler.endSV();
        } else if (groupState.sp) {
            embObjHandler.endSP();
        } else if (groupState.pictDepth == 1) {
            embObjHandler.handleCompletedObject();
        }
    }

    if (groupState.annotation) {
        addOutputChar(SPACE);
    }
    if (groupState.object) {
        embObjHandler.setInObject(false);
    }

    // Be robust if RTF doc is corrupt (has too many
    // closing }s):
    // TODO: log a warning?
    if (groupStates.size() > 0) {
        // Restore group state:
        final GroupState outerGroupState = groupStates.removeLast();

        // Close italic, if outer does not have italic or
        // bold changed:
        if (groupState.italic) {
            if (!outerGroupState.italic || groupState.bold != outerGroupState.bold) {
                end("i");
                // Cleared so the "Open italic" step below can re-open the element
                // when the outer group is italic.
                groupState.italic = false;
            }
        }

        // Close bold
        if (groupState.bold && !outerGroupState.bold) {
            end("b");
        }

        // Open bold
        if (!groupState.bold && outerGroupState.bold) {
            start("b");
        }

        // Open italic
        if (!groupState.italic && outerGroupState.italic) {
            start("i");
        }
        groupState = outerGroupState;
    }
    assert groupStates.size() == groupState.depth;

    if (fieldState == 1) {
        // A field instruction group just ended: inspect its collected text.
        String s = pendingBuffer.toString().trim();
        pendingBuffer.setLength(0);
        if (s.startsWith("HYPERLINK")) {
            // 9 is the length of "HYPERLINK"
            s = s.substring(9).trim();
            // TODO: what other instructions can be in a
            // HYPERLINK destination?
            final boolean isLocalLink = s.contains("\\l ");
            // Use the first double-quoted section as the target, if there is one;
            // otherwise the whole remaining instruction text is used.
            int idx = s.indexOf('"');
            if (idx != -1) {
                int idx2 = s.indexOf('"', 1 + idx);
                if (idx2 != -1) {
                    s = s.substring(1 + idx, idx2);
                }
            }
            pendingURL = (isLocalLink ? "#" : "") + s;
            fieldState = 2;
        } else {
            fieldState = 0;
        }

        // TODO: we could process the other known field
        // types. Right now, we will extract their text
        // inlined, but fail to record them in metadata
        // as a field value.
    } else if (fieldState == 3) {
        // The group holding the hyperlink's result text ended: close the anchor.
        end("a");
        fieldState = 0;
    }
}
}