| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.tika.parser.microsoft.rtf; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.PushbackInputStream; |
| import java.nio.Buffer; |
| import java.nio.ByteBuffer; |
| import java.nio.CharBuffer; |
| import java.nio.charset.Charset; |
| import java.nio.charset.CharsetDecoder; |
| import java.nio.charset.CoderResult; |
| import java.nio.charset.CodingErrorAction; |
| import java.util.Calendar; |
| import java.util.HashMap; |
| import java.util.LinkedList; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Stack; |
| import java.util.TimeZone; |
| |
| import org.apache.commons.io.IOUtils; |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.extractor.EmbeddedDocumentUtil; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.Office; |
| import org.apache.tika.metadata.OfficeOpenXMLCore; |
| import org.apache.tika.metadata.OfficeOpenXMLExtended; |
| import org.apache.tika.metadata.Property; |
| import org.apache.tika.metadata.TikaCoreProperties; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| import org.apache.tika.utils.CharsetUtils; |
| |
| /* Tokenizes and performs a "shallow" parse of the RTF |
| * document, just enough to properly decode the text. |
| * |
| * TODO: we should cutover to a "real" tokenizer (eg JFlex); |
| * it should give better perf, by replacing the excessive |
| * "else if" string compares with FSA traversal. */ |
| |
| final class TextExtractor { |
| |
| private static final char SPACE = ' '; |
| // XHTML element names; these are the tags tracked on paragraphStack: |
| private static final String P = "p"; |
| private static final String LI = "li"; |
| private static final String OL = "ol"; |
| private static final String UL = "ul"; |
| |
| // Fallback charset: getCharset(String) returns this when CharsetUtils.forName |
| // rejects a name, so it must be declared before the getCharset(...) calls below. |
| private static final Charset ASCII = Charset.forName("US-ASCII"); |
| // Charsets referenced by the two lookup maps further down. Constants whose name |
| // ends in an extra "2" (e.g. CP12582, MS9362, CP4372) look up the same code page |
| // as their unsuffixed counterpart; only the case of the lookup string differs. |
| private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252"); |
| private static final Charset MAC_ROMAN = getCharset("MacRoman"); |
| private static final Charset SHIFT_JIS = getCharset("Shift_JIS"); |
| private static final Charset WINDOWS_57011 = getCharset("windows-57011"); |
| private static final Charset WINDOWS_57010 = getCharset("windows-57010"); |
| private static final Charset WINDOWS_57009 = getCharset("windows-57009"); |
| private static final Charset WINDOWS_57008 = getCharset("windows-57008"); |
| private static final Charset WINDOWS_57007 = getCharset("windows-57007"); |
| private static final Charset WINDOWS_57006 = getCharset("windows-57006"); |
| private static final Charset WINDOWS_57005 = getCharset("windows-57005"); |
| private static final Charset WINDOWS_57004 = getCharset("windows-57004"); |
| private static final Charset WINDOWS_57003 = getCharset("windows-57003"); |
| private static final Charset X_ISCII91 = getCharset("x-ISCII91"); |
| private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope"); |
| private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic"); |
| private static final Charset X_JOHAB = getCharset("x-Johab"); |
| private static final Charset CP12582 = getCharset("CP1258"); |
| private static final Charset CP12572 = getCharset("CP1257"); |
| private static final Charset CP12562 = getCharset("CP1256"); |
| private static final Charset CP12552 = getCharset("CP1255"); |
| private static final Charset CP12542 = getCharset("CP1254"); |
| private static final Charset CP12532 = getCharset("CP1253"); |
| private static final Charset CP1252 = getCharset("CP1252"); |
| private static final Charset CP12512 = getCharset("CP1251"); |
| private static final Charset CP12502 = getCharset("CP1250"); |
| private static final Charset CP950 = getCharset("CP950"); |
| private static final Charset CP949 = getCharset("CP949"); |
| private static final Charset MS9362 = getCharset("MS936"); |
| private static final Charset MS8742 = getCharset("MS874"); |
| private static final Charset CP866 = getCharset("CP866"); |
| private static final Charset CP865 = getCharset("CP865"); |
| private static final Charset CP864 = getCharset("CP864"); |
| private static final Charset CP863 = getCharset("CP863"); |
| private static final Charset CP862 = getCharset("CP862"); |
| private static final Charset CP860 = getCharset("CP860"); |
| private static final Charset CP852 = getCharset("CP852"); |
| private static final Charset CP8502 = getCharset("CP850"); |
| private static final Charset CP819 = getCharset("CP819"); |
| private static final Charset WINDOWS_720 = getCharset("windows-720"); |
| private static final Charset WINDOWS_711 = getCharset("windows-711"); |
| private static final Charset WINDOWS_710 = getCharset("windows-710"); |
| private static final Charset WINDOWS_709 = getCharset("windows-709"); |
| private static final Charset ISO_8859_6 = getCharset("ISO-8859-6"); |
| private static final Charset CP4372 = getCharset("CP437"); |
| private static final Charset CP850 = getCharset("cp850"); |
| private static final Charset CP437 = getCharset("cp437"); |
| private static final Charset MS874 = getCharset("ms874"); |
| private static final Charset CP1257 = getCharset("cp1257"); |
| private static final Charset CP1256 = getCharset("cp1256"); |
| private static final Charset CP1255 = getCharset("cp1255"); |
| private static final Charset CP1258 = getCharset("cp1258"); |
| private static final Charset CP1254 = getCharset("cp1254"); |
| private static final Charset CP1253 = getCharset("cp1253"); |
| private static final Charset MS950 = getCharset("ms950"); |
| private static final Charset MS936 = getCharset("ms936"); |
| private static final Charset MS1361 = getCharset("ms1361"); |
| private static final Charset MS932 = getCharset("MS932"); |
| private static final Charset CP1251 = getCharset("cp1251"); |
| private static final Charset CP1250 = getCharset("cp1250"); |
| private static final Charset MAC_THAI = getCharset("MacThai"); |
| private static final Charset MAC_TURKISH = getCharset("MacTurkish"); |
| private static final Charset MAC_GREEK = getCharset("MacGreek"); |
| private static final Charset MAC_ARABIC = getCharset("MacArabic"); |
| private static final Charset MAC_HEBREW = getCharset("MacHebrew"); |
| private static final Charset JOHAB = getCharset("johab"); |
| private static final Charset BIG5 = getCharset("Big5"); |
| private static final Charset GB2312 = getCharset("GB2312"); |
| private static final Charset MS949 = getCharset("ms949"); |
| // The RTF doc has a "font table" that assigns ordinals |
| // (f0, f1, f2, etc.) to fonts and charsets, using the |
| // \fcharsetN control word. This map goes from that |
| // N to the corresponding Java charset: |
| private static final Map<Integer, Charset> FCHARSET_MAP = new HashMap<>(); |
| // The RTF may specify the \ansicpgN charset in the |
| // header; this map goes from that N to the corresponding |
| // Java character set: |
| private static final Map<Integer, Charset> ANSICPG_MAP = new HashMap<>(); |
| |
| // Registers the \fcharsetN -> Java charset entries of FCHARSET_MAP. Values of N |
| // that are not put here (including the commented-out ones) have no entry. |
| static { |
| FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI |
| // charset 1 is Default |
| // charset 2 is Symbol |
| |
| FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman |
| FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis |
| FCHARSET_MAP.put(79, MS949); // Mac Hangul |
| FCHARSET_MAP.put(80, GB2312); // Mac GB2312 |
| FCHARSET_MAP.put(81, BIG5); // Mac Big5 |
| FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old) |
| FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew |
| FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic |
| FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek |
| FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish |
| FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai |
| FCHARSET_MAP.put(88, CP1250); // Mac East Europe |
| FCHARSET_MAP.put(89, CP1251); // Mac Russian |
| |
| FCHARSET_MAP.put(128, MS932); // Shift JIS |
| FCHARSET_MAP.put(129, MS949); // Hangul |
| FCHARSET_MAP.put(130, MS1361); // Johab |
| FCHARSET_MAP.put(134, MS936); // GB2312 |
| FCHARSET_MAP.put(136, MS950); // Big5 |
| FCHARSET_MAP.put(161, CP1253); // Greek |
| FCHARSET_MAP.put(162, CP1254); // Turkish |
| FCHARSET_MAP.put(163, CP1258); // Vietnamese |
| FCHARSET_MAP.put(177, CP1255); // Hebrew |
| FCHARSET_MAP.put(178, CP1256); // Arabic |
| // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional |
| // FCHARSET_MAP.put( 180, "" ); // Arabic user |
| // FCHARSET_MAP.put( 181, "" ); // Hebrew user |
| FCHARSET_MAP.put(186, CP1257); // Baltic |
| |
| FCHARSET_MAP.put(204, CP1251); // Russian |
| FCHARSET_MAP.put(222, MS874); // Thai |
| FCHARSET_MAP.put(238, CP1250); // Eastern European |
| FCHARSET_MAP.put(254, CP437); // PC 437 |
| FCHARSET_MAP.put(255, CP850); // OEM |
| } |
| |
// Registers the \ansicpgN -> Java charset entries of ANSICPG_MAP. Every code
// page is registered exactly once, under its own number.
static {
    ANSICPG_MAP.put(437, CP4372); // US IBM
    ANSICPG_MAP.put(708, ISO_8859_6); // Arabic (ASMO 708)

    ANSICPG_MAP.put(709, WINDOWS_709); // Arabic (ASMO 449+, BCON V4)
    ANSICPG_MAP.put(710, WINDOWS_710); // Arabic (transparent Arabic)
    // 711 and 720 must use their own keys; registering them under 710 would
    // overwrite the 710 entry and leave 711/720 without any mapping.
    ANSICPG_MAP.put(711, WINDOWS_711); // Arabic (Nafitha Enhanced)
    ANSICPG_MAP.put(720, WINDOWS_720); // Arabic (transparent ASMO)
    ANSICPG_MAP.put(819, CP819); // Windows 3.1 (US & Western Europe)

    ANSICPG_MAP.put(850, CP8502); // IBM Multilingual
    ANSICPG_MAP.put(852, CP852); // Eastern European
    ANSICPG_MAP.put(860, CP860); // Portuguese
    ANSICPG_MAP.put(862, CP862); // Hebrew
    ANSICPG_MAP.put(863, CP863); // French Canadian
    ANSICPG_MAP.put(864, CP864); // Arabic
    ANSICPG_MAP.put(865, CP865); // Norwegian
    ANSICPG_MAP.put(866, CP866); // Soviet Union
    ANSICPG_MAP.put(874, MS8742); // Thai
    ANSICPG_MAP.put(932, MS932); // Japanese
    ANSICPG_MAP.put(936, MS9362); // Simplified Chinese
    ANSICPG_MAP.put(949, CP949); // Korean
    ANSICPG_MAP.put(950, CP950); // Traditional Chinese
    ANSICPG_MAP.put(1250, CP12502); // Eastern European
    ANSICPG_MAP.put(1251, CP12512); // Cyrillic
    ANSICPG_MAP.put(1252, CP1252); // Western European
    ANSICPG_MAP.put(1253, CP12532); // Greek
    ANSICPG_MAP.put(1254, CP12542); // Turkish
    ANSICPG_MAP.put(1255, CP12552); // Hebrew
    ANSICPG_MAP.put(1256, CP12562); // Arabic
    ANSICPG_MAP.put(1257, CP12572); // Baltic
    ANSICPG_MAP.put(1258, CP12582); // Vietnamese
    ANSICPG_MAP.put(1361, X_JOHAB); // Johab
    ANSICPG_MAP.put(10000, MAC_ROMAN); // Mac Roman
    ANSICPG_MAP.put(10001, SHIFT_JIS); // Mac Japan
    ANSICPG_MAP.put(10004, MAC_ARABIC); // Mac Arabic
    ANSICPG_MAP.put(10005, MAC_HEBREW); // Mac Hebrew
    ANSICPG_MAP.put(10006, MAC_GREEK); // Mac Greek
    ANSICPG_MAP.put(10007, MAC_CYRILLIC); // Mac Cyrillic
    ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE); // MAC Latin2
    ANSICPG_MAP.put(10081, MAC_TURKISH); // Mac Turkish
    ANSICPG_MAP.put(57002, X_ISCII91); // Devanagari

    // TODO: in theory these other charsets are simple
    // shifts off of Devanagari, so we could impl that
    // here:
    ANSICPG_MAP.put(57003, WINDOWS_57003); // Bengali
    ANSICPG_MAP.put(57004, WINDOWS_57004); // Tamil
    ANSICPG_MAP.put(57005, WINDOWS_57005); // Telugu
    ANSICPG_MAP.put(57006, WINDOWS_57006); // Assamese
    ANSICPG_MAP.put(57007, WINDOWS_57007); // Oriya
    ANSICPG_MAP.put(57008, WINDOWS_57008); // Kannada
    ANSICPG_MAP.put(57009, WINDOWS_57009); // Malayalam
    ANSICPG_MAP.put(57010, WINDOWS_57010); // Gujarati
    ANSICPG_MAP.put(57011, WINDOWS_57011); // Punjabi
}
| |
| // Scratch output used when we decode bytes -> chars with a CharsetDecoder; |
| // outputCharBuffer is a view over outputArray: |
| private final char[] outputArray = new char[128]; |
| private final Buffer outputCharBuffer = CharBuffer.wrap(outputArray); |
| // Holds the font table from this RTF doc, mapping |
| // the font number (from \fN control word) to the |
| // corresponding charset: |
| private final Map<Integer, Charset> fontToCharset = new HashMap<>(); |
| // Group stack: when we open a new group, we push |
| // the previous group state onto the stack; when we |
| // close the group, we restore it |
| private final LinkedList<GroupState> groupStates = new LinkedList<>(); |
| // Receives decoded text while inHeader is true or fieldState == 1, |
| // instead of that text being sent to the content handler: |
| private final StringBuilder pendingBuffer = new StringBuilder(); |
| // Collaborators supplied through the constructor: |
| private final XHTMLContentHandler out; |
| private final Metadata metadata; |
| private final RTFEmbObjHandler embObjHandler; |
| // How many upcoming ansi chars we should skip; this |
| // is 0 except when we are still in the "ansi |
| // shadow" after seeing a unicode escape, at which |
| // point it's set to the last ucN skip we had seen: |
| int ansiSkip = 0; |
| private int written = 0; |
| // Holds pending bytes (encoded in the current charset) |
| // for text output; pendingByteBuffer wraps pendingBytes and is |
| // re-wrapped whenever the array is replaced by a larger one: |
| private byte[] pendingBytes = new byte[16]; |
| private int pendingByteCount; |
| private Buffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes); |
| // Holds pending chars for text output |
| private char[] pendingChars = new char[10]; |
| private int pendingCharCount; |
| // Holds the bytes of a still-being-tokenized control word |
| private byte[] pendingControl = new byte[10]; |
| private int pendingControlCount; |
| // Decoder and the charset it was created for; reused when possible: |
| private CharsetDecoder decoder; |
| private Charset lastCharset; |
| private Charset globalCharset = WINDOWS_1252; |
| private int globalDefaultFont = -1; |
| private int curFontID = -1; |
| // Current group state; in theory this initial |
| // GroupState is unused because the RTF doc should |
| // immediately open the top group (start with {): |
| private GroupState groupState = new GroupState(); |
| private boolean inHeader = true; |
| // 0 = not yet in font table, 1 = in font table, 2 = font table has been processed |
| private int fontTableState = 0; |
| // Group depth at which the font table started |
| private int fontTableDepth; |
| // Non null if we are processing metadata (title, |
| // keywords, etc.) inside the info group: |
| private Property nextMetaData; |
| private boolean inParagraph; |
| // Non-zero if we are processing inside a field destination: |
| private int fieldState; |
| // Non-zero when a list is still waiting to be closed with endList(...); |
| // the value is the argument that will be passed to endList: |
| private int pendingListEnd; |
| private Map<Integer, ListDescriptor> listTable = new HashMap<>(); |
| private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<>(); |
| private Map<Integer, ListDescriptor> currentListTable; |
| private ListDescriptor currentList; |
| private int listTableLevel = -1; |
| private boolean ignoreListMarkup; |
| // Non-null if we've seen the url for a HYPERLINK but not yet |
| // its text: |
| private String pendingURL; |
| // Used to process the sub-groups inside the upr |
| // group: |
| private int uprState = -1; |
| // Used when extracting CREATION date: |
| private int year, month, day, hour, minute; |
| |
| // This keeps track of the following elements as they are |
| // written to the handler: p, li, ol, ul. |
| // It tries to prevent malformed tag orders in the RTF, |
| // e.g. <p></li></ol></p>, |
| // from generating malformed xml tags. (TIKA-2899) |
| // This may conceal problems with our parser. |
| // TODO: |
| // 1) do we need to add all elements: a, b, i, etc.? |
| // 2) are we doing the right thing by ignoring an element |
| // if its match doesn't pop off the stack...or should |
| // we pop all at the first failure? |
| private Stack<String> paragraphStack = new Stack<>(); |
| // This is an arbitrary limit on the size of the stack, |
| // to defend against DoS through memory consumption |
| private int maxStackSize = 1000; |
| |
// Creates an extractor that writes XHTML events to `out`, document
// properties to `metadata`, and hands embedded-object data to
// `embObjHandler`. The arguments are stored as given.
public TextExtractor(XHTMLContentHandler out, Metadata metadata,
                     RTFEmbObjHandler embObjHandler) {
    this.out = out;
    this.embObjHandler = embObjHandler;
    this.metadata = metadata;
}
| |
| private static Charset getCharset(String name) { |
| try { |
| return CharsetUtils.forName(name); |
| } catch (IllegalArgumentException e) { |
| return ASCII; |
| } |
| } |
| |
| protected static boolean isHexChar(int ch) { |
| return (ch >= '0' && ch <= '9') || (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F'); |
| } |
| |
| private static boolean isAlpha(int ch) { |
| return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z'); |
| } |
| |
| private static boolean isDigit(int ch) { |
| return ch >= '0' && ch <= '9'; |
| } |
| |
| protected static int hexValue(int ch) { |
| if (ch >= '0' && ch <= '9') { |
| return ch - '0'; |
| } else if (ch >= 'a' && ch <= 'z') { |
| return 10 + (ch - 'a'); |
| } else { |
| assert ch >= 'A' && ch <= 'Z'; |
| return 10 + (ch - 'A'); |
| } |
| } |
| |
| public boolean isIgnoringLists() { |
| return ignoreListMarkup; |
| } |
| |
| public void setIgnoreListMarkup(boolean ignore) { |
| this.ignoreListMarkup = ignore; |
| } |
| |
| // Push pending bytes or pending chars: |
| private void pushText() throws IOException, SAXException, TikaException { |
| if (pendingByteCount != 0) { |
| assert pendingCharCount == 0; |
| pushBytes(); |
| } else { |
| pushChars(); |
| } |
| } |
| |
| // Buffers the byte (unit in the current charset) for |
| // output: |
| private void addOutputByte(int b) throws IOException, SAXException, TikaException { |
| assert b >= 0 && b < 256 : "byte value out of range: " + b; |
| |
| if (pendingCharCount != 0) { |
| pushChars(); |
| } |
| if (groupState.pictDepth > 0) { |
| embObjHandler.writeMetadataChar((char) b); |
| } else { |
| // Save the byte in pending buffer: |
| if (pendingByteCount == pendingBytes.length) { |
| // Gradual but exponential growth: |
| final byte[] newArray = new byte[(int) (pendingBytes.length * 1.25)]; |
| System.arraycopy(pendingBytes, 0, newArray, 0, pendingBytes.length); |
| pendingBytes = newArray; |
| pendingByteBuffer = ByteBuffer.wrap(pendingBytes); |
| } |
| pendingBytes[pendingByteCount++] = (byte) b; |
| } |
| } |
| |
| // Buffers a byte as part of a control word: |
| private void addControl(int b) { |
| assert isAlpha(b); |
| // Save the byte in pending buffer: |
| if (pendingControlCount == pendingControl.length) { |
| // Gradual but exponential growth: |
| final byte[] newArray = new byte[(int) (pendingControl.length * 1.25)]; |
| System.arraycopy(pendingControl, 0, newArray, 0, pendingControl.length); |
| pendingControl = newArray; |
| } |
| pendingControl[pendingControlCount++] = (byte) b; |
| } |
| |
| // Buffers a UTF16 code unit for output |
| private void addOutputChar(char ch) throws IOException, SAXException, TikaException { |
| if (pendingByteCount != 0) { |
| pushBytes(); |
| } |
| |
| if (inHeader || fieldState == 1) { |
| pendingBuffer.append(ch); |
| } else if (groupState.sn == true || groupState.sv == true) { |
| embObjHandler.writeMetadataChar(ch); |
| } else { |
| if (pendingCharCount == pendingChars.length) { |
| // Gradual but exponential growth: |
| final char[] newArray = new char[(int) (pendingChars.length * 1.25)]; |
| System.arraycopy(pendingChars, 0, newArray, 0, pendingChars.length); |
| pendingChars = newArray; |
| } |
| pendingChars[pendingCharCount++] = ch; |
| } |
| } |
| |
| // Shallow parses the entire doc, writing output to |
| // this.out and this.metadata |
| public void extract(InputStream in) throws IOException, SAXException, TikaException { |
| // in = new FilterInputStream(in) { |
| // public int read() throws IOException { |
| // int r = super.read(); |
| // System.out.write(r); |
| // System.out.flush(); |
| // return r; |
| // } |
| // public int read(byte b[], int off, int len) throws IOException { |
| // int r = super.read(b, off, len); |
| // System.out.write(b, off, r); |
| // System.out.flush(); |
| // return r; |
| // } |
| // }; |
| extract(new PushbackInputStream(in, 2)); |
| } |
| |
| private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException { |
| out.startDocument(); |
| |
| while (true) { |
| final int b = in.read(); |
| if (b == -1) { |
| break; |
| } else if (b == '\\') { |
| parseControlToken(in); |
| } else if (b == '{') { |
| pushText(); |
| processGroupStart(in); |
| } else if (b == '}') { |
| pushText(); |
| processGroupEnd(); |
| if (groupStates.isEmpty()) { |
| // parsed document closing brace |
| break; |
| } |
| } else if (groupState.objdata == true || groupState.pictDepth == 1) { |
| embObjHandler.writeHexChar(b); |
| } else if (b != '\r' && b != '\n' && |
| (!groupState.ignore || nextMetaData != null || groupState.sn == true || |
| groupState.sv == true)) { |
| // Linefeed and carriage return are not |
| // significant |
| if (ansiSkip != 0) { |
| ansiSkip--; |
| } else { |
| addOutputByte(b); |
| } |
| } |
| } |
| |
| endParagraph(false); |
| |
| //close out whatever tags were left |
| while (paragraphStack.size() > 0) { |
| end(paragraphStack.pop()); |
| } |
| out.endDocument(); |
| } |
| |
| private void parseControlToken(PushbackInputStream in) |
| throws IOException, SAXException, TikaException { |
| int b = in.read(); |
| if (b == '\'') { |
| // escaped hex char |
| parseHexChar(in); |
| } else if (isAlpha(b)) { |
| // control word |
| parseControlWord((char) b, in); |
| } else if (b == '{' || b == '}' || b == '\\' || b == '\r' || b == '\n') { |
| // escaped char |
| addOutputByte(b); |
| } else if (b != -1) { |
| // control symbol, eg \* or \~ |
| processControlSymbol((char) b); |
| } |
| } |
| |
| private void parseHexChar(PushbackInputStream in) |
| throws IOException, SAXException, TikaException { |
| int hex1 = in.read(); |
| if (!isHexChar(hex1)) { |
| // DOC ERROR (malformed hex escape): ignore |
| in.unread(hex1); |
| return; |
| } |
| |
| int hex2 = in.read(); |
| if (!isHexChar(hex2)) { |
| // TODO: log a warning here, somehow? |
| // DOC ERROR (malformed hex escape): |
| // ignore |
| in.unread(hex2); |
| return; |
| } |
| |
| if (ansiSkip != 0) { |
| // Skip this ansi char since we are |
| // still in the shadow of a unicode |
| // escape: |
| ansiSkip--; |
| } else { |
| // Unescape: |
| addOutputByte(16 * hexValue(hex1) + hexValue(hex2)); |
| } |
| } |
| |
| private void parseControlWord(int firstChar, PushbackInputStream in) |
| throws IOException, SAXException, TikaException { |
| addControl(firstChar); |
| |
| int b = in.read(); |
| while (isAlpha(b)) { |
| addControl(b); |
| b = in.read(); |
| } |
| |
| boolean hasParam = false; |
| boolean negParam = false; |
| if (b == '-') { |
| negParam = true; |
| hasParam = true; |
| b = in.read(); |
| } |
| |
| int param = 0; |
| while (isDigit(b)) { |
| param *= 10; |
| param += (b - '0'); |
| hasParam = true; |
| b = in.read(); |
| } |
| |
| // space is consumed as part of the |
| // control word, but is not added to the |
| // control word |
| if (b != ' ') { |
| in.unread(b); |
| } |
| |
| if (hasParam) { |
| if (negParam) { |
| param = -param; |
| } |
| processControlWord(param, in); |
| } else { |
| processControlWord(); |
| } |
| |
| pendingControlCount = 0; |
| } |
| |
| private void lazyStartParagraph() throws IOException, SAXException, TikaException { |
| |
| boolean localInParagraph = inParagraph; |
| if (paragraphStack.size() > 0 && paragraphStack.contains(P)) { |
| localInParagraph = true; |
| } |
| if (!localInParagraph) { |
| // Ensure </i></b> order |
| if (groupState.italic) { |
| end("i"); |
| } |
| if (groupState.bold) { |
| end("b"); |
| } |
| if (pendingListEnd != 0 && groupState.list != pendingListEnd) { |
| endList(pendingListEnd); |
| pendingListEnd = 0; |
| } |
| if (inList() && pendingListEnd != groupState.list) { |
| startList(groupState.list); |
| } |
| if (inList()) { |
| start(LI); |
| pushParagraphTag(LI); |
| } else { |
| start(P); |
| pushParagraphTag(P); |
| } |
| |
| // Ensure <b><i> order |
| if (groupState.bold) { |
| start("b"); |
| } |
| if (groupState.italic) { |
| start("i"); |
| } |
| inParagraph = true; |
| } |
| } |
| |
| private void pushParagraphTag(String tag) { |
| if (paragraphStack.size() < maxStackSize) { |
| paragraphStack.push(tag); |
| } else { |
| //ignore. Something is very, very wrong... |
| } |
| } |
| |
| private void endParagraph(boolean preserveStyles) |
| throws IOException, SAXException, TikaException { |
| pushText(); |
| //maintain consecutive new lines |
| if (!inParagraph) { |
| lazyStartParagraph(); |
| } |
| if (inParagraph || paragraphStack.size() > 0) { |
| if (groupState.italic) { |
| end("i"); |
| groupState.italic = preserveStyles; |
| } |
| if (groupState.bold) { |
| end("b"); |
| groupState.bold = preserveStyles; |
| } |
| boolean badTagAlignment = false; |
| if (inList()) { |
| if (paragraphStack.size() > 0) { |
| String lastP = paragraphStack.pop(); |
| if (lastP.equals(LI)) { |
| end(LI); |
| } else { |
| pushParagraphTag(lastP); |
| badTagAlignment = true; |
| } |
| } else { |
| //there should have been a starting li |
| } |
| } else { |
| if (paragraphStack.size() > 0) { |
| String lastP = paragraphStack.pop(); |
| if (P.equals(lastP)) { |
| end(P); |
| } else { |
| pushParagraphTag(lastP); |
| badTagAlignment = true; |
| } |
| } |
| } |
| //if there was a failure in tag alignment, |
| //dump all tags and start fresh. |
| if (badTagAlignment) { |
| while (paragraphStack.size() > 0) { |
| end(paragraphStack.pop()); |
| } |
| } |
| if (preserveStyles && (groupState.bold || groupState.italic)) { |
| start(P); |
| pushParagraphTag(P); |
| if (groupState.bold) { |
| start("b"); |
| } |
| if (groupState.italic) { |
| start("i"); |
| } |
| inParagraph = true; |
| } else { |
| inParagraph = false; |
| } |
| } |
| |
| // Ensure closing the list at document end |
| if (!preserveStyles && pendingListEnd != 0) { |
| endList(pendingListEnd); |
| pendingListEnd = 0; |
| } |
| } |
| |
| // Push pending UTF16 units to out ContentHandler |
| private void pushChars() throws IOException, SAXException, TikaException { |
| if (pendingCharCount != 0) { |
| lazyStartParagraph(); |
| out.characters(pendingChars, 0, pendingCharCount); |
| pendingCharCount = 0; |
| } |
| } |
| |
| // Decodes the buffered bytes in pendingBytes |
| // into UTF16 code units, and sends the characters |
| // to the out ContentHandler, if we are in the body, |
| // else appends the characters to the pendingBuffer |
| private void pushBytes() throws IOException, SAXException, TikaException { |
| if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) { |
| |
| final CharsetDecoder decoder = getDecoder(); |
| pendingByteBuffer.limit(pendingByteCount); |
| assert pendingByteBuffer.position() == 0; |
| assert outputCharBuffer.position() == 0; |
| |
| while (true) { |
| // We pass true for endOfInput because, when |
| // we are called, we should have seen a |
| // complete sequence of characters for this |
| // charset: |
| final CoderResult result = decoder.decode((ByteBuffer) pendingByteBuffer, |
| (CharBuffer) outputCharBuffer, true); |
| |
| final int pos = outputCharBuffer.position(); |
| if (pos > 0) { |
| if (inHeader || fieldState == 1) { |
| pendingBuffer.append(outputArray, 0, pos); |
| } else { |
| lazyStartParagraph(); |
| out.characters(outputArray, 0, pos); |
| } |
| outputCharBuffer.position(0); |
| } |
| |
| if (result == CoderResult.UNDERFLOW) { |
| break; |
| } |
| } |
| |
| while (true) { |
| final CoderResult result = decoder.flush((CharBuffer) outputCharBuffer); |
| |
| final int pos = outputCharBuffer.position(); |
| if (pos > 0) { |
| if (inHeader || fieldState == 1) { |
| pendingBuffer.append(outputArray, 0, pos); |
| } else { |
| lazyStartParagraph(); |
| out.characters(outputArray, 0, pos); |
| } |
| outputCharBuffer.position(0); |
| } |
| |
| if (result == CoderResult.UNDERFLOW) { |
| break; |
| } |
| } |
| |
| // Reset for next decode |
| decoder.reset(); |
| pendingByteBuffer.position(0); |
| } |
| |
| pendingByteCount = 0; |
| } |
| |
| // NOTE: s must be ascii alpha only |
| private boolean equals(String s) { |
| if (pendingControlCount != s.length()) { |
| return false; |
| } |
| for (int idx = 0; idx < pendingControlCount; idx++) { |
| assert isAlpha(s.charAt(idx)); |
| if (((byte) s.charAt(idx)) != pendingControl[idx]) { |
| return false; |
| } |
| } |
| return true; |
| } |
| |
| private void processControlSymbol(char ch) throws IOException, SAXException, TikaException { |
| switch (ch) { |
| case '~': |
| // Non-breaking space -> unicode NON-BREAKING SPACE |
| addOutputChar('\u00a0'); |
| break; |
| case '*': |
| // Ignorable destination (control words defined after |
| // the 1987 RTF spec). These are already handled by |
| // processGroupStart() |
| break; |
| case '-': |
| // Optional hyphen -> unicode SOFT HYPHEN |
| addOutputChar('\u00ad'); |
| break; |
| case '_': |
| // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN |
| addOutputChar('\u2011'); |
| break; |
| default: |
| break; |
| } |
| } |
| |
| private CharsetDecoder getDecoder() throws TikaException { |
| Charset charset = getCharset(); |
| |
| // Common case: charset is same as last time, so |
| // just reuse it: |
| if (lastCharset == null || !charset.equals(lastCharset)) { |
| decoder = charset.newDecoder(); |
| decoder.onMalformedInput(CodingErrorAction.REPLACE); |
| decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); |
| lastCharset = charset; |
| } |
| |
| return decoder; |
| } |
| |
| // Return current charset in-use |
| private Charset getCharset() throws TikaException { |
| // If a specific font (fN) was set, use its charset |
| if (groupState.fontCharset != null) { |
| return groupState.fontCharset; |
| } |
| |
| // Else, if global default font (defN) was set, use that one |
| if (globalDefaultFont != -1 && !inHeader) { |
| Charset cs = fontToCharset.get(globalDefaultFont); |
| if (cs != null) { |
| return cs; |
| } |
| } |
| |
| // Else, use the global charset |
| if (globalCharset == null) { |
| throw new TikaException("unable to determine charset"); |
| } |
| |
| return globalCharset; |
| } |
| |
| // Handle control word that takes a parameter: |
| private void processControlWord(int param, PushbackInputStream in) |
| throws IOException, SAXException, TikaException { |
| // TODO: afN? (associated font number) |
| |
| // TODO: do these alter text output...? |
| /* |
| } else if (equals("stshfdbch")) { |
| // font to be used by default in |
| // style sheet for East Asian chars |
| // arg N is font table entry |
| } else if (equals("stshfloch")) { |
| // font to be used by default in |
| // style sheet for ASCII chars |
| // arg N is font table entry |
| } else if (equals("stshfhich")) { |
| // font to be used by default in |
| // style sheet for High Ansi chars |
| // arg N is font table entry |
| } else if (equals("stshfbi")) { |
| // style sheet for Complex Scripts (BIDI) chars |
| // arg N is font table entry |
| */ |
| |
| // TODO: inefficient that we check equals N times; |
| // we'd get better perf w/ real lexer (eg |
| // JFlex), which uses single-pass FSM to do cmp: |
| if (inHeader) { |
| if (equals("ansicpg")) { |
| // ANSI codepage |
| Charset cs = ANSICPG_MAP.get(param); |
| if (cs != null) { |
| globalCharset = cs; |
| } |
| } else if (equals("deff")) { |
| // Default font |
| globalDefaultFont = param; |
| } else if (equals("nofpages")) { |
| metadata.add(Office.PAGE_COUNT, Integer.toString(param)); |
| } else if (equals("nofwords")) { |
| metadata.add(Office.WORD_COUNT, Integer.toString(param)); |
| } else if (equals("nofchars")) { |
| metadata.add(Office.CHARACTER_COUNT, Integer.toString(param)); |
| } else if (equals("yr")) { |
| year = param; |
| } else if (equals("mo")) { |
| month = param; |
| } else if (equals("dy")) { |
| day = param; |
| } else if (equals("hr")) { |
| hour = param; |
| } else if (equals("min")) { |
| minute = param; |
| } |
| |
| if (fontTableState == 1) { |
| // Still inside font table -- record the |
| // mappings of fN to the fcharset: |
| if (groupState.depth < fontTableDepth) { |
| fontTableState = 2; |
| } else { |
| if (equals("f")) { |
| // Start new font definition |
| curFontID = param; |
| } else if (equals("fcharset")) { |
| Charset cs = FCHARSET_MAP.get(param); |
| if (cs != null) { |
| fontToCharset.put(curFontID, cs); |
| } |
| } |
| } |
| } |
| //if you've already seen the font table, |
| //you aren't in another header item (e.g. styles) |
| //and you see an fX, you're out of the header |
| if (fontTableState == 2 && !groupState.ignore && equals("f")) { |
| inHeader = false; |
| } |
| |
| if (currentList != null) { |
| if (equals("listid")) { |
| currentList.id = param; |
| currentListTable.put(currentList.id, currentList); |
| } else if (equals("listtemplateid")) { |
| currentList.templateID = param; |
| } else if (equals("levelnfc") || equals("levelnfcn")) { |
| //sanity check to make sure list information isn't corrupt |
| if (listTableLevel > -1 && listTableLevel < currentList.numberType.length) { |
| currentList.numberType[listTableLevel] = param; |
| } |
| } |
| } |
| } else { |
| // In document |
| if (equals("b")) { |
| // b0 |
| assert param == 0; |
| if (groupState.bold) { |
| pushText(); |
| if (groupState.italic) { |
| end("i"); |
| } |
| end("b"); |
| if (groupState.italic) { |
| start("i"); |
| } |
| groupState.bold = false; |
| } |
| } else if (equals("i")) { |
| // i0 |
| assert param == 0; |
| if (groupState.italic) { |
| pushText(); |
| end("i"); |
| groupState.italic = false; |
| } |
| } else if (equals("f")) { |
| // Change current font |
| Charset fontCharset = fontToCharset.get(param); |
| |
| // Push any buffered text before changing |
| // font: |
| pushText(); |
| |
| if (fontCharset != null) { |
| groupState.fontCharset = fontCharset; |
| } else { |
| // DOC ERROR: font change referenced a |
| // non-table'd font number |
| // TODO: log a warning? Throw an exc? |
| groupState.fontCharset = null; |
| } |
| } else if (equals("ls")) { |
| groupState.list = param; |
| } else if (equals("lslvl")) { |
| groupState.listLevel = param; |
| } |
| } |
| |
| // Process unicode escape. This can appear in doc |
| // or in header, since the metadata (info) fields |
| // in the header can be unicode escaped as well: |
| if (equals("u")) { |
| // Unicode escape |
| if (!groupState.ignore || groupState.sv || groupState.sn) { |
| final char utf16CodeUnit = (char) (param & 0xffff); |
| addOutputChar(utf16CodeUnit); |
| } |
| |
| // After seeing a unicode escape we must |
| // skip the next ucSkip ansi chars (the |
| // "unicode shadow") |
| ansiSkip = groupState.ucSkip; |
| } else if (equals("uc")) { |
| // Change unicode shadow length |
| groupState.ucSkip = param; |
| } else if (equals("bin")) { |
| if (param >= 0) { |
| if (groupState.pictDepth == 1) { |
| try { |
| embObjHandler.writeBytes(in, param); |
| } catch (IOException | TikaException e) { |
| EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); |
| embObjHandler.reset(); |
| } |
| } else { |
| IOUtils.skipFully(in, param); |
| } |
| } else { |
| // log some warning? |
| } |
| } |
| } |
| |
| private boolean inList() { |
| return !ignoreListMarkup && groupState.list != 0; |
| } |
| |
| /** |
| * Marks the current list as pending to end. This is done to be able to merge list items of |
| * the same list within the same enclosing list tag (ie. either <code>"ul"</code>, or |
| * <code>"ol"</code>). |
| */ |
| private void pendingListEnd() { |
| pendingListEnd = groupState.list; |
| groupState.list = 0; |
| } |
| |
| /** |
| * Emits the end tag of a list. Uses {@link #isUnorderedList(int)} to determine the list |
| * type for the given <code>listID</code>. |
| * |
| * @param listID The ID of the list. |
| * @throws IOException |
| * @throws SAXException |
| * @throws TikaException |
| */ |
| private void endList(int listID) throws IOException, SAXException, TikaException { |
| if (!ignoreListMarkup) { |
| String xl = isUnorderedList(listID) ? UL : OL; |
| if (paragraphStack.size() > 0) { |
| String p = paragraphStack.pop(); |
| if (xl.equals(p)) { |
| end(xl); |
| } |
| } else { |
| //stack as empty, the list was never started |
| } |
| } |
| } |
| |
| /** |
| * Emits the start tag of a list. Uses {@link #isUnorderedList(int)} to determine the list |
| * type for the given <code>listID</code>. |
| * |
| * @param listID The ID of the list. |
| * @throws IOException |
| * @throws SAXException |
| * @throws TikaException |
| */ |
| private void startList(int listID) throws IOException, SAXException, TikaException { |
| if (!ignoreListMarkup) { |
| String xl = isUnorderedList(listID) ? UL : OL; |
| start(xl); |
| pushParagraphTag(xl); |
| } |
| } |
| |
| private boolean isUnorderedList(int listID) { |
| ListDescriptor list = listTable.get(listID); |
| if (list != null) { |
| return list.isUnordered(groupState.listLevel); |
| } |
| return true; |
| } |
| |
| private void end(String tag) throws IOException, SAXException, TikaException { |
| out.endElement(tag); |
| } |
| |
| private void start(String tag) throws IOException, SAXException, TikaException { |
| out.startElement(tag); |
| } |
| |
| // Handle non-parameter control word: |
| private void processControlWord() throws IOException, SAXException, TikaException { |
| if (inHeader) { |
| if (equals("ansi")) { |
| globalCharset = WINDOWS_1252; |
| } else if (equals("pca")) { |
| globalCharset = CP850; |
| } else if (equals("pc")) { |
| globalCharset = CP437; |
| } else if (equals("mac")) { |
| globalCharset = MAC_ROMAN; |
| } |
| |
| if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) { |
| groupState.ignore = true; |
| } else if (equals("listtable")) { |
| currentListTable = listTable; |
| } else if (equals("listoverridetable")) { |
| currentListTable = listOverrideTable; |
| } |
| |
| if (uprState == -1) { |
| // TODO: we can also parse \creatim, \revtim, |
| // \printim, \version, etc. |
| if (equals("author")) { |
| nextMetaData = TikaCoreProperties.CREATOR; |
| } else if (equals("title")) { |
| nextMetaData = TikaCoreProperties.TITLE; |
| } else if (equals("subject")) { |
| nextMetaData = OfficeOpenXMLCore.SUBJECT; |
| } else if (equals("keywords")) { |
| nextMetaData = Office.KEYWORDS; |
| } else if (equals("category")) { |
| nextMetaData = OfficeOpenXMLCore.CATEGORY; |
| } else if (equals("comment")) { |
| nextMetaData = TikaCoreProperties.COMMENTS; |
| } else if (equals("company")) { |
| nextMetaData = OfficeOpenXMLExtended.COMPANY; |
| } else if (equals("manager")) { |
| nextMetaData = OfficeOpenXMLExtended.MANAGER; |
| } else if (equals("template")) { |
| nextMetaData = OfficeOpenXMLExtended.TEMPLATE; |
| } else if (equals("creatim")) { |
| nextMetaData = TikaCoreProperties.CREATED; |
| } |
| } |
| |
| if (fontTableState == 0) { |
| // Didn't see font table yet |
| if (equals("fonttbl")) { |
| fontTableState = 1; |
| fontTableDepth = groupState.depth; |
| } |
| } else if (fontTableState == 1) { |
| // Inside font table |
| if (groupState.depth < fontTableDepth) { |
| fontTableState = 2; |
| } |
| } |
| |
| // List table handling |
| if (currentListTable != null) { |
| if (equals("list") || equals("listoverride")) { |
| currentList = new ListDescriptor(); |
| listTableLevel = -1; |
| } else if (currentList != null) { |
| if (equals("liststylename")) { |
| currentList.isStyle = true; |
| } else if (equals("listlevel")) { |
| listTableLevel++; |
| } |
| } |
| } |
| |
| if (!groupState.ignore && |
| (equals("par") || equals("pard") || equals("sect") || equals("sectd") || |
| equals("plain") || equals("ltrch") || equals("rtlch") || |
| equals("htmlrtf") || equals("line"))) { |
| inHeader = false; |
| } |
| } else { |
| if (equals("b")) { |
| if (!groupState.bold) { |
| pushText(); |
| lazyStartParagraph(); |
| if (groupState.italic) { |
| // Make sure nesting is always <b><i> |
| end("i"); |
| } |
| groupState.bold = true; |
| start("b"); |
| if (groupState.italic) { |
| start("i"); |
| } |
| } |
| } else if (equals("i")) { |
| if (!groupState.italic) { |
| pushText(); |
| lazyStartParagraph(); |
| groupState.italic = true; |
| start("i"); |
| } |
| } |
| } |
| |
| final boolean ignored = groupState.ignore; |
| if (equals("pard")) { |
| // Reset styles |
| pushText(); |
| if (groupState.italic) { |
| end("i"); |
| groupState.italic = false; |
| } |
| if (groupState.bold) { |
| end("b"); |
| groupState.bold = false; |
| } |
| if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0)) |
| pendingListEnd(); |
| } |
| } else if (equals("plain")) { |
| if (groupState.italic || groupState.bold) { |
| // Reset styles |
| pushText(); |
| if (groupState.italic) { |
| end("i"); |
| groupState.italic = false; |
| } |
| if (groupState.bold) { |
| end("b"); |
| groupState.bold = false; |
| } |
| } |
| } else if (equals("par")) { |
| if (!ignored) { |
| endParagraph(true); |
| if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0)) |
| pendingListEnd(); |
| } |
| } |
| } else if (equals("shptxt")) { |
| pushText(); |
| // Text inside a shape |
| groupState.ignore = false; |
| } else if (equals("chatn")) { |
| addOutputChar(SPACE); |
| pushText(); |
| // Annotation ID |
| groupState.ignore = false; |
| } else if (equals("atnid")) { |
| addOutputChar(SPACE); |
| pushText(); |
| // Annotation ID |
| groupState.ignore = false; |
| } else if (equals("atnauthor")) { |
| addOutputChar(SPACE); |
| pushText(); |
| // Annotation author |
| groupState.ignore = false; |
| } else if (equals("annotation")) { |
| groupState.annotation = true; |
| pushText(); |
| // Annotation |
| groupState.ignore = false; |
| } else if (equals("listtext")) { |
| groupState.ignore = true; |
| } else if (equals("cell")) { |
| // TODO: we should produce a table output here? |
| //addOutputChar(' '); |
| endParagraph(true); |
| } else if (equals("sp")) { |
| groupState.sp = true; |
| } else if (equals("sn")) { |
| embObjHandler.startSN(); |
| groupState.sn = true; |
| } else if (equals("sv")) { |
| embObjHandler.startSV(); |
| groupState.sv = true; |
| } else if (equals("object")) { |
| pushText(); |
| embObjHandler.setInObject(true); |
| groupState.object = true; |
| } else if (equals("objdata")) { |
| groupState.objdata = true; |
| embObjHandler.startObjData(); |
| } else if (equals("pict")) { |
| pushText(); |
| // TODO: create img tag? but can that support |
| // embedded image data? |
| groupState.pictDepth = 1; |
| embObjHandler.startPict(); |
| } else if (equals("line")) { |
| if (!ignored) { |
| addOutputChar('\n'); |
| } |
| } else if (equals("column")) { |
| if (!ignored) { |
| addOutputChar(' '); |
| } |
| } else if (equals("page")) { |
| if (!ignored) { |
| addOutputChar('\n'); |
| } |
| } else if (equals("softline")) { |
| if (!ignored) { |
| addOutputChar('\n'); |
| } |
| } else if (equals("softcolumn")) { |
| if (!ignored) { |
| addOutputChar(' '); |
| } |
| } else if (equals("softpage")) { |
| if (!ignored) { |
| addOutputChar('\n'); |
| } |
| } else if (equals("tab")) { |
| if (!ignored) { |
| addOutputChar('\t'); |
| } |
| } else if (equals("upr")) { |
| uprState = 0; |
| } else if (equals("ud") && uprState == 1) { |
| uprState = -1; |
| // 2nd group inside the upr destination, which |
| // contains the unicode encoding of the text, so |
| // we want to keep that: |
| groupState.ignore = false; |
| } else if (equals("bullet")) { |
| if (!ignored) { |
| // unicode BULLET |
| addOutputChar('\u2022'); |
| } |
| } else if (equals("endash")) { |
| if (!ignored) { |
| // unicode EN DASH |
| addOutputChar('\u2013'); |
| } |
| } else if (equals("emdash")) { |
| if (!ignored) { |
| // unicode EM DASH |
| addOutputChar('\u2014'); |
| } |
| } else if (equals("enspace")) { |
| if (!ignored) { |
| // unicode EN SPACE |
| addOutputChar('\u2002'); |
| } |
| } else if (equals("qmspace")) { |
| if (!ignored) { |
| // quarter em space -> unicode FOUR-PER-EM SPACE |
| addOutputChar('\u2005'); |
| } |
| } else if (equals("emspace")) { |
| if (!ignored) { |
| // unicode EM SPACE |
| addOutputChar('\u2003'); |
| } |
| } else if (equals("lquote")) { |
| if (!ignored) { |
| // unicode LEFT SINGLE QUOTATION MARK |
| addOutputChar('\u2018'); |
| } |
| } else if (equals("rquote")) { |
| if (!ignored) { |
| // unicode RIGHT SINGLE QUOTATION MARK |
| addOutputChar('\u2019'); |
| } |
| } else if (equals("ldblquote")) { |
| if (!ignored) { |
| // unicode LEFT DOUBLE QUOTATION MARK |
| addOutputChar('\u201C'); |
| } |
| } else if (equals("rdblquote")) { |
| if (!ignored) { |
| // unicode RIGHT DOUBLE QUOTATION MARK |
| addOutputChar('\u201D'); |
| } |
| } else if (equals("fldinst")) { |
| fieldState = 1; |
| groupState.ignore = false; |
| } else if (equals("fldrslt") && fieldState == 2) { |
| assert pendingURL != null; |
| lazyStartParagraph(); |
| out.startElement("a", "href", pendingURL); |
| pendingURL = null; |
| fieldState = 3; |
| groupState.ignore = false; |
| } |
| } |
| |
| // Push new GroupState |
| private void processGroupStart(PushbackInputStream in) throws IOException { |
| ansiSkip = 0; |
| // Push current groupState onto the stack |
| groupStates.add(groupState); |
| |
| // Make new GroupState |
| groupState = new GroupState(groupState); |
| assert groupStates.size() == groupState.depth : |
| "size=" + groupStates.size() + " depth=" + groupState.depth; |
| |
| if (uprState == 0) { |
| uprState = 1; |
| groupState.ignore = true; |
| } |
| |
| // Check for ignorable groups. Note that |
| // sometimes we un-ignore within this group, eg |
| // when handling upr escape. |
| int b2 = in.read(); |
| if (b2 == '\\') { |
| int b3 = in.read(); |
| if (b3 == '*') { |
| groupState.ignore = true; |
| } |
| in.unread(b3); |
| } |
| in.unread(b2); |
| } |
| |
| // Pop current GroupState |
| private void processGroupEnd() throws IOException, SAXException, TikaException { |
| if (inHeader) { |
| if (nextMetaData != null) { |
| if (nextMetaData == TikaCoreProperties.CREATED) { |
| Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT); |
| cal.set(year, month - 1, day, hour, minute, 0); |
| metadata.set(nextMetaData, cal.getTime()); |
| } else if (nextMetaData.isMultiValuePermitted()) { |
| metadata.add(nextMetaData, pendingBuffer.toString()); |
| } else { |
| metadata.set(nextMetaData, pendingBuffer.toString()); |
| } |
| nextMetaData = null; |
| } |
| pendingBuffer.setLength(0); |
| } |
| |
| assert groupState.depth > 0; |
| ansiSkip = 0; |
| |
| if (groupState.objdata == true) { |
| try { |
| embObjHandler.handleCompletedObject(); |
| } catch (TikaException | IOException e) { |
| EmbeddedDocumentUtil.recordException(e, metadata); |
| } |
| groupState.objdata = false; |
| } else if (groupState.pictDepth > 0) { |
| if (groupState.sn == true) { |
| embObjHandler.endSN(); |
| } else if (groupState.sv == true) { |
| embObjHandler.endSV(); |
| } else if (groupState.sp == true) { |
| embObjHandler.endSP(); |
| } else if (groupState.pictDepth == 1) { |
| embObjHandler.handleCompletedObject(); |
| } |
| } |
| if (groupState.annotation == true) { |
| addOutputChar(SPACE); |
| } |
| if (groupState.object == true) { |
| embObjHandler.setInObject(false); |
| } |
| |
| // Be robust if RTF doc is corrupt (has too many |
| // closing }s): |
| // TODO: log a warning? |
| if (groupStates.size() > 0) { |
| // Restore group state: |
| final GroupState outerGroupState = groupStates.removeLast(); |
| |
| // Close italic, if outer does not have italic or |
| // bold changed: |
| if (groupState.italic) { |
| if (!outerGroupState.italic || groupState.bold != outerGroupState.bold) { |
| end("i"); |
| groupState.italic = false; |
| } |
| } |
| |
| // Close bold |
| if (groupState.bold && !outerGroupState.bold) { |
| end("b"); |
| } |
| |
| // Open bold |
| if (!groupState.bold && outerGroupState.bold) { |
| start("b"); |
| } |
| |
| // Open italic |
| if (!groupState.italic && outerGroupState.italic) { |
| start("i"); |
| } |
| groupState = outerGroupState; |
| } |
| assert groupStates.size() == groupState.depth; |
| |
| if (fieldState == 1) { |
| String s = pendingBuffer.toString().trim(); |
| pendingBuffer.setLength(0); |
| if (s.startsWith("HYPERLINK")) { |
| s = s.substring(9).trim(); |
| // TODO: what other instructions can be in a |
| // HYPERLINK destination? |
| final boolean isLocalLink = s.contains("\\l "); |
| int idx = s.indexOf('"'); |
| if (idx != -1) { |
| int idx2 = s.indexOf('"', 1 + idx); |
| if (idx2 != -1) { |
| s = s.substring(1 + idx, idx2); |
| } |
| } |
| pendingURL = (isLocalLink ? "#" : "") + s; |
| fieldState = 2; |
| } else { |
| fieldState = 0; |
| } |
| |
| // TODO: we could process the other known field |
| // types. Right now, we will extract their text |
| // inlined, but fail to record them in metadata |
| // as a field value. |
| } else if (fieldState == 3) { |
| end("a"); |
| fieldState = 0; |
| } |
| } |
| } |