tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.tika.parser.microsoft.rtf;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.PushbackInputStream;
 import java.nio.Buffer;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.charset.Charset;
 import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CoderResult;
 import java.nio.charset.CodingErrorAction;
 import java.util.Calendar;
 import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Stack;
 import java.util.TimeZone;

 import org.apache.commons.io.IOUtils;
 import org.xml.sax.SAXException;

 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
 import org.apache.tika.metadata.OfficeOpenXMLExtended;
 import org.apache.tika.metadata.Property;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.CharsetUtils;

 /* Tokenizes and performs a "shallow" parse of the RTF
  * document, just enough to properly decode the text.
  *
  * TODO: we should cutover to a "real" tokenizer (eg JFlex);
  * it should give better perf, by replacing the excessive
  * "else if" string compares with FSA traversal. */

 final class TextExtractor {

     private static final char SPACE = ' ';
     private static final String P = "p";
     private static final String LI = "li";
     private static final String OL = "ol";
     private static final String UL = "ul";

     private static final Charset ASCII = Charset.forName("US-ASCII");
     private static final Charset WINDOWS_1252 = getCharset("WINDOWS-1252");
     private static final Charset MAC_ROMAN = getCharset("MacRoman");
     private static final Charset SHIFT_JIS = getCharset("Shift_JIS");
     private static final Charset WINDOWS_57011 = getCharset("windows-57011");
     private static final Charset WINDOWS_57010 = getCharset("windows-57010");
     private static final Charset WINDOWS_57009 = getCharset("windows-57009");
     private static final Charset WINDOWS_57008 = getCharset("windows-57008");
     private static final Charset WINDOWS_57007 = getCharset("windows-57007");
     private static final Charset WINDOWS_57006 = getCharset("windows-57006");
     private static final Charset WINDOWS_57005 = getCharset("windows-57005");
     private static final Charset WINDOWS_57004 = getCharset("windows-57004");
     private static final Charset WINDOWS_57003 = getCharset("windows-57003");
     private static final Charset X_ISCII91 = getCharset("x-ISCII91");
     private static final Charset X_MAC_CENTRAL_EUROPE = getCharset("x-MacCentralEurope");
     private static final Charset MAC_CYRILLIC = getCharset("MacCyrillic");
     private static final Charset X_JOHAB = getCharset("x-Johab");
     private static final Charset CP12582 = getCharset("CP1258");
     private static final Charset CP12572 = getCharset("CP1257");
     private static final Charset CP12562 = getCharset("CP1256");
     private static final Charset CP12552 = getCharset("CP1255");
     private static final Charset CP12542 = getCharset("CP1254");
     private static final Charset CP12532 = getCharset("CP1253");
     private static final Charset CP1252 = getCharset("CP1252");
     private static final Charset CP12512 = getCharset("CP1251");
     private static final Charset CP12502 = getCharset("CP1250");
     private static final Charset CP950 = getCharset("CP950");
     private static final Charset CP949 = getCharset("CP949");
     private static final Charset MS9362 = getCharset("MS936");
     private static final Charset MS8742 = getCharset("MS874");
     private static final Charset CP866 = getCharset("CP866");
     private static final Charset CP865 = getCharset("CP865");
     private static final Charset CP864 = getCharset("CP864");
     private static final Charset CP863 = getCharset("CP863");
     private static final Charset CP862 = getCharset("CP862");
     private static final Charset CP860 = getCharset("CP860");
     private static final Charset CP852 = getCharset("CP852");
     private static final Charset CP8502 = getCharset("CP850");
     private static final Charset CP819 = getCharset("CP819");
     private static final Charset WINDOWS_720 = getCharset("windows-720");
     private static final Charset WINDOWS_711 = getCharset("windows-711");
     private static final Charset WINDOWS_710 = getCharset("windows-710");
     private static final Charset WINDOWS_709 = getCharset("windows-709");
     private static final Charset ISO_8859_6 = getCharset("ISO-8859-6");
     private static final Charset CP4372 = getCharset("CP437");
     private static final Charset CP850 = getCharset("cp850");
     private static final Charset CP437 = getCharset("cp437");
     private static final Charset MS874 = getCharset("ms874");
     private static final Charset CP1257 = getCharset("cp1257");
     private static final Charset CP1256 = getCharset("cp1256");
     private static final Charset CP1255 = getCharset("cp1255");
     private static final Charset CP1258 = getCharset("cp1258");
     private static final Charset CP1254 = getCharset("cp1254");
     private static final Charset CP1253 = getCharset("cp1253");
     private static final Charset MS950 = getCharset("ms950");
     private static final Charset MS936 = getCharset("ms936");
     private static final Charset MS1361 = getCharset("ms1361");
     private static final Charset MS932 = getCharset("MS932");
     private static final Charset CP1251 = getCharset("cp1251");
     private static final Charset CP1250 = getCharset("cp1250");
     private static final Charset MAC_THAI = getCharset("MacThai");
     private static final Charset MAC_TURKISH = getCharset("MacTurkish");
     private static final Charset MAC_GREEK = getCharset("MacGreek");
     private static final Charset MAC_ARABIC = getCharset("MacArabic");
     private static final Charset MAC_HEBREW = getCharset("MacHebrew");
     private static final Charset JOHAB = getCharset("johab");
     private static final Charset BIG5 = getCharset("Big5");
     private static final Charset GB2312 = getCharset("GB2312");
     private static final Charset MS949 = getCharset("ms949");
     // The RTF doc has a "font table" that assigns ords
     // (f0, f1, f2, etc.) to fonts and charsets, using the
     // \fcharsetN control word.  This mapping maps from the
     // N to corresponding Java charset:
     private static final Map<Integer, Charset> FCHARSET_MAP = new HashMap<>();
     // The RTF may specify the \ansicpgN charset in the
     // header; this maps the N to the corresponding Java
     // character set:
     private static final Map<Integer, Charset> ANSICPG_MAP = new HashMap<>();

     // Populates FCHARSET_MAP: the key is the N of a \fcharsetN control word
     // in the font table, the value is the Java charset used to decode text
     // set in such a font.  The charset constants are resolved through
     // getCharset(String), which substitutes US-ASCII when a name is not
     // supported, so every value stored here is non-null.
     // NOTE(review): keys 88 and 89 are labelled "Mac East Europe" and
     // "Mac Russian" but are mapped to the cp1250/cp1251 constants -- confirm
     // that this is intended.
     static {
         FCHARSET_MAP.put(0, WINDOWS_1252); // ANSI
         // charset 1 is Default
         // charset 2 is Symbol

         FCHARSET_MAP.put(77, MAC_ROMAN); // Mac Roman
         FCHARSET_MAP.put(78, SHIFT_JIS); // Mac Shift Jis
         FCHARSET_MAP.put(79, MS949); // Mac Hangul
         FCHARSET_MAP.put(80, GB2312); // Mac GB2312
         FCHARSET_MAP.put(81, BIG5); // Mac Big5
         FCHARSET_MAP.put(82, JOHAB); // Mac Johab (old)
         FCHARSET_MAP.put(83, MAC_HEBREW); // Mac Hebrew
         FCHARSET_MAP.put(84, MAC_ARABIC); // Mac Arabic
         FCHARSET_MAP.put(85, MAC_GREEK); // Mac Greek
         FCHARSET_MAP.put(86, MAC_TURKISH); // Mac Turkish
         FCHARSET_MAP.put(87, MAC_THAI); // Mac Thai
         FCHARSET_MAP.put(88, CP1250); // Mac East Europe
         FCHARSET_MAP.put(89, CP1251); // Mac Russian

         FCHARSET_MAP.put(128, MS932); // Shift JIS
         FCHARSET_MAP.put(129, MS949); // Hangul
         FCHARSET_MAP.put(130, MS1361); // Johab
         FCHARSET_MAP.put(134, MS936); // GB2312
         FCHARSET_MAP.put(136, MS950); // Big5
         FCHARSET_MAP.put(161, CP1253); // Greek
         FCHARSET_MAP.put(162, CP1254); // Turkish
         FCHARSET_MAP.put(163, CP1258); // Vietnamese
         FCHARSET_MAP.put(177, CP1255); // Hebrew
         FCHARSET_MAP.put(178, CP1256); // Arabic
         // FCHARSET_MAP.put( 179, "" ); // Arabic Traditional
         // FCHARSET_MAP.put( 180, "" ); // Arabic user
         // FCHARSET_MAP.put( 181, "" ); // Hebrew user
         FCHARSET_MAP.put(186, CP1257); // Baltic

         FCHARSET_MAP.put(204, CP1251); // Russian
         FCHARSET_MAP.put(222, MS874); // Thai
         FCHARSET_MAP.put(238, CP1250); // Eastern European
         FCHARSET_MAP.put(254, CP437); // PC 437
         FCHARSET_MAP.put(255, CP850); // OEM
     }

     // Populates ANSICPG_MAP: the key is the N of the \ansicpgN control word
     // from the RTF header, the value is the Java charset used to decode the
     // document's ANSI text.  Each code page is registered under its own
     // number; the charset constants fall back to US-ASCII (see
     // getCharset(String)) when the running JVM does not support the name.
     static {
         ANSICPG_MAP.put(437, CP4372);   // US IBM
         ANSICPG_MAP.put(708, ISO_8859_6);   // Arabic (ASMO 708)

         ANSICPG_MAP.put(709, WINDOWS_709);  // Arabic (ASMO 449+, BCON V4)
         ANSICPG_MAP.put(710, WINDOWS_710);  // Arabic (transparent Arabic)
         // 711 and 720 were previously stored under key 710, which both
         // overwrote the 710 entry and left 711/720 unmapped.
         ANSICPG_MAP.put(711, WINDOWS_711);  // Arabic (Nafitha Enhanced)
         ANSICPG_MAP.put(720, WINDOWS_720);  // Arabic (transparent ASMO)
         ANSICPG_MAP.put(819, CP819);  // Windows 3.1 (US & Western Europe)
         ANSICPG_MAP.put(850, CP8502);  // IBM Multilingual
         ANSICPG_MAP.put(852, CP852);  // Eastern European
         ANSICPG_MAP.put(860, CP860);  // Portuguese
         ANSICPG_MAP.put(862, CP862);  // Hebrew
         ANSICPG_MAP.put(863, CP863);  // French Canadian
         ANSICPG_MAP.put(864, CP864);  // Arabic
         ANSICPG_MAP.put(865, CP865);  // Norwegian
         ANSICPG_MAP.put(866, CP866);  // Soviet Union
         ANSICPG_MAP.put(874, MS8742);  // Thai
         ANSICPG_MAP.put(932, MS932);  // Japanese
         ANSICPG_MAP.put(936, MS9362);  // Simplified Chinese
         ANSICPG_MAP.put(949, CP949);  // Korean
         ANSICPG_MAP.put(950, CP950);  // Traditional Chinese
         ANSICPG_MAP.put(1250, CP12502);  // Eastern European
         ANSICPG_MAP.put(1251, CP12512);  // Cyrillic
         ANSICPG_MAP.put(1252, CP1252);  // Western European
         ANSICPG_MAP.put(1253, CP12532);  // Greek
         ANSICPG_MAP.put(1254, CP12542);  // Turkish
         ANSICPG_MAP.put(1255, CP12552);  // Hebrew
         ANSICPG_MAP.put(1256, CP12562);  // Arabic
         ANSICPG_MAP.put(1257, CP12572);  // Baltic
         ANSICPG_MAP.put(1258, CP12582);  // Vietnamese
         ANSICPG_MAP.put(1361, X_JOHAB);  // Johab
         ANSICPG_MAP.put(10000, MAC_ROMAN);  // Mac Roman
         ANSICPG_MAP.put(10001, SHIFT_JIS);  // Mac Japan
         ANSICPG_MAP.put(10004, MAC_ARABIC);  // Mac Arabic
         ANSICPG_MAP.put(10005, MAC_HEBREW);  // Mac Hebrew
         ANSICPG_MAP.put(10006, MAC_GREEK);  // Mac Greek
         ANSICPG_MAP.put(10007, MAC_CYRILLIC);  // Mac Cyrillic
         ANSICPG_MAP.put(10029, X_MAC_CENTRAL_EUROPE);  // MAC Latin2
         ANSICPG_MAP.put(10081, MAC_TURKISH);  // Mac Turkish
         ANSICPG_MAP.put(57002, X_ISCII91);   // Devanagari

         // TODO: in theory these other charsets are simple
         // shifts off of Devanagari, so we could impl that
         // here:
         ANSICPG_MAP.put(57003, WINDOWS_57003);   // Bengali
         ANSICPG_MAP.put(57004, WINDOWS_57004);   // Tamil
         ANSICPG_MAP.put(57005, WINDOWS_57005);   // Telugu
         ANSICPG_MAP.put(57006, WINDOWS_57006);   // Assamese
         ANSICPG_MAP.put(57007, WINDOWS_57007);   // Oriya
         ANSICPG_MAP.put(57008, WINDOWS_57008);   // Kannada
         ANSICPG_MAP.put(57009, WINDOWS_57009);   // Malayalam
         ANSICPG_MAP.put(57010, WINDOWS_57010);   // Gujarati
         ANSICPG_MAP.put(57011, WINDOWS_57011);   // Punjabi
     }

     // Used when we decode bytes -> chars using CharsetDecoder:
     private final char[] outputArray = new char[128];
     private final Buffer outputCharBuffer = CharBuffer.wrap(outputArray);
     // Holds the font table from this RTF doc, mapping
     // the font number (from \fN control word) to the
     // corresponding charset:
     private final Map<Integer, Charset> fontToCharset = new HashMap<>();
     // Group stack: when we open a new group, we push
     // the previous group state onto the stack; when we
     // close the group, we restore it
     private final LinkedList<GroupState> groupStates = new LinkedList<>();
     private final StringBuilder pendingBuffer = new StringBuilder();
     private final XHTMLContentHandler out;
     private final Metadata metadata;
     private final RTFEmbObjHandler embObjHandler;
     // How many next ansi chars we should skip; this
     // is 0 except when we are still in the "ansi
     // shadow" after seeing a unicode escape, at which
     // point it's set to the last ucN skip we had seen:
     int ansiSkip = 0;
     private int written = 0;
     // Hold pending bytes (encoded in the current charset)
     // for text output:
     private byte[] pendingBytes = new byte[16];
     private int pendingByteCount;
     private Buffer pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
     // Holds pending chars for text output
     private char[] pendingChars = new char[10];
     private int pendingCharCount;
     // Holds chars for a still-being-tokenized control word
     private byte[] pendingControl = new byte[10];
     private int pendingControlCount;
     // Reused when possible:
     private CharsetDecoder decoder;
     private Charset lastCharset;
     private Charset globalCharset = WINDOWS_1252;
     private int globalDefaultFont = -1;
     private int curFontID = -1;
     // Current group state; in theory this initial
     // GroupState is unused because the RTF doc should
     // immediately open the top group (start with {):
     private GroupState groupState = new GroupState();
     private boolean inHeader = true;
     //0 not yet in font table, 1 in font table, 2 have processed font table
     private int fontTableState = 0;
     //depth at which the font table started
     private int fontTableDepth;
     // Non null if we are processing metadata (title,
     // keywords, etc.) inside the info group:
     private Property nextMetaData;
     private boolean inParagraph;
     // Non-zero if we are processing inside a field destination:
     private int fieldState;
     // Non-zero list index
     private int pendingListEnd;
     private Map<Integer, ListDescriptor> listTable = new HashMap<>();
     private Map<Integer, ListDescriptor> listOverrideTable = new HashMap<>();
     private Map<Integer, ListDescriptor> currentListTable;
     private ListDescriptor currentList;
     private int listTableLevel = -1;
     private boolean ignoreListMarkup;
     // Non-null if we've seen the url for a HYPERLINK but not yet
     // its text:
     private String pendingURL;
     // Used to process the sub-groups inside the upr
     // group:
     private int uprState = -1;
     // Used when extracting CREATION date:
     private int year, month, day, hour, minute;

     //This keeps track of the following elements as they are
     //written to the handler: p, li, ol, ul
     //This tries to prevent malformed tag orders in the RTF
     //e.g. <p></li></ol></p>
     //from generating malformed xml tags. (TIKA-2899)
     //This may conceal problems with our parser.
     //TODO:
     // 1) do we need to add all elements, a, b, i, etc.
     // 2) are we doing the right thing by ignoring an element
     //    if its match doesn't pop off the stack...or should
     //    we pop all at the first failure.
     private Stack<String> paragraphStack = new Stack<>();
     //this is an arbitrary limit on the size of the stack
     //to defend against DoS with memory consumption
     private int maxStackSize = 1000;

     /**
      * Creates an extractor bound to the given output sinks.
      *
      * @param out           handler that receives the extracted XHTML events
      * @param metadata      metadata object stored for use during extraction
      * @param embObjHandler handler stored for embedded object / picture data
      */
     public TextExtractor(XHTMLContentHandler out, Metadata metadata,
                          RTFEmbObjHandler embObjHandler) {
         this.out = out;
         this.embObjHandler = embObjHandler;
         this.metadata = metadata;
     }

     /**
      * Resolves a charset by name via {@code CharsetUtils.forName}.
      *
      * @param name charset name to look up
      * @return the resolved charset, or US-ASCII when the lookup throws
      *         {@link IllegalArgumentException}
      */
     private static Charset getCharset(String name) {
         Charset resolved;
         try {
             resolved = CharsetUtils.forName(name);
         } catch (IllegalArgumentException unsupported) {
             // Deliberate best-effort: fall back instead of failing class init.
             resolved = ASCII;
         }
         return resolved;
     }

     /**
      * Tells whether {@code ch} is an ASCII hexadecimal digit
      * (0-9, a-f or A-F).  Any other value, including -1, yields false.
      *
      * @param ch value to classify
      * @return true if {@code ch} is a hex digit
      */
     protected static boolean isHexChar(int ch) {
         if (ch >= '0' && ch <= '9') {
             return true;
         }
         if (ch >= 'a' && ch <= 'f') {
             return true;
         }
         return ch >= 'A' && ch <= 'F';
     }

     /**
      * Tells whether {@code ch} is an ASCII letter (a-z or A-Z).
      *
      * @param ch value to classify
      * @return true if {@code ch} is an ASCII letter
      */
     private static boolean isAlpha(int ch) {
         final boolean lowerCase = ch >= 'a' && ch <= 'z';
         final boolean upperCase = ch >= 'A' && ch <= 'Z';
         return lowerCase || upperCase;
     }

     /**
      * Tells whether {@code ch} is an ASCII decimal digit (0-9).
      *
      * @param ch value to classify
      * @return true if {@code ch} is a decimal digit
      */
     private static boolean isDigit(int ch) {
         return !(ch < '0' || ch > '9');
     }

     /**
      * Converts a digit character to its numeric value: '0'-'9' map to 0-9,
      * and letters map to 10 plus their offset from 'a' or 'A'.
      * <p>
      * The letter ranges accepted here are a-z / A-Z, i.e. wider than the
      * a-f / A-F range that {@code isHexChar} accepts; anything outside all
      * three ranges trips the assertion when assertions are enabled.
      *
      * @param ch digit or letter to convert
      * @return the numeric value of {@code ch}
      */
     protected static int hexValue(int ch) {
         if (ch >= 'a' && ch <= 'z') {
             return 10 + (ch - 'a');
         }
         if (ch >= '0' && ch <= '9') {
             return ch - '0';
         }
         assert ch >= 'A' && ch <= 'Z';
         return 10 + (ch - 'A');
     }

     /**
      * @return the current value of the ignore-list-markup flag
      */
     public boolean isIgnoringLists() {
         return this.ignoreListMarkup;
     }

     /**
      * Sets the ignore-list-markup flag reported by {@code isIgnoringLists()}.
      *
      * @param ignore new value of the flag
      */
     public void setIgnoreListMarkup(boolean ignore) {
         ignoreListMarkup = ignore;
     }

     // Push pending bytes or pending chars:
     /**
      * Flushes whichever text buffer currently holds data: buffered bytes
      * are decoded and pushed when any are pending, otherwise the pending
      * chars (if any) are pushed.
      */
     private void pushText() throws IOException, SAXException, TikaException {
         if (pendingByteCount == 0) {
             pushChars();
             return;
         }
         // Bytes and chars are not expected to be pending at the same time.
         assert pendingCharCount == 0;
         pushBytes();
     }

     // Buffers the byte (unit in the current charset) for
     // output:
     /**
      * Accepts one byte of text encoded in the current charset.  Pending
      * chars are flushed first; then the byte is either handed to the
      * embedded-object handler (while {@code groupState.pictDepth > 0}) or
      * appended to {@code pendingBytes}, growing that array when full.
      *
      * @param b byte value, asserted to be in the range 0-255
      */
     private void addOutputByte(int b) throws IOException, SAXException, TikaException {
         assert b >= 0 && b < 256 : "byte value out of range: " + b;

         if (pendingCharCount != 0) {
             pushChars();
         }
         if (groupState.pictDepth > 0) {
             embObjHandler.writeMetadataChar((char) b);
             return;
         }

         final int capacity = pendingBytes.length;
         if (pendingByteCount == capacity) {
             // Grow by a quarter and re-wrap so the buffer view tracks the
             // new backing array.
             final byte[] grown = new byte[(int) (capacity * 1.25)];
             System.arraycopy(pendingBytes, 0, grown, 0, capacity);
             pendingBytes = grown;
             pendingByteBuffer = ByteBuffer.wrap(grown);
         }
         pendingBytes[pendingByteCount] = (byte) b;
         pendingByteCount++;
     }

     // Buffers a byte as part of a control word:
     /**
      * Appends one letter to the control word currently being tokenized,
      * growing {@code pendingControl} when it is full.
      *
      * @param b the letter to append; asserted to be an ASCII letter
      */
     private void addControl(int b) {
         assert isAlpha(b);
         final int capacity = pendingControl.length;
         if (pendingControlCount == capacity) {
             // Grow by a quarter, keeping what has been collected so far.
             final byte[] grown = new byte[(int) (capacity * 1.25)];
             System.arraycopy(pendingControl, 0, grown, 0, capacity);
             pendingControl = grown;
         }
         pendingControl[pendingControlCount] = (byte) b;
         pendingControlCount++;
     }

     // Buffers a UTF16 code unit for output
     /**
      * Accepts one UTF-16 code unit of text.  Pending bytes are flushed
      * first.  The unit then goes to {@code pendingBuffer} while in the
      * header or when {@code fieldState == 1}, to the embedded-object
      * handler while the group's {@code sn} or {@code sv} flag is set, and
      * otherwise to {@code pendingChars}, which grows when full.
      *
      * @param ch the code unit to buffer
      */
     private void addOutputChar(char ch) throws IOException, SAXException, TikaException {
         if (pendingByteCount != 0) {
             pushBytes();
         }

         if (inHeader || fieldState == 1) {
             pendingBuffer.append(ch);
             return;
         }
         if (groupState.sn || groupState.sv) {
             embObjHandler.writeMetadataChar(ch);
             return;
         }

         final int capacity = pendingChars.length;
         if (pendingCharCount == capacity) {
             // Grow by a quarter, keeping what has been collected so far.
             final char[] grown = new char[(int) (capacity * 1.25)];
             System.arraycopy(pendingChars, 0, grown, 0, capacity);
             pendingChars = grown;
         }
         pendingChars[pendingCharCount] = ch;
         pendingCharCount++;
     }

     // Shallow parses the entire doc, writing output to
     // this.out and this.metadata
     /**
      * Entry point: wraps {@code in} in a {@link PushbackInputStream} with a
      * two-byte pushback buffer and runs the extraction over it.
      *
      * @param in stream positioned at the start of the RTF document
      */
     public void extract(InputStream in) throws IOException, SAXException, TikaException {
         final PushbackInputStream pushback = new PushbackInputStream(in, 2);
         extract(pushback);
     }

     /**
      * Main tokenizer loop.  Reads the stream byte by byte until EOF or
      * until the group stack becomes empty after a closing brace, and
      * dispatches each byte: backslash starts a control token, braces open
      * or close a group, and anything else is either hex data for the
      * embedded-object handler or a text byte.  Afterwards the open
      * paragraph and any tags left on {@code paragraphStack} are closed and
      * the document is ended.
      *
      * @param in the pushback-capable stream to read from
      */
     private void extract(PushbackInputStream in) throws IOException, SAXException, TikaException {
         out.startDocument();

         int b;
         while ((b = in.read()) != -1) {
             if (b == '\\') {
                 parseControlToken(in);
                 continue;
             }
             if (b == '{') {
                 pushText();
                 processGroupStart(in);
                 continue;
             }
             if (b == '}') {
                 pushText();
                 processGroupEnd();
                 if (groupStates.isEmpty()) {
                     // parsed document closing brace
                     break;
                 }
                 continue;
             }
             if (groupState.objdata == true || groupState.pictDepth == 1) {
                 embObjHandler.writeHexChar(b);
                 continue;
             }
             // Linefeed and carriage return are not significant
             if (b == '\r' || b == '\n') {
                 continue;
             }
             final boolean wanted = !groupState.ignore || nextMetaData != null
                     || groupState.sn == true || groupState.sv == true;
             if (!wanted) {
                 continue;
             }
             if (ansiSkip != 0) {
                 ansiSkip--;
             } else {
                 addOutputByte(b);
             }
         }

         endParagraph(false);

         //close out whatever tags were left
         while (!paragraphStack.isEmpty()) {
             end(paragraphStack.pop());
         }
         out.endDocument();
     }

     /**
      * Handles the token that follows a backslash.  The next byte decides
      * the kind: an apostrophe starts a hex escape, a letter starts a
      * control word, an escaped brace, backslash, CR or LF is emitted as a
      * text byte, and any other byte is treated as a control symbol.  EOF
      * right after the backslash is ignored.
      *
      * @param in stream positioned just after the backslash
      */
     private void parseControlToken(PushbackInputStream in)
             throws IOException, SAXException, TikaException {
         final int b = in.read();
         if (b == -1) {
             return;
         }
         if (isAlpha(b)) {
             // control word
             parseControlWord((char) b, in);
             return;
         }
         switch (b) {
             case '\'':
                 // escaped hex char
                 parseHexChar(in);
                 break;
             case '{':
             case '}':
             case '\\':
             case '\r':
             case '\n':
                 // escaped char
                 addOutputByte(b);
                 break;
             default:
                 // control symbol, eg \* or \~
                 processControlSymbol((char) b);
                 break;
         }
     }

     /**
      * Parses the two hex digits of a {@code \'hh} escape and emits the
      * decoded byte, unless an ANSI skip is still pending, in which case
      * the escape only decrements {@code ansiSkip}.
      * <p>
      * A malformed escape is dropped: the offending byte is pushed back so
      * the main loop sees it again.  End of stream is never pushed back,
      * because {@code PushbackInputStream.unread(-1)} would store the byte
      * 0xFF, which the next read would return as 255 and which would then
      * be emitted as a spurious character at the end of a truncated
      * document.
      *
      * @param in stream positioned just after the apostrophe
      */
     private void parseHexChar(PushbackInputStream in)
             throws IOException, SAXException, TikaException {
         int hex1 = in.read();
         if (!isHexChar(hex1)) {
             // DOC ERROR (malformed hex escape): ignore
             if (hex1 != -1) {
                 in.unread(hex1);
             }
             return;
         }

         int hex2 = in.read();
         if (!isHexChar(hex2)) {
             // TODO: log a warning here, somehow?
             // DOC ERROR (malformed hex escape):
             // ignore
             if (hex2 != -1) {
                 in.unread(hex2);
             }
             return;
         }

         if (ansiSkip != 0) {
             // Skip this ansi char since we are
             // still in the shadow of a unicode
             // escape:
             ansiSkip--;
         } else {
             // Unescape:
             addOutputByte(16 * hexValue(hex1) + hexValue(hex2));
         }
     }

     /**
      * Tokenizes one control word: the run of letters starting with
      * {@code firstChar}, an optional '-' sign, and an optional run of
      * decimal digits forming the parameter.  A single space delimiter is
      * consumed; any other delimiter byte is pushed back for the main loop.
      * The word is then dispatched to {@code processControlWord}, with the
      * parameter when one was present, and the control buffer is reset.
      * <p>
      * End of stream is never pushed back: {@code unread(-1)} would store
      * the byte 0xFF, which the next read would return as 255 and which
      * would be emitted as a spurious character when the document ends
      * right after a control word.
      *
      * @param firstChar first letter of the control word (already read)
      * @param in        stream positioned just after {@code firstChar}
      */
     private void parseControlWord(int firstChar, PushbackInputStream in)
             throws IOException, SAXException, TikaException {
         addControl(firstChar);

         int b = in.read();
         while (isAlpha(b)) {
             addControl(b);
             b = in.read();
         }

         boolean hasParam = false;
         boolean negParam = false;
         if (b == '-') {
             negParam = true;
             hasParam = true;
             b = in.read();
         }

         int param = 0;
         while (isDigit(b)) {
             param *= 10;
             param += (b - '0');
             hasParam = true;
             b = in.read();
         }

         // space is consumed as part of the
         // control word, but is not added to the
         // control word; EOF must not be pushed back
         if (b != ' ' && b != -1) {
             in.unread(b);
         }

         if (hasParam) {
             if (negParam) {
                 param = -param;
             }
             processControlWord(param, in);
         } else {
             processControlWord();
         }

         pendingControlCount = 0;
     }

     /**
      * Opens a paragraph-level element if none is open.  Nothing happens
      * when {@code inParagraph} is set or a {@code p} tag is already on
      * {@code paragraphStack}.  Otherwise open bold/italic tags are closed,
      * a pending list end or a list start is handled, an {@code li} (inside
      * a list) or {@code p} element is started and recorded, and bold /
      * italic are reopened inside it.
      */
     private void lazyStartParagraph() throws IOException, SAXException, TikaException {
         if (inParagraph || paragraphStack.contains(P)) {
             return;
         }

         // Ensure </i></b> order
         if (groupState.italic) {
             end("i");
         }
         if (groupState.bold) {
             end("b");
         }

         if (pendingListEnd != 0 && groupState.list != pendingListEnd) {
             endList(pendingListEnd);
             pendingListEnd = 0;
         }
         if (inList() && pendingListEnd != groupState.list) {
             startList(groupState.list);
         }

         final String blockTag = inList() ? LI : P;
         start(blockTag);
         pushParagraphTag(blockTag);

         // Ensure <b><i> order
         if (groupState.bold) {
             start("b");
         }
         if (groupState.italic) {
             start("i");
         }
         inParagraph = true;
     }

     /**
      * Records {@code tag} on {@code paragraphStack}.  Once the stack holds
      * {@code maxStackSize} entries, further tags are silently dropped.
      *
      * @param tag element name to record
      */
     private void pushParagraphTag(String tag) {
         if (paragraphStack.size() >= maxStackSize) {
             //ignore.  Something is very, very wrong...
             return;
         }
         paragraphStack.push(tag);
     }

     /**
      * Ends the current paragraph-level element.  Pending text is flushed
      * and, if no paragraph is open, one is started first so that an empty
      * paragraph is still emitted.  Open italic/bold tags are closed, then
      * the top of {@code paragraphStack} is closed if it is the expected
      * tag ({@code li} inside a list, {@code p} otherwise); on a mismatch
      * every recorded tag is closed.  Finally a pending list end is
      * emitted unless styles are being preserved.
      *
      * @param preserveStyles when true, the bold/italic flags keep their
      *                       value and, if either is set, a new {@code p}
      *                       with those styles is opened immediately
      */
     private void endParagraph(boolean preserveStyles)
             throws IOException, SAXException, TikaException {
         pushText();
         //maintain consecutive new lines
         if (!inParagraph) {
             lazyStartParagraph();
         }

         if (inParagraph || !paragraphStack.isEmpty()) {
             if (groupState.italic) {
                 end("i");
                 groupState.italic = preserveStyles;
             }
             if (groupState.bold) {
                 end("b");
                 groupState.bold = preserveStyles;
             }

             final String expectedTag = inList() ? LI : P;
             boolean misaligned = false;
             if (!paragraphStack.isEmpty()) {
                 final String top = paragraphStack.pop();
                 if (expectedTag.equals(top)) {
                     end(expectedTag);
                 } else {
                     // Not the tag we expected: put it back and flag it.
                     pushParagraphTag(top);
                     misaligned = true;
                 }
             }

             //if there was a failure in tag alignment,
             //dump all tags and start fresh.
             if (misaligned) {
                 while (!paragraphStack.isEmpty()) {
                     end(paragraphStack.pop());
                 }
             }

             final boolean reopen = preserveStyles && (groupState.bold || groupState.italic);
             if (reopen) {
                 start(P);
                 pushParagraphTag(P);
                 if (groupState.bold) {
                     start("b");
                 }
                 if (groupState.italic) {
                     start("i");
                 }
             }
             inParagraph = reopen;
         }

         // Ensure closing the list at document end
         if (!preserveStyles && pendingListEnd != 0) {
             endList(pendingListEnd);
             pendingListEnd = 0;
         }
     }

     // Push pending UTF16 units to out ContentHandler
     /**
      * Sends the buffered chars, if any, to the content handler, opening a
      * paragraph first when needed, and empties the char buffer.
      */
     private void pushChars() throws IOException, SAXException, TikaException {
         if (pendingCharCount == 0) {
             return;
         }
         lazyStartParagraph();
         out.characters(pendingChars, 0, pendingCharCount);
         pendingCharCount = 0;
     }

     // Decodes the buffered bytes in pendingBytes
     // into UTF16 code units, and sends the characters
     // to the out ContentHandler, if we are in the body,
     // else appends the characters to the pendingBuffer
     /**
      * Decodes the bytes buffered in {@code pendingBytes} with the decoder
      * for the current charset and emits the resulting chars.  Decoding is
      * skipped when the current group is ignored and no metadata property
      * is being collected; the byte buffer is emptied either way.
      */
     private void pushBytes() throws IOException, SAXException, TikaException {
         if (pendingByteCount > 0 && (!groupState.ignore || nextMetaData != null)) {

             final CharsetDecoder activeDecoder = getDecoder();
             pendingByteBuffer.limit(pendingByteCount);
             assert pendingByteBuffer.position() == 0;
             assert outputCharBuffer.position() == 0;

             final ByteBuffer source = (ByteBuffer) pendingByteBuffer;
             final CharBuffer sink = (CharBuffer) outputCharBuffer;

             // We pass true for endOfInput because, when
             // we are called, we should have seen a
             // complete sequence of characters for this
             // charset:
             CoderResult status;
             do {
                 status = activeDecoder.decode(source, sink, true);
                 emitDecodedChars();
             } while (status != CoderResult.UNDERFLOW);

             do {
                 status = activeDecoder.flush(sink);
                 emitDecodedChars();
             } while (status != CoderResult.UNDERFLOW);

             // Reset for next decode
             activeDecoder.reset();
             pendingByteBuffer.position(0);
         }

         pendingByteCount = 0;
     }

     // Emits whatever the decoder has written to outputArray so far (to
     // pendingBuffer in the header or when fieldState == 1, else to the
     // content handler) and rewinds outputCharBuffer for the next round.
     private void emitDecodedChars() throws IOException, SAXException, TikaException {
         final int produced = outputCharBuffer.position();
         if (produced <= 0) {
             return;
         }
         if (inHeader || fieldState == 1) {
             pendingBuffer.append(outputArray, 0, produced);
         } else {
             lazyStartParagraph();
             out.characters(outputArray, 0, produced);
         }
         outputCharBuffer.position(0);
     }

     // Returns true when the control word currently buffered in
     // pendingControl spells exactly s.
     // NOTE: s must be ascii alpha only
     private boolean equals(String s) {
         final int expectedLength = s.length();
         if (expectedLength != pendingControlCount) {
             return false;
         }
         int i = 0;
         while (i < expectedLength) {
             final char expected = s.charAt(i);
             assert isAlpha(expected);
             if (pendingControl[i] != ((byte) expected)) {
                 return false;
             }
             i++;
         }
         return true;
     }

     /**
      * Handles a control symbol (a backslash followed by a single
      * non-letter character) by emitting the matching unicode character,
      * if there is one.
      *
      * @param ch the character that followed the backslash
      */
     private void processControlSymbol(char ch) throws IOException, SAXException, TikaException {
         if (ch == '~') {
             // Non-breaking space -> unicode NON-BREAKING SPACE
             addOutputChar('\u00a0');
         } else if (ch == '-') {
             // Optional hyphen -> unicode SOFT HYPHEN
             addOutputChar('\u00ad');
         } else if (ch == '_') {
             // Non-breaking hyphen -> unicode NON-BREAKING HYPHEN
             addOutputChar('\u2011');
         }
         // '*' marks an ignorable destination (control words defined after
         // the 1987 RTF spec); that is already handled by
         // processGroupStart(). Every other symbol produces no output.
     }

     /**
      * Returns a decoder for the charset currently in use. The decoder is
      * cached and only rebuilt when the charset differs from the one used
      * for the previous call; malformed and unmappable input is replaced
      * rather than reported.
      */
     private CharsetDecoder getDecoder() throws TikaException {
         final Charset current = getCharset();

         // Common case: same charset as last time, so the cached decoder
         // can be handed out unchanged.
         final boolean reusable = lastCharset != null && current.equals(lastCharset);
         if (!reusable) {
             decoder = current.newDecoder()
                     .onMalformedInput(CodingErrorAction.REPLACE)
                     .onUnmappableCharacter(CodingErrorAction.REPLACE);
             lastCharset = current;
         }

         return decoder;
     }

     /**
      * Returns the charset currently in use, trying in order: the charset
      * of the font selected in the current group, the charset of the global
      * default font (only once outside the header), and the global charset.
      *
      * @throws TikaException if none of the three is available
      */
     private Charset getCharset() throws TikaException {
         // A specific font (fN) was set: its charset wins
         final Charset fromFont = groupState.fontCharset;
         if (fromFont != null) {
             return fromFont;
         }

         // Otherwise the global default font (deffN), if it maps to a charset
         if (!inHeader && globalDefaultFont != -1) {
             final Charset fromDefaultFont = fontToCharset.get(globalDefaultFont);
             if (fromDefaultFont != null) {
                 return fromDefaultFont;
             }
         }

         // Otherwise the global charset
         if (globalCharset != null) {
             return globalCharset;
         }

         throw new TikaException("unable to determine charset");
     }

     /**
      * Handles a control word that carries a numeric parameter.
      * <p>
      * While {@code inHeader} is set this records the ANSI code page, the
      * default font, document statistics, the date fields, the font table
      * (font number to charset) and list table entries, and leaves the
      * header when a font change is seen after the font table. Outside the
      * header it handles bold/italic being switched off, font changes and
      * list membership. In both cases it then handles the unicode escape,
      * the unicode skip count and raw binary data.
      *
      * @param param the numeric parameter that followed the control word
      * @param in    the stream being parsed; it is only read here for the
      *              {@code bin} control word, whose {@code param} raw bytes
      *              are either handed to the embedded object handler or
      *              skipped
      * @throws IOException   if skipping binary data fails or an I/O error
      *                       is raised by one of the called helpers
      * @throws SAXException  if one of the called helpers fails to write to
      *                       the output handler
      * @throws TikaException if one of the called helpers fails
      */
     private void processControlWord(int param, PushbackInputStream in)
             throws IOException, SAXException, TikaException {
         // TODO: afN?  (associated font number)

         // TODO: do these alter text output...? Each takes a font table
         // entry as its argument N:
         //   stshfdbch - font used by default in the style sheet for
         //               East Asian chars
         //   stshfloch - font used by default in the style sheet for
         //               ASCII chars
         //   stshfhich - font used by default in the style sheet for
         //               High Ansi chars
         //   stshfbi   - style sheet font for Complex Scripts (BIDI) chars

         // TODO: inefficient that we check equals N times;
         // we'd get better perf w/ real lexer (eg
         // JFlex), which uses single-pass FSM to do cmp:
         if (inHeader) {
             if (equals("ansicpg")) {
                 // ANSI codepage; unknown code pages leave the current
                 // global charset untouched
                 Charset cs = ANSICPG_MAP.get(param);
                 if (cs != null) {
                     globalCharset = cs;
                 }
             } else if (equals("deff")) {
                 // Default font
                 globalDefaultFont = param;
             } else if (equals("nofpages")) {
                 metadata.add(Office.PAGE_COUNT, Integer.toString(param));
             } else if (equals("nofwords")) {
                 metadata.add(Office.WORD_COUNT, Integer.toString(param));
             } else if (equals("nofchars")) {
                 metadata.add(Office.CHARACTER_COUNT, Integer.toString(param));
             } else if (equals("yr")) {
                 // Date fields; they are combined into a date when the
                 // enclosing group ends
                 year = param;
             } else if (equals("mo")) {
                 month = param;
             } else if (equals("dy")) {
                 day = param;
             } else if (equals("hr")) {
                 hour = param;
             } else if (equals("min")) {
                 minute = param;
             }

             if (fontTableState == 1) {
                 // Still inside font table -- record the
                 // mappings of fN to the fcharset:
                 if (groupState.depth < fontTableDepth) {
                     // We have dropped below the depth at which the font
                     // table started, so the table is finished
                     fontTableState = 2;
                 } else {
                     if (equals("f")) {
                         // Start new font definition
                         curFontID = param;
                     } else if (equals("fcharset")) {
                         Charset cs = FCHARSET_MAP.get(param);
                         if (cs != null) {
                             fontToCharset.put(curFontID, cs);
                         }
                     }
                 }
             }
             //if you've already seen the font table,
             //you aren't in another header item (e.g. styles)
             //and you see an fX, you're out of the header
             if (fontTableState == 2 && !groupState.ignore && equals("f")) {
                 inHeader = false;
             }

             if (currentList != null) {
                 if (equals("listid")) {
                     currentList.id = param;
                     currentListTable.put(currentList.id, currentList);
                 } else if (equals("listtemplateid")) {
                     currentList.templateID = param;
                 } else if (equals("levelnfc") || equals("levelnfcn")) {
                     //sanity check to make sure list information isn't corrupt
                     if (listTableLevel > -1 && listTableLevel < currentList.numberType.length) {
                         currentList.numberType[listTableLevel] = param;
                     }
                 }
             }
         } else {
             // In document
             if (equals("b")) {
                 // b0
                 assert param == 0;
                 if (groupState.bold) {
                     pushText();
                     // Italic is nested inside bold, so it has to be closed
                     // around the closing of bold and reopened afterwards
                     if (groupState.italic) {
                         end("i");
                     }
                     end("b");
                     if (groupState.italic) {
                         start("i");
                     }
                     groupState.bold = false;
                 }
             } else if (equals("i")) {
                 // i0
                 assert param == 0;
                 if (groupState.italic) {
                     pushText();
                     end("i");
                     groupState.italic = false;
                 }
             } else if (equals("f")) {
                 // Change current font
                 Charset fontCharset = fontToCharset.get(param);

                 // Push any buffered text before changing
                 // font:
                 pushText();

                 if (fontCharset != null) {
                     groupState.fontCharset = fontCharset;
                 } else {
                     // DOC ERROR: font change referenced a
                     // non-table'd font number
                     // TODO: log a warning?  Throw an exc?
                     groupState.fontCharset = null;
                 }
             } else if (equals("ls")) {
                 groupState.list = param;
             } else if (equals("lslvl")) {
                 groupState.listLevel = param;
             }
         }

         // Process unicode escape. This can appear in doc
         // or in header, since the metadata (info) fields
         // in the header can be unicode escaped as well:
         if (equals("u")) {
             // Unicode escape
             if (!groupState.ignore || groupState.sv || groupState.sn) {
                 // The parameter may be negative; masking keeps the low
                 // 16 bits as the UTF-16 code unit
                 final char utf16CodeUnit = (char) (param & 0xffff);
                 addOutputChar(utf16CodeUnit);
             }

             // After seeing a unicode escape we must
             // skip the next ucSkip ansi chars (the
             // "unicode shadow")
             ansiSkip = groupState.ucSkip;
         } else if (equals("uc")) {
             // Change unicode shadow length. The value is copied into
             // ansiSkip above as a number of chars to skip, so a negative
             // parameter (only possible in a malformed document) is
             // treated as "skip nothing" rather than stored as a
             // negative count.
             groupState.ucSkip = Math.max(0, param);
         } else if (equals("bin")) {
             if (param >= 0) {
                 if (groupState.pictDepth == 1) {
                     try {
                         embObjHandler.writeBytes(in, param);
                     } catch (IOException | TikaException e) {
                         // Record the failure and drop the partial object
                         // instead of aborting the parse
                         EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                         embObjHandler.reset();
                     }
                 } else {
                     // Binary data outside of a picture: skip over it
                     IOUtils.skipFully(in, param);
                 }
             } else {
                 // Negative length: nothing is consumed.
                 // log some warning?
             }
         }
     }

     // True when list markup is being emitted and the current group
     // has a non-zero list id.
     private boolean inList() {
         if (ignoreListMarkup) {
             return false;
         }
         return groupState.list != 0;
     }

     /**
      * Remembers the list of the current group as waiting to be closed and
      * clears it from the group state. Deferring the close allows list items
      * of the same list to be merged within the same enclosing list tag
      * (ie. either {@code "ul"} or {@code "ol"}).
      */
     private void pendingListEnd() {
         final int closingList = groupState.list;
         groupState.list = 0;
         pendingListEnd = closingList;
     }

     /**
      * Emits the end tag of a list, unless list markup is being ignored.
      * {@link #isUnorderedList(int)} decides which list tag applies to the
      * given <code>listID</code>. The top of the paragraph stack is popped,
      * and the end tag is only written when that entry is the expected
      * list tag.
      *
      * @param listID The ID of the list.
      * @throws IOException   if the end tag cannot be written
      * @throws SAXException  if the end tag cannot be written
      * @throws TikaException if the end tag cannot be written
      */
     private void endList(int listID) throws IOException, SAXException, TikaException {
         if (ignoreListMarkup) {
             return;
         }

         final String listTag;
         if (isUnorderedList(listID)) {
             listTag = UL;
         } else {
             listTag = OL;
         }

         if (paragraphStack.isEmpty()) {
             // stack is empty, the list was never started
             return;
         }

         final String top = paragraphStack.pop();
         if (listTag.equals(top)) {
             end(listTag);
         }
     }

     /**
      * Emits the start tag of a list and records it on the paragraph stack,
      * unless list markup is being ignored. {@link #isUnorderedList(int)}
      * decides which list tag applies to the given <code>listID</code>.
      *
      * @param listID The ID of the list.
      * @throws IOException   if the start tag cannot be written
      * @throws SAXException  if the start tag cannot be written
      * @throws TikaException if the start tag cannot be written
      */
     private void startList(int listID) throws IOException, SAXException, TikaException {
         if (ignoreListMarkup) {
             return;
         }

         final String listTag;
         if (isUnorderedList(listID)) {
             listTag = UL;
         } else {
             listTag = OL;
         }
         start(listTag);
         pushParagraphTag(listTag);
     }

     // Looks the list up in listTable and asks it whether the current
     // group's list level is unordered; a list that is not in the table
     // is treated as unordered.
     private boolean isUnorderedList(int listID) {
         final ListDescriptor descriptor = listTable.get(listID);
         return descriptor == null || descriptor.isUnordered(groupState.listLevel);
     }

     /**
      * Ends the element named {@code tag} on the {@code out} handler.
      *
      * @param tag name of the element to end
      */
     private void end(String tag) throws IOException, SAXException, TikaException {
         out.endElement(tag);
     }

     /**
      * Starts an element named {@code tag} on the {@code out} handler.
      *
      * @param tag name of the element to start
      */
     private void start(String tag) throws IOException, SAXException, TikaException {
         out.startElement(tag);
     }

     /**
      * Handles a control word that carries no numeric parameter.
      * <p>
      * While {@code inHeader} is set this records the global charset, marks
      * the color, style and font tables as ignored, selects which metadata
      * property the following text belongs to, tracks the font table and
      * list table state, and leaves the header when one of a fixed set of
      * words is seen in a group that is not ignored. Outside the header it
      * switches bold and italic on.
      * <p>
      * After that, regardless of which of the two branches ran, it handles
      * the words that reset styles, end paragraphs, un-ignore or ignore the
      * current group, drive the embedded object handler, emit special
      * characters, and advance the upr and field (hyperlink) state.
      */
     private void processControlWord() throws IOException, SAXException, TikaException {
         if (inHeader) {
             // Character set declarations
             if (equals("ansi")) {
                 globalCharset = WINDOWS_1252;
             } else if (equals("pca")) {
                 globalCharset = CP850;
             } else if (equals("pc")) {
                 globalCharset = CP437;
             } else if (equals("mac")) {
                 globalCharset = MAC_ROMAN;
             }

             // Header tables: the first three produce no text output,
             // the list tables select where list definitions are stored
             if (equals("colortbl") || equals("stylesheet") || equals("fonttbl")) {
                 groupState.ignore = true;
             } else if (equals("listtable")) {
                 currentListTable = listTable;
             } else if (equals("listoverridetable")) {
                 currentListTable = listOverrideTable;
             }

             // uprState is moved away from -1 by the upr word and by the
             // group start that follows it, and set back to -1 by the ud
             // word (see the branches further down); metadata targets are
             // only chosen while it is -1.
             if (uprState == -1) {
                 // TODO: we can also parse \creatim, \revtim,
                 // \printim, \version, etc.
                 if (equals("author")) {
                     nextMetaData = TikaCoreProperties.CREATOR;
                 } else if (equals("title")) {
                     nextMetaData = TikaCoreProperties.TITLE;
                 } else if (equals("subject")) {
                     nextMetaData = OfficeOpenXMLCore.SUBJECT;
                 } else if (equals("keywords")) {
                     nextMetaData = Office.KEYWORDS;
                 } else if (equals("category")) {
                     nextMetaData = OfficeOpenXMLCore.CATEGORY;
                 } else if (equals("comment")) {
                     nextMetaData = TikaCoreProperties.COMMENTS;
                 } else if (equals("company")) {
                     nextMetaData = OfficeOpenXMLExtended.COMPANY;
                 } else if (equals("manager")) {
                     nextMetaData = OfficeOpenXMLExtended.MANAGER;
                 } else if (equals("template")) {
                     nextMetaData = OfficeOpenXMLExtended.TEMPLATE;
                 } else if (equals("creatim")) {
                     nextMetaData = TikaCoreProperties.CREATED;
                 }
             }

             // fontTableState: 0 = font table not seen yet, 1 = inside the
             // font table, 2 = font table finished
             if (fontTableState == 0) {
                 // Didn't see font table yet
                 if (equals("fonttbl")) {
                     fontTableState = 1;
                     fontTableDepth = groupState.depth;
                 }
             } else if (fontTableState == 1) {
                 // Inside font table
                 if (groupState.depth < fontTableDepth) {
                     fontTableState = 2;
                 }
             }

             // List table handling
             if (currentListTable != null) {
                 if (equals("list") || equals("listoverride")) {
                     // A new list definition starts; no level seen yet
                     currentList = new ListDescriptor();
                     listTableLevel = -1;
                 } else if (currentList != null) {
                     if (equals("liststylename")) {
                         currentList.isStyle = true;
                     } else if (equals("listlevel")) {
                         listTableLevel++;
                     }
                 }
             }

             // Any of these words in a group that is not ignored means the
             // header is over
             if (!groupState.ignore &&
                     (equals("par") || equals("pard") || equals("sect") || equals("sectd") ||
                             equals("plain") || equals("ltrch") || equals("rtlch") ||
                             equals("htmlrtf") || equals("line"))) {
                 inHeader = false;
             }
         } else {
             if (equals("b")) {
                 if (!groupState.bold) {
                     pushText();
                     lazyStartParagraph();
                     if (groupState.italic) {
                         // Make sure nesting is always <b><i>
                         end("i");
                     }
                     groupState.bold = true;
                     start("b");
                     if (groupState.italic) {
                         start("i");
                     }
                 }
             } else if (equals("i")) {
                 if (!groupState.italic) {
                     pushText();
                     lazyStartParagraph();
                     groupState.italic = true;
                     start("i");
                 }
             }
         }

         // Snapshot of the ignore flag, taken before the branches below,
         // several of which change groupState.ignore
         final boolean ignored = groupState.ignore;
         if (equals("pard")) {
             // Reset styles
             pushText();
             if (groupState.italic) {
                 end("i");
                 groupState.italic = false;
             }
             if (groupState.bold) {
                 end("b");
                 groupState.bold = false;
             }
             if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
                 pendingListEnd();
             }
         } else if (equals("plain")) {
             if (groupState.italic || groupState.bold) {
                 // Reset styles
                 pushText();
                 if (groupState.italic) {
                     end("i");
                     groupState.italic = false;
                 }
                 if (groupState.bold) {
                     end("b");
                     groupState.bold = false;
                 }
             }
         } else if (equals("par")) {
             if (!ignored) {
                 endParagraph(true);
                 if (inList()) { // && (groupStates.size() == 1 || groupStates.peekLast().list < 0))
                     pendingListEnd();
                 }
             }
         } else if (equals("shptxt")) {
             pushText();
             // Text inside a shape
             groupState.ignore = false;
         } else if (equals("chatn")) {
             addOutputChar(SPACE);
             pushText();
             // Annotation ID
             groupState.ignore = false;
         } else if (equals("atnid")) {
             addOutputChar(SPACE);
             pushText();
             // Annotation ID
             groupState.ignore = false;
         } else if (equals("atnauthor")) {
             addOutputChar(SPACE);
             pushText();
             // Annotation author
             groupState.ignore = false;
         } else if (equals("annotation")) {
             groupState.annotation = true;
             pushText();
             // Annotation
             groupState.ignore = false;
         } else if (equals("listtext")) {
             groupState.ignore = true;
         } else if (equals("cell")) {
             // TODO: we should produce a table output here?
             //addOutputChar(' ');
             endParagraph(true);
         } else if (equals("sp")) {
             groupState.sp = true;
         } else if (equals("sn")) {
             embObjHandler.startSN();
             groupState.sn = true;
         } else if (equals("sv")) {
             embObjHandler.startSV();
             groupState.sv = true;
         } else if (equals("object")) {
             pushText();
             embObjHandler.setInObject(true);
             groupState.object = true;
         } else if (equals("objdata")) {
             groupState.objdata = true;
             embObjHandler.startObjData();
         } else if (equals("pict")) {
             pushText();
             // TODO: create img tag?  but can that support
             // embedded image data?
             groupState.pictDepth = 1;
             embObjHandler.startPict();
         } else if (equals("line")) {
             // The break and special character words below only produce
             // output when the group was not ignored on entry
             if (!ignored) {
                 addOutputChar('\n');
             }
         } else if (equals("column")) {
             if (!ignored) {
                 addOutputChar(' ');
             }
         } else if (equals("page")) {
             if (!ignored) {
                 addOutputChar('\n');
             }
         } else if (equals("softline")) {
             if (!ignored) {
                 addOutputChar('\n');
             }
         } else if (equals("softcolumn")) {
             if (!ignored) {
                 addOutputChar(' ');
             }
         } else if (equals("softpage")) {
             if (!ignored) {
                 addOutputChar('\n');
             }
         } else if (equals("tab")) {
             if (!ignored) {
                 addOutputChar('\t');
             }
         } else if (equals("upr")) {
             uprState = 0;
         } else if (equals("ud") && uprState == 1) {
             uprState = -1;
             // 2nd group inside the upr destination, which
             // contains the unicode encoding of the text, so
             // we want to keep that:
             groupState.ignore = false;
         } else if (equals("bullet")) {
             if (!ignored) {
                 // unicode BULLET
                 addOutputChar('\u2022');
             }
         } else if (equals("endash")) {
             if (!ignored) {
                 // unicode EN DASH
                 addOutputChar('\u2013');
             }
         } else if (equals("emdash")) {
             if (!ignored) {
                 // unicode EM DASH
                 addOutputChar('\u2014');
             }
         } else if (equals("enspace")) {
             if (!ignored) {
                 // unicode EN SPACE
                 addOutputChar('\u2002');
             }
         } else if (equals("qmspace")) {
             if (!ignored) {
                 // quarter em space -> unicode FOUR-PER-EM SPACE
                 addOutputChar('\u2005');
             }
         } else if (equals("emspace")) {
             if (!ignored) {
                 // unicode EM SPACE
                 addOutputChar('\u2003');
             }
         } else if (equals("lquote")) {
             if (!ignored) {
                 // unicode LEFT SINGLE QUOTATION MARK
                 addOutputChar('\u2018');
             }
         } else if (equals("rquote")) {
             if (!ignored) {
                 // unicode RIGHT SINGLE QUOTATION MARK
                 addOutputChar('\u2019');
             }
         } else if (equals("ldblquote")) {
             if (!ignored) {
                 // unicode LEFT DOUBLE QUOTATION MARK
                 addOutputChar('\u201C');
             }
         } else if (equals("rdblquote")) {
             if (!ignored) {
                 // unicode RIGHT DOUBLE QUOTATION MARK
                 addOutputChar('\u201D');
             }
         } else if (equals("fldinst")) {
             // Field instruction: while fieldState is 1 decoded text is
             // collected in pendingBuffer instead of being output
             fieldState = 1;
             groupState.ignore = false;
         } else if (equals("fldrslt") && fieldState == 2) {
             // Field result following a HYPERLINK instruction: open the
             // link with the URL remembered in pendingURL
             assert pendingURL != null;
             lazyStartParagraph();
             out.startElement("a", "href", pendingURL);
             pendingURL = null;
             fieldState = 3;
             groupState.ignore = false;
         }
     }

     /**
      * Handles the start of a group: saves the current group state on the
      * stack, makes a new state derived from it current, and peeks at the
      * next two bytes to detect an ignorable destination (a backslash
      * followed by {@code *}).
      * <p>
      * The peeked bytes are pushed back so the caller sees them again.
      * Only bytes that were actually read are pushed back: at end of stream
      * {@code read()} returns -1, and pushing that value back would make the
      * next read return the byte 0xFF instead of reporting end of stream.
      *
      * @param in the stream being parsed, positioned just after the opening
      *           brace; it must be able to take back two bytes
      * @throws IOException if reading from or pushing back to {@code in}
      *                     fails
      */
     private void processGroupStart(PushbackInputStream in) throws IOException {
         ansiSkip = 0;
         // Push current groupState onto the stack
         groupStates.add(groupState);

         // Make new GroupState
         groupState = new GroupState(groupState);
         assert groupStates.size() == groupState.depth :
                 "size=" + groupStates.size() + " depth=" + groupState.depth;

         // First group after a upr control word: it is ignored
         if (uprState == 0) {
             uprState = 1;
             groupState.ignore = true;
         }

         // Check for ignorable groups. Note that
         // sometimes we un-ignore within this group, eg
         // when handling upr escape.
         final int first = in.read();
         if (first == -1) {
             // End of stream right after the opening brace: there is
             // nothing to inspect and nothing to push back.
             return;
         }
         if (first == '\\') {
             final int second = in.read();
             if (second == '*') {
                 groupState.ignore = true;
             }
             if (second != -1) {
                 in.unread(second);
             }
         }
         in.unread(first);
     }

     /**
      * Handles the end of a group and restores the enclosing group state.
      * <p>
      * In the header, the text collected for a pending metadata property is
      * stored (or, for the creation date, the date built from the collected
      * date fields). Embedded object and picture groups are completed on the
      * embedded object handler. Bold and italic elements are closed or
      * reopened so the output matches the state of the enclosing group.
      * Finally the field state is advanced: a collected HYPERLINK
      * instruction is turned into a pending URL, and an open link is closed.
      *
      * @throws IOException   if completing a picture or writing to the
      *                       output fails
      * @throws SAXException  if writing to the output handler fails
      * @throws TikaException if completing a picture or writing to the
      *                       output fails
      */
     private void processGroupEnd() throws IOException, SAXException, TikaException {
         if (inHeader) {
             if (nextMetaData != null) {
                 if (nextMetaData == TikaCoreProperties.CREATED) {
                     Calendar cal = Calendar.getInstance(TimeZone.getDefault(), Locale.ROOT);
                     cal.set(year, month - 1, day, hour, minute, 0);
                     // getInstance() starts out at the current time and the
                     // six-argument set() keeps all other fields, so the
                     // milliseconds must be cleared explicitly to get a
                     // reproducible value.
                     cal.set(Calendar.MILLISECOND, 0);
                     metadata.set(nextMetaData, cal.getTime());
                 } else if (nextMetaData.isMultiValuePermitted()) {
                     metadata.add(nextMetaData, pendingBuffer.toString());
                 } else {
                     metadata.set(nextMetaData, pendingBuffer.toString());
                 }
                 nextMetaData = null;
             }
             pendingBuffer.setLength(0);
         }

         assert groupState.depth > 0;
         ansiSkip = 0;

         if (groupState.objdata) {
             try {
                 embObjHandler.handleCompletedObject();
             } catch (TikaException | IOException e) {
                 // Record the failure instead of aborting the parse
                 EmbeddedDocumentUtil.recordException(e, metadata);
             }
             groupState.objdata = false;
         } else if (groupState.pictDepth > 0) {
             if (groupState.sn) {
                 embObjHandler.endSN();
             } else if (groupState.sv) {
                 embObjHandler.endSV();
             } else if (groupState.sp) {
                 embObjHandler.endSP();
             } else if (groupState.pictDepth == 1) {
                 embObjHandler.handleCompletedObject();
             }
         }
         if (groupState.annotation) {
             addOutputChar(SPACE);
         }
         if (groupState.object) {
             embObjHandler.setInObject(false);
         }

         // Be robust if RTF doc is corrupt (has too many
         // closing }s):
         // TODO: log a warning?
         if (groupStates.size() > 0) {
             // Restore group state:
             final GroupState outerGroupState = groupStates.removeLast();

             // Close italic, if outer does not have italic or
             // bold changed:
             if (groupState.italic) {
                 if (!outerGroupState.italic || groupState.bold != outerGroupState.bold) {
                     end("i");
                     groupState.italic = false;
                 }
             }

             // Close bold
             if (groupState.bold && !outerGroupState.bold) {
                 end("b");
             }

             // Open bold
             if (!groupState.bold && outerGroupState.bold) {
                 start("b");
             }

             // Open italic
             if (!groupState.italic && outerGroupState.italic) {
                 start("i");
             }
             groupState = outerGroupState;
         }
         assert groupStates.size() == groupState.depth;

         if (fieldState == 1) {
             // End of a field instruction: the instruction text was
             // collected in pendingBuffer
             String s = pendingBuffer.toString().trim();
             pendingBuffer.setLength(0);
             if (s.startsWith("HYPERLINK")) {
                 // Drop the "HYPERLINK" keyword (9 characters)
                 s = s.substring(9).trim();
                 // TODO: what other instructions can be in a
                 // HYPERLINK destination?
                 final boolean isLocalLink = s.contains("\\l ");
                 // Use the first double-quoted string as the target, if
                 // there is one; otherwise the whole remainder
                 int idx = s.indexOf('"');
                 if (idx != -1) {
                     int idx2 = s.indexOf('"', 1 + idx);
                     if (idx2 != -1) {
                         s = s.substring(1 + idx, idx2);
                     }
                 }
                 pendingURL = (isLocalLink ? "#" : "") + s;
                 fieldState = 2;
             } else {
                 fieldState = 0;
             }

             // TODO: we could process the other known field
             // types.  Right now, we will extract their text
             // inlined, but fail to record them in metadata
             // as a field value.
         } else if (fieldState == 3) {
             // End of the field result of a hyperlink: close the link
             end("a");
             fieldState = 0;
         }
     }
 }