tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFWordExtractorDecorator.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.microsoft.ooxml;

 import java.io.IOException;
 import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.Deque;
 import java.util.HashMap;
 import java.util.List;
 import javax.xml.namespace.QName;

 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.xssf.usermodel.XSSFRelation;
 import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
 import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
 import org.apache.poi.xwpf.usermodel.BodyType;
 import org.apache.poi.xwpf.usermodel.IBody;
 import org.apache.poi.xwpf.usermodel.IBodyElement;
 import org.apache.poi.xwpf.usermodel.ICell;
 import org.apache.poi.xwpf.usermodel.IRunElement;
 import org.apache.poi.xwpf.usermodel.ISDTContent;
 import org.apache.poi.xwpf.usermodel.XWPFDocument;
 import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
 import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
 import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
 import org.apache.poi.xwpf.usermodel.XWPFParagraph;
 import org.apache.poi.xwpf.usermodel.XWPFPicture;
 import org.apache.poi.xwpf.usermodel.XWPFPictureData;
 import org.apache.poi.xwpf.usermodel.XWPFRelation;
 import org.apache.poi.xwpf.usermodel.XWPFRun;
 import org.apache.poi.xwpf.usermodel.XWPFSDT;
 import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
 import org.apache.poi.xwpf.usermodel.XWPFStyle;
 import org.apache.poi.xwpf.usermodel.XWPFStyles;
 import org.apache.poi.xwpf.usermodel.XWPFTable;
 import org.apache.poi.xwpf.usermodel.XWPFTableCell;
 import org.apache.poi.xwpf.usermodel.XWPFTableRow;
 import org.apache.xmlbeans.XmlCursor;
 import org.apache.xmlbeans.XmlException;
 import org.apache.xmlbeans.XmlObject;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;

 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.FormattingUtils;
 import org.apache.tika.parser.microsoft.WordExtractor;
 import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
 import org.apache.tika.sax.XHTMLContentHandler;

 public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {

     // could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3,
     // Part 3, Step 3
     private static final String LIST_DELIMITER = " ";

     // Relationship types, resolved against the main document part, of all parts
     // that might have embedded objects
     private static final String[] MAIN_PART_RELATIONS = {
             XWPFRelation.HEADER.getRelation(),
             XWPFRelation.FOOTER.getRelation(),
             XWPFRelation.FOOTNOTE.getRelation(),
             "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
             "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments",
             AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA};

     // All three fields are assigned exactly once, by the constructor, so they are final.
     private final XWPFDocument document;
     // May be null (see the TIKA-2144 check in extractParagraph)
     private final XWPFStyles styles;
     private final Metadata metadata;

     /**
      * Creates a decorator for the document held by the given POI extractor.
      *
      * @param metadata  metadata object handed on when diagram and chart parts are processed
      * @param context   parse context forwarded to the superclass
      * @param extractor POI extractor; its document is cast to {@link XWPFDocument}
      */
     public XWPFWordExtractorDecorator(Metadata metadata, ParseContext context,
                                       XWPFWordExtractor extractor) {
         super(context, extractor);
         this.metadata = metadata;
         this.document = (XWPFDocument) extractor.getDocument();
         this.styles = this.document.getStyles();
     }

     /**
      * Creates a decorator that uses a newly created {@link Metadata} instance.
      *
      * @param context   parse context forwarded to the three-argument constructor
      * @param extractor POI extractor forwarded to the three-argument constructor
      * @deprecated use {@link XWPFWordExtractorDecorator#XWPFWordExtractorDecorator(Metadata,
      * ParseContext, XWPFWordExtractor)}
      */
     @Deprecated
     public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) {
         this(new Metadata(), context, extractor);
     }

     /**
      * Writes the document as XHTML in this order: headers (if a header/footer policy
      * exists and headers/footers are enabled in the config), the body elements in
      * document order, text of the diagram-data and chart parts related to the main
      * document part, and finally footers (under the same condition as headers).
      *
      * @param xhtml handler that receives the generated content
      * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
      */
     @Override
     protected void buildXHTML(XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         XWPFHeaderFooterPolicy policy = document.getHeaderFooterPolicy();
         XWPFListManager numbering = new XWPFListManager(document.getNumbering());

         // document-level headers come first
         if (policy != null && config.isIncludeHeadersAndFooters()) {
             extractHeaders(xhtml, policy, numbering);
         }

         // body text, in the order in which it occurs
         extractIBodyText(document, numbering, xhtml);

         PackagePart mainPart = document.getPackagePart();

         // text held in diagram data parts; the handler gets an empty map
         handleGeneralTextContainingPart(RELATION_DIAGRAM_DATA, "diagram-data", mainPart,
                 metadata,
                 new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
                         new HashMap<>()));

         // text held in chart parts; again with an empty map
         handleGeneralTextContainingPart(XSSFRelation.CHART.getRelation(), "chart", mainPart,
                 metadata,
                 new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
                         new HashMap<>()));

         // document-level footers come last
         if (policy != null && config.isIncludeHeadersAndFooters()) {
             extractFooters(xhtml, policy, numbering);
         }
     }

     /**
      * Walks the body elements of the given container in order and dispatches each one
      * by type: paragraphs, tables and structured document tags (SDT). Elements of any
      * other type are skipped.
      *
      * @param bodyElement container whose body elements are written
      * @param listManager list manager passed on to paragraph and table extraction
      * @param xhtml       handler that receives the generated content
      */
     private void extractIBodyText(IBody bodyElement, XWPFListManager listManager,
                                   XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         for (IBodyElement child : bodyElement.getBodyElements()) {
             if (child instanceof XWPFParagraph) {
                 extractParagraph((XWPFParagraph) child, listManager, xhtml);
             } else if (child instanceof XWPFTable) {
                 extractTable((XWPFTable) child, listManager, xhtml);
             } else if (child instanceof XWPFSDT) {
                 extractSDT((XWPFSDT) child, xhtml);
             }
         }
     }

     /**
      * Writes the text of a structured document tag as a single {@code p} element.
      *
      * @param element the SDT whose content text is written
      * @param xhtml   handler that receives the generated content
      */
     private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         final ISDTContent sdtContent = element.getContent();
         xhtml.startElement("p");
         xhtml.characters(sdtContent.getText());
         xhtml.endElement("p");
     }

     /**
      * Writes one paragraph as a single XHTML element.
      * <p>
      * The steps, in output order, are:
      * <ol>
      * <li>if the paragraph carries section properties and headers/footers are enabled,
      * the headers of that section;</li>
      * <li>the opening tag, {@code p} unless the paragraph style maps to another tag
      * and/or a {@code class} attribute;</li>
      * <li>the formatted list number, if any;</li>
      * <li>placeholder {@code div}s for embedded objects;</li>
      * <li>empty {@code a name="..."} anchors for bookmarks starting in the paragraph;</li>
      * <li>the runs, including hyperlinks and inline SDTs;</li>
      * <li>comment text and footnote text;</li>
      * <li>paragraphs found in text boxes, if shape based content is enabled;</li>
      * <li>the closing tag, followed by the footers of the section from step 1.</li>
      * </ol>
      *
      * @param paragraph   the paragraph to write
      * @param listManager list manager used to format the paragraph number
      * @param xhtml       handler that receives the generated content
      */
     private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager,
                                   XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         // If this paragraph is actually a whole new section, then
         //  it could have its own headers and footers
         // Check and handle if so
         XWPFHeaderFooterPolicy headerFooterPolicy = null;
         if (paragraph.getCTP().getPPr() != null) {
             CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
             if (ctSectPr != null && config.isIncludeHeadersAndFooters()) {
                 headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
                 extractHeaders(xhtml, headerFooterPolicy, listManager);
             }
         }

         // Is this a paragraph, or a heading?
         String tag = "p";
         String styleClass = null;
         //TIKA-2144 check that styles is not null
         if (paragraph.getStyleID() != null && styles != null) {
             XWPFStyle style = styles.getStyle(paragraph.getStyleID());

             if (style != null && style.getName() != null) {
                 TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(style.getName(),
                         paragraph.getPartType() == BodyType.TABLECELL);
                 tag = tas.getTag();
                 styleClass = tas.getStyleClass();
             }
         }

         if (styleClass == null) {
             xhtml.startElement(tag);
         } else {
             xhtml.startElement(tag, "class", styleClass);
         }

         writeParagraphNumber(paragraph, listManager, xhtml);

         // Output placeholder for any embedded docs
         writeEmbeddedObjectPlaceholders(paragraph, xhtml);

         // Attach bookmarks for the paragraph
         // (In future, we might put them in the right place, for now
         //  we just put them in the correct paragraph)
         writeBookmarkAnchors(paragraph, xhtml);

         // Do the iruns
         writeParagraphRuns(paragraph, xhtml);

         // Now do any comments for the paragraph
         XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
         String commentText = comments.getCommentText();
         if (commentText != null && commentText.length() > 0) {
             xhtml.characters(commentText);
         }

         String footnameText = paragraph.getFootnoteText();
         if (footnameText != null && footnameText.length() > 0) {
             xhtml.characters(footnameText + "\n");
         }

         // Also extract any paragraphs embedded in text boxes
         writeTextBoxParagraphs(paragraph, listManager, xhtml);

         // Finish this paragraph
         xhtml.endElement(tag);

         if (headerFooterPolicy != null && config.isIncludeHeadersAndFooters()) {
             extractFooters(xhtml, headerFooterPolicy, listManager);
         }
     }

     /**
      * Scans the direct children of every run of the paragraph for {@link CTObject}
      * elements and writes a placeholder for each embedded object found in them.
      * The cursor is disposed in a {@code finally} block so that it is released even
      * when the handler throws.
      */
     // TODO: replace w/ XPath/XQuery:
     private void writeEmbeddedObjectPlaceholders(XWPFParagraph paragraph,
                                                  XHTMLContentHandler xhtml)
             throws SAXException {
         for (XWPFRun run : paragraph.getRuns()) {
             XmlCursor runCursor = run.getCTR().newCursor();
             try {
                 runCursor.selectPath("./*");
                 while (runCursor.toNextSelection()) {
                     XmlObject child = runCursor.getObject();
                     if (child instanceof CTObject) {
                         writeEmbeddedObjectPlaceholder(child, xhtml);
                     }
                 }
             } finally {
                 runCursor.dispose();
             }
         }
     }

     /**
      * Writes an empty {@code div} with {@code class="embedded"} and {@code id} set to
      * the relationship id for every direct child of the given object element whose
      * {@code Type} attribute equals "Embed" and which has a relationship id attribute.
      */
     private void writeEmbeddedObjectPlaceholder(XmlObject object, XHTMLContentHandler xhtml)
             throws SAXException {
         XmlCursor objectCursor = object.newCursor();
         try {
             objectCursor.selectPath("./*");
             while (objectCursor.toNextSelection()) {
                 XmlObject candidate = objectCursor.getObject();

                 XmlObject embedAtt = candidate.selectAttribute(new QName("Type"));
                 // constant-first comparison: a missing node value simply does not match
                 if (embedAtt == null ||
                         !"Embed".equals(embedAtt.getDomNode().getNodeValue())) {
                     continue;
                 }
                 // Type is "Embed"
                 XmlObject relIDAtt = candidate.selectAttribute(new QName(
                         "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
                         "id"));
                 if (relIDAtt != null) {
                     String relID = relIDAtt.getDomNode().getNodeValue();
                     AttributesImpl attributes = new AttributesImpl();
                     attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                     attributes.addAttribute("", "id", "id", "CDATA", relID);
                     xhtml.startElement("div", attributes);
                     xhtml.endElement("div");
                 }
             }
         } finally {
             objectCursor.dispose();
         }
     }

     /**
      * Writes an empty {@code a} element with a {@code name} attribute for each
      * bookmark start recorded on the paragraph.
      */
     private void writeBookmarkAnchors(XWPFParagraph paragraph, XHTMLContentHandler xhtml)
             throws SAXException {
         for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) {
             CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
             xhtml.startElement("a", "name", bookmark.getName());
             xhtml.endElement("a");
         }
     }

     /**
      * Writes all run elements of the paragraph. Consecutive hyperlink runs sharing the
      * same non-null hyperlink id are wrapped in one {@code a} element; open formatting
      * tags are closed before an {@code a} element is opened or closed and before an
      * inline SDT is written.
      */
     private void writeParagraphRuns(XWPFParagraph paragraph, XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         Deque<FormattingUtils.Tag> formattingState = new ArrayDeque<>();

         //hyperlinks may or may not have hyperlink ids
         String lastHyperlinkId = null;
         boolean inHyperlink = false;
         for (IRunElement run : paragraph.getIRuns()) {
             if (run instanceof XWPFHyperlinkRun) {
                 XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run;
                 if (hyperlinkRun.getHyperlinkId() == null ||
                         !hyperlinkRun.getHyperlinkId().equals(lastHyperlinkId)) {
                     if (inHyperlink) {
                         //close out the old one
                         FormattingUtils.closeStyleTags(xhtml, formattingState);
                         xhtml.endElement("a");
                         inHyperlink = false;
                     }
                     lastHyperlinkId = hyperlinkRun.getHyperlinkId();
                     FormattingUtils.closeStyleTags(xhtml, formattingState);
                     XWPFHyperlink link = hyperlinkRun.getHyperlink(document);
                     if (link != null && link.getURL() != null) {
                         xhtml.startElement("a", "href", link.getURL());
                         inHyperlink = true;
                     } else if (hyperlinkRun.getAnchor() != null &&
                             hyperlinkRun.getAnchor().length() > 0) {
                         xhtml.startElement("a", "href", "#" + hyperlinkRun.getAnchor());
                         inHyperlink = true;
                     }
                 }
             } else if (inHyperlink) {
                 //if this isn't a hyperlink, but the last one was
                 FormattingUtils.closeStyleTags(xhtml, formattingState);
                 xhtml.endElement("a");
                 lastHyperlinkId = null;
                 inHyperlink = false;
             }

             if (run instanceof XWPFSDT) {
                 //for now, we're ignoring formatting in sdt
                 FormattingUtils.closeStyleTags(xhtml, formattingState);
                 processSDTRun((XWPFSDT) run, xhtml);
             } else {
                 processRun((XWPFRun) run, paragraph, xhtml, formattingState);
             }
         }
         FormattingUtils.closeStyleTags(xhtml, formattingState);
         if (inHyperlink) {
             xhtml.endElement("a");
         }
     }

     /**
      * If shape based content is enabled, re-parses every paragraph found below a text
      * box of the given paragraph and writes it via {@link #extractParagraph}.
      */
     private void writeTextBoxParagraphs(XWPFParagraph paragraph, XWPFListManager listManager,
                                         XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         if (!config.isIncludeShapeBasedContent()) {
             return;
         }
         //Note "w:txbxContent//"...must look for all descendant paragraphs
         //not just the immediate children of txbxContent -- TIKA-2807
         for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath(
                 "declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent//w:p")) {
             extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()),
                     paragraph.getBody()), listManager, xhtml);
         }
     }

     /**
      * Writes the formatted list number of the paragraph, if there is one. Nothing is
      * written when the paragraph has no numbering level or when the list manager
      * returns no number.
      *
      * @param paragraph   paragraph whose number is written
      * @param listManager supplies the formatted number
      * @param xhtml       handler that receives the generated content
      */
     private void writeParagraphNumber(XWPFParagraph paragraph, XWPFListManager listManager,
                                       XHTMLContentHandler xhtml) throws SAXException {
         if (paragraph.getNumIlvl() != null) {
             String formatted = listManager.getFormattedNumber(paragraph);
             if (formatted != null) {
                 xhtml.characters(formatted);
             }
         }
     }

     /**
      * Writes a single run: adjusts the open formatting tags to those of the run, writes
      * its text and then an {@code img} element for each embedded picture that has
      * picture data.
      *
      * @param run             the run to write
      * @param paragraph       paragraph containing the run; pictures are only written
      *                        when its document is not null
      * @param xhtml           handler that receives the generated content
      * @param formattingState currently open formatting tags, updated by this call
      */
     private void processRun(XWPFRun run, XWPFParagraph paragraph, XHTMLContentHandler xhtml,
                             Deque<FormattingUtils.Tag> formattingState)
             throws SAXException, XmlException, IOException {
         // open/close required tags if run changes formatting
         FormattingUtils.ensureFormattingState(xhtml, FormattingUtils.toTags(run), formattingState);

         // the config decides which text representation of the run is used
         String runText = config.isConcatenatePhoneticRuns() ? run.toString() : run.text();
         xhtml.characters(runText);

         // If we have any pictures, output them
         for (XWPFPicture embedded : run.getEmbeddedPictures()) {
             if (paragraph.getDocument() == null) {
                 continue;
             }
             XWPFPictureData pictureData = embedded.getPictureData();
             if (pictureData == null) {
                 continue;
             }
             AttributesImpl imgAttributes = new AttributesImpl();
             imgAttributes.addAttribute("", "src", "src", "CDATA",
                     "embedded:" + pictureData.getFileName());
             imgAttributes.addAttribute("", "alt", "alt", "CDATA", embedded.getDescription());

             xhtml.startElement("img", imgAttributes);
             xhtml.endElement("img");
         }
     }

     /**
      * Writes the text of an inline structured document tag as plain characters.
      *
      * @param run   the inline SDT
      * @param xhtml handler that receives the generated content
      */
     private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         String sdtText = run.getContent().getText();
         xhtml.characters(sdtText);
     }

     /**
      * Writes a table as {@code table/tbody/tr/td} elements. Regular cells are written
      * through {@link #extractIBodyText}; SDT cells contribute their content text only.
      * Cells of any other type produce an empty {@code td}.
      *
      * @param table       the table to write
      * @param listManager list manager passed on to the cell content extraction
      * @param xhtml       handler that receives the generated content
      */
     private void extractTable(XWPFTable table, XWPFListManager listManager,
                               XHTMLContentHandler xhtml)
             throws SAXException, XmlException, IOException {
         xhtml.startElement("table");
         xhtml.startElement("tbody");
         for (XWPFTableRow tableRow : table.getRows()) {
             xhtml.startElement("tr");
             for (ICell tableCell : tableRow.getTableICells()) {
                 xhtml.startElement("td");
                 if (tableCell instanceof XWPFSDTCell) {
                     String sdtText = ((XWPFSDTCell) tableCell).getContent().getText();
                     xhtml.characters(sdtText);
                 } else if (tableCell instanceof XWPFTableCell) {
                     extractIBodyText((XWPFTableCell) tableCell, listManager, xhtml);
                 }
                 xhtml.endElement("td");
             }
             xhtml.endElement("tr");
         }
         xhtml.endElement("tbody");
         xhtml.endElement("table");
     }

     /**
      * Writes the first-page, even-page and default footers of the given policy, in
      * that order, skipping those that are absent. Like {@link #extractHeaders}, this
      * method does nothing when the policy itself is {@code null}.
      *
      * @param xhtml       handler that receives the generated content
      * @param hfPolicy    header/footer policy to read the footers from; may be null
      * @param listManager list manager passed on to the footer content extraction
      */
     private void extractFooters(XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy,
                                 XWPFListManager listManager)
             throws SAXException, XmlException, IOException {
         // same guard as extractHeaders, so both methods tolerate a missing policy
         if (hfPolicy == null) {
             return;
         }

         // footers
         if (hfPolicy.getFirstPageFooter() != null) {
             extractHeaderText(xhtml, hfPolicy.getFirstPageFooter(), listManager);
         }
         if (hfPolicy.getEvenPageFooter() != null) {
             extractHeaderText(xhtml, hfPolicy.getEvenPageFooter(), listManager);
         }
         if (hfPolicy.getDefaultFooter() != null) {
             extractHeaderText(xhtml, hfPolicy.getDefaultFooter(), listManager);
         }
     }

     /**
      * Writes the first-page, even-page and default headers of the given policy, in
      * that order, skipping those that are absent. Does nothing when the policy is
      * {@code null}.
      *
      * @param xhtml       handler that receives the generated content
      * @param hfPolicy    header/footer policy to read the headers from; may be null
      * @param listManager list manager passed on to the header content extraction
      */
     private void extractHeaders(XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy,
                                 XWPFListManager listManager)
             throws SAXException, XmlException, IOException {
         if (hfPolicy == null) {
             return;
         }

         XWPFHeaderFooter[] candidates = {
                 hfPolicy.getFirstPageHeader(),
                 hfPolicy.getEvenPageHeader(),
                 hfPolicy.getDefaultHeader()};
         for (XWPFHeaderFooter candidate : candidates) {
             if (candidate != null) {
                 extractHeaderText(xhtml, candidate, listManager);
             }
         }
     }

     /**
      * Writes the body elements of a header or footer in order: paragraphs, tables and
      * structured document tags. Elements of any other type are skipped.
      *
      * @param xhtml       handler that receives the generated content
      * @param header      the header or footer to write
      * @param listManager list manager passed on to paragraph and table extraction
      */
     private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header,
                                    XWPFListManager listManager)
             throws SAXException, XmlException, IOException {
         for (IBodyElement part : header.getBodyElements()) {
             if (part instanceof XWPFParagraph) {
                 extractParagraph((XWPFParagraph) part, listManager, xhtml);
                 continue;
             }
             if (part instanceof XWPFTable) {
                 extractTable((XWPFTable) part, listManager, xhtml);
                 continue;
             }
             if (part instanceof XWPFSDT) {
                 extractSDT((XWPFSDT) part, xhtml);
             }
         }
     }

     /**
      * Returns the main document part followed by the parts related to it through one
      * of the {@code MAIN_PART_RELATIONS} relationship types, i.e. everything that can
      * have an attachment/embedded object.
      *
      * @return a new list starting with the main document part
      */
     @Override
     protected List<PackagePart> getMainDocumentParts() {
         PackagePart mainPart = document.getPackagePart();
         List<PackagePart> collected = new ArrayList<>();
         collected.add(mainPart);
         addRelatedParts(mainPart, collected);
         return collected;
     }

     /**
      * Appends to {@code relatedParts} every part that {@code documentPart} references
      * through one of the {@code MAIN_PART_RELATIONS} relationship types. An
      * {@link InvalidFormatException} ends the processing of the current relationship
      * type only; the remaining types are still processed.
      *
      * @param documentPart part whose relationships are resolved
      * @param relatedParts list the resolved parts are appended to
      */
     private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
         for (String relationType : MAIN_PART_RELATIONS) {
             try {
                 PackageRelationshipCollection relationships =
                         documentPart.getRelationshipsByType(relationType);
                 if (relationships == null) {
                     continue;
                 }
                 for (int idx = 0; idx < relationships.size(); idx++) {
                     relatedParts.add(
                             documentPart.getRelatedPart(relationships.getRelationship(idx)));
                 }
             } catch (InvalidFormatException e) {
                 //swallow
             }
         }
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.microsoft.ooxml;

	import java.io.IOException;
	import java.util.ArrayDeque;
	import java.util.ArrayList;
	import java.util.Deque;
	import java.util.HashMap;
	import java.util.List;
	import javax.xml.namespace.QName;

	import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
	import org.apache.poi.openxml4j.opc.PackagePart;
	import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
	import org.apache.poi.xssf.usermodel.XSSFRelation;
	import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
	import org.apache.poi.xwpf.model.XWPFCommentsDecorator;
	import org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy;
	import org.apache.poi.xwpf.usermodel.BodyType;
	import org.apache.poi.xwpf.usermodel.IBody;
	import org.apache.poi.xwpf.usermodel.IBodyElement;
	import org.apache.poi.xwpf.usermodel.ICell;
	import org.apache.poi.xwpf.usermodel.IRunElement;
	import org.apache.poi.xwpf.usermodel.ISDTContent;
	import org.apache.poi.xwpf.usermodel.XWPFDocument;
	import org.apache.poi.xwpf.usermodel.XWPFHeaderFooter;
	import org.apache.poi.xwpf.usermodel.XWPFHyperlink;
	import org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun;
	import org.apache.poi.xwpf.usermodel.XWPFParagraph;
	import org.apache.poi.xwpf.usermodel.XWPFPicture;
	import org.apache.poi.xwpf.usermodel.XWPFPictureData;
	import org.apache.poi.xwpf.usermodel.XWPFRelation;
	import org.apache.poi.xwpf.usermodel.XWPFRun;
	import org.apache.poi.xwpf.usermodel.XWPFSDT;
	import org.apache.poi.xwpf.usermodel.XWPFSDTCell;
	import org.apache.poi.xwpf.usermodel.XWPFStyle;
	import org.apache.poi.xwpf.usermodel.XWPFStyles;
	import org.apache.poi.xwpf.usermodel.XWPFTable;
	import org.apache.poi.xwpf.usermodel.XWPFTableCell;
	import org.apache.poi.xwpf.usermodel.XWPFTableRow;
	import org.apache.xmlbeans.XmlCursor;
	import org.apache.xmlbeans.XmlException;
	import org.apache.xmlbeans.XmlObject;
	import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark;
	import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject;
	import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
	import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr;
	import org.xml.sax.SAXException;
	import org.xml.sax.helpers.AttributesImpl;

	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.microsoft.FormattingUtils;
	import org.apache.tika.parser.microsoft.WordExtractor;
	import org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle;
	import org.apache.tika.sax.XHTMLContentHandler;

	public class XWPFWordExtractorDecorator extends AbstractOOXMLExtractor {

	// could be improved by using the real delimiter in xchFollow [MS-DOC], v20140721, 2.4.6.3,
	// Part 3, Step 3
	private static final String LIST_DELIMITER = " ";

	// Relationship types, resolved against the main document part, of all parts
	// that might have embedded objects
	private static final String[] MAIN_PART_RELATIONS = {
	        XWPFRelation.HEADER.getRelation(),
	        XWPFRelation.FOOTER.getRelation(),
	        XWPFRelation.FOOTNOTE.getRelation(),
	        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
	        "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments",
	        AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA};

	// All three fields are assigned exactly once, by the constructor, so they are final.
	private final XWPFDocument document;
	// May be null (see the TIKA-2144 check in extractParagraph)
	private final XWPFStyles styles;
	private final Metadata metadata;

	/**
	 * Creates a decorator for the document held by the given POI extractor.
	 *
	 * @param metadata  metadata object handed on when diagram and chart parts are processed
	 * @param context   parse context forwarded to the superclass
	 * @param extractor POI extractor; its document is cast to {@link XWPFDocument}
	 */
	public XWPFWordExtractorDecorator(Metadata metadata, ParseContext context,
	                                  XWPFWordExtractor extractor) {
	    super(context, extractor);
	    this.metadata = metadata;
	    this.document = (XWPFDocument) extractor.getDocument();
	    this.styles = this.document.getStyles();
	}

	/**
	* Creates a decorator that uses a newly created {@link Metadata} instance.
	*
	* @param context parse context forwarded to the three-argument constructor
	* @param extractor POI extractor forwarded to the three-argument constructor
	* @deprecated use {@link XWPFWordExtractorDecorator#XWPFWordExtractorDecorator(Metadata,
	* ParseContext, XWPFWordExtractor)}
	*/
	@Deprecated
	public XWPFWordExtractorDecorator(ParseContext context, XWPFWordExtractor extractor) {
	this(new Metadata(), context, extractor);
	}

	/**
	 * Writes the document as XHTML in this order: headers (if a header/footer policy
	 * exists and headers/footers are enabled in the config), the body elements in
	 * document order, text of the diagram-data and chart parts related to the main
	 * document part, and finally footers (under the same condition as headers).
	 *
	 * @param xhtml handler that receives the generated content
	 * @see org.apache.poi.xwpf.extractor.XWPFWordExtractor#getText()
	 */
	@Override
	protected void buildXHTML(XHTMLContentHandler xhtml)
	        throws SAXException, XmlException, IOException {
	    XWPFHeaderFooterPolicy policy = document.getHeaderFooterPolicy();
	    XWPFListManager numbering = new XWPFListManager(document.getNumbering());

	    // document-level headers come first
	    if (policy != null && config.isIncludeHeadersAndFooters()) {
	        extractHeaders(xhtml, policy, numbering);
	    }

	    // body text, in the order in which it occurs
	    extractIBodyText(document, numbering, xhtml);

	    PackagePart mainPart = document.getPackagePart();

	    // text held in diagram data parts; the handler gets an empty map
	    handleGeneralTextContainingPart(RELATION_DIAGRAM_DATA, "diagram-data", mainPart,
	            metadata,
	            new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
	                    new HashMap<>()));

	    // text held in chart parts; again with an empty map
	    handleGeneralTextContainingPart(XSSFRelation.CHART.getRelation(), "chart", mainPart,
	            metadata,
	            new OOXMLWordAndPowerPointTextHandler(new OOXMLTikaBodyPartHandler(xhtml),
	                    new HashMap<>()));

	    // document-level footers come last
	    if (policy != null && config.isIncludeHeadersAndFooters()) {
	        extractFooters(xhtml, policy, numbering);
	    }
	}

	/**
	 * Walks the body elements of the given container in order and dispatches each one
	 * by type: paragraphs, tables and structured document tags (SDT). Elements of any
	 * other type are skipped.
	 *
	 * @param bodyElement container whose body elements are written
	 * @param listManager list manager passed on to paragraph and table extraction
	 * @param xhtml       handler that receives the generated content
	 */
	private void extractIBodyText(IBody bodyElement, XWPFListManager listManager,
	                              XHTMLContentHandler xhtml)
	        throws SAXException, XmlException, IOException {
	    for (IBodyElement child : bodyElement.getBodyElements()) {
	        if (child instanceof XWPFParagraph) {
	            extractParagraph((XWPFParagraph) child, listManager, xhtml);
	        } else if (child instanceof XWPFTable) {
	            extractTable((XWPFTable) child, listManager, xhtml);
	        } else if (child instanceof XWPFSDT) {
	            extractSDT((XWPFSDT) child, xhtml);
	        }
	    }
	}

	/**
	 * Writes the text of a structured document tag as a single {@code p} element.
	 *
	 * @param element the SDT whose content text is written
	 * @param xhtml   handler that receives the generated content
	 */
	private void extractSDT(XWPFSDT element, XHTMLContentHandler xhtml)
	        throws SAXException, XmlException, IOException {
	    final ISDTContent sdtContent = element.getContent();
	    xhtml.startElement("p");
	    xhtml.characters(sdtContent.getText());
	    xhtml.endElement("p");
	}

	private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager,
	XHTMLContentHandler xhtml)
	throws SAXException, XmlException, IOException {
	// If this paragraph is actually a whole new section, then
	// it could have its own headers and footers
	// Check and handle if so
	XWPFHeaderFooterPolicy headerFooterPolicy = null;
	if (paragraph.getCTP().getPPr() != null) {
	CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
	if (ctSectPr != null && config.isIncludeHeadersAndFooters()) {
	headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
	extractHeaders(xhtml, headerFooterPolicy, listManager);
	}
	}

	// Is this a paragraph, or a heading?
	String tag = "p";
	String styleClass = null;
	//TIKA-2144 check that styles is not null
	if (paragraph.getStyleID() != null && styles != null) {
	XWPFStyle style = styles.getStyle(paragraph.getStyleID());

	if (style != null && style.getName() != null) {
	TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(style.getName(),
	paragraph.getPartType() == BodyType.TABLECELL);
	tag = tas.getTag();
	styleClass = tas.getStyleClass();
	}
	}

	if (styleClass == null) {
	xhtml.startElement(tag);
	} else {
	xhtml.startElement(tag, "class", styleClass);
	}

	writeParagraphNumber(paragraph, listManager, xhtml);
	// Output placeholder for any embedded docs:

	// TODO: replace w/ XPath/XQuery:
	for (XWPFRun run : paragraph.getRuns()) {
	XmlCursor c = run.getCTR().newCursor();
	c.selectPath("./*");
	while (c.toNextSelection()) {
	XmlObject o = c.getObject();
	if (o instanceof CTObject) {
	XmlCursor c2 = o.newCursor();
	c2.selectPath("./*");
	while (c2.toNextSelection()) {
	XmlObject o2 = c2.getObject();

	XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
	if (embedAtt != null &&
	embedAtt.getDomNode().getNodeValue().equals("Embed")) {
	// Type is "Embed"
	XmlObject relIDAtt = o2.selectAttribute(new QName(
	"http://schemas.openxmlformats.org/officeDocument/2006/relationships",
	"id"));
	if (relIDAtt != null) {
	String relID = relIDAtt.getDomNode().getNodeValue();
	AttributesImpl attributes = new AttributesImpl();
	attributes.addAttribute("", "class", "class", "CDATA", "embedded");
	attributes.addAttribute("", "id", "id", "CDATA", relID);
	xhtml.startElement("div", attributes);
	xhtml.endElement("div");
	}
	}
	}
	c2.dispose();
	}
	}

	c.dispose();
	}

	// Attach bookmarks for the paragraph
	// (In future, we might put them in the right place, for now
	// we just put them in the correct paragraph)
	for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) {
	CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
	xhtml.startElement("a", "name", bookmark.getName());
	xhtml.endElement("a");
	}

	Deque<FormattingUtils.Tag> formattingState = new ArrayDeque<>();

	//hyperlinks may or may not have hyperlink ids
	String lastHyperlinkId = null;
	boolean inHyperlink = false;
	// Do the iruns
	for (IRunElement run : paragraph.getIRuns()) {
	if (run instanceof XWPFHyperlinkRun) {
	XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run;
	if (hyperlinkRun.getHyperlinkId() == null \|\|
	!hyperlinkRun.getHyperlinkId().equals(lastHyperlinkId)) {
	if (inHyperlink) {
	//close out the old one
	FormattingUtils.closeStyleTags(xhtml, formattingState);
	xhtml.endElement("a");
	inHyperlink = false;
	}
	lastHyperlinkId = hyperlinkRun.getHyperlinkId();
	FormattingUtils.closeStyleTags(xhtml, formattingState);
	XWPFHyperlink link = hyperlinkRun.getHyperlink(document);
	if (link != null && link.getURL() != null) {
	xhtml.startElement("a", "href", link.getURL());
	inHyperlink = true;
	} else if (hyperlinkRun.getAnchor() != null &&
	hyperlinkRun.getAnchor().length() > 0) {
	xhtml.startElement("a", "href", "#" + hyperlinkRun.getAnchor());
	inHyperlink = true;
	}
	}
	} else if (inHyperlink) {
	//if this isn't a hyperlink, but the last one was
	FormattingUtils.closeStyleTags(xhtml, formattingState);
	xhtml.endElement("a");
	lastHyperlinkId = null;
	inHyperlink = false;
	}

	if (run instanceof XWPFSDT) {
	FormattingUtils.closeStyleTags(xhtml, formattingState);
	processSDTRun((XWPFSDT) run, xhtml);
	//for now, we're ignoring formatting in sdt
	//if you hit an sdt reset to false
	} else {
	processRun((XWPFRun) run, paragraph, xhtml, formattingState);
	}
	}
	FormattingUtils.closeStyleTags(xhtml, formattingState);
	if (inHyperlink) {
	xhtml.endElement("a");
	}


	// Now do any comments for the paragraph
	XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
	String commentText = comments.getCommentText();
	if (commentText != null && commentText.length() > 0) {
	xhtml.characters(commentText);
	}

	String footnameText = paragraph.getFootnoteText();
	if (footnameText != null && footnameText.length() > 0) {
	xhtml.characters(footnameText + "\n");
	}

	// Also extract any paragraphs embedded in text boxes
	//Note "w:txbxContent//"...must look for all descendant paragraphs
	//not just the immediate children of txbxContent -- TIKA-2807
	if (config.isIncludeShapeBasedContent()) {
	for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath(
	"declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent//w:p")) {
	extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()),
	paragraph.getBody()), listManager, xhtml);
	}
	}

	// Finish this paragraph
	xhtml.endElement(tag);

	if (headerFooterPolicy != null && config.isIncludeHeadersAndFooters()) {
	extractFooters(xhtml, headerFooterPolicy, listManager);
	}
	}

	/**
	 * Writes the formatted list number of a paragraph as character data.
	 * <p>
	 * Nothing is emitted when the paragraph reports no numbering level, or when
	 * the list manager returns {@code null} for it.
	 *
	 * @param paragraph   paragraph whose numbering level is inspected
	 * @param listManager source of the formatted number for the paragraph
	 * @param xhtml       handler that receives the number text
	 * @throws SAXException if the handler fails while writing the characters
	 */
	private void writeParagraphNumber(XWPFParagraph paragraph, XWPFListManager listManager,
	                                  XHTMLContentHandler xhtml) throws SAXException {
	    if (paragraph.getNumIlvl() != null) {
	        // Only numbered paragraphs are handed to the list manager.
	        String formatted = listManager.getFormattedNumber(paragraph);
	        if (formatted != null) {
	            xhtml.characters(formatted);
	        }
	    }
	}

	/**
	 * Emits a single run: synchronizes the open formatting tags with the run's
	 * formatting, writes the run's text, and outputs an {@code img} element for
	 * each embedded picture that has picture data.
	 *
	 * @param run             the run to output
	 * @param paragraph       paragraph owning the run; pictures are only written
	 *                        while its document is non-null
	 * @param xhtml           handler receiving the generated events
	 * @param formattingState stack of formatting tags currently open
	 * @throws SAXException if the handler fails
	 * @throws XmlException declared for callers; not raised directly here
	 * @throws IOException  declared for callers; not raised directly here
	 */
	private void processRun(XWPFRun run, XWPFParagraph paragraph, XHTMLContentHandler xhtml,
	                        Deque<FormattingUtils.Tag> formattingState)
	        throws SAXException, XmlException, IOException {
	    // Open or close style tags so the state matches what this run needs.
	    FormattingUtils.ensureFormattingState(xhtml, FormattingUtils.toTags(run), formattingState);

	    // The config flag selects between run.toString() and run.text().
	    String runText = config.isConcatenatePhoneticRuns() ? run.toString() : run.text();
	    xhtml.characters(runText);

	    // Reference every embedded picture through an empty <img> element.
	    for (XWPFPicture picture : run.getEmbeddedPictures()) {
	        if (paragraph.getDocument() == null) {
	            continue;
	        }
	        XWPFPictureData pictureData = picture.getPictureData();
	        if (pictureData == null) {
	            continue;
	        }

	        AttributesImpl imgAttributes = new AttributesImpl();
	        String source = "embedded:" + pictureData.getFileName();
	        imgAttributes.addAttribute("", "src", "src", "CDATA", source);
	        imgAttributes.addAttribute("", "alt", "alt", "CDATA", picture.getDescription());

	        xhtml.startElement("img", imgAttributes);
	        xhtml.endElement("img");
	    }
	}

	/**
	 * Writes the text of a structured document tag (SDT) run as plain
	 * character data.
	 *
	 * @param run   the SDT whose content text is output
	 * @param xhtml handler receiving the characters
	 * @throws SAXException if the handler fails
	 * @throws XmlException declared for callers; not raised directly here
	 * @throws IOException  declared for callers; not raised directly here
	 */
	private void processSDTRun(XWPFSDT run, XHTMLContentHandler xhtml)
	        throws SAXException, XmlException, IOException {
	    String sdtText = run.getContent().getText();
	    xhtml.characters(sdtText);
	}

	/**
	 * Renders a table as {@code table/tbody/tr/td} elements.
	 * <p>
	 * A regular cell has its body extracted via {@code extractIBodyText}; an SDT
	 * cell contributes only its content text. Any other cell type yields an
	 * empty {@code td}.
	 *
	 * @param table       the table to output
	 * @param listManager list state passed through to body extraction
	 * @param xhtml       handler receiving the generated events
	 * @throws SAXException if the handler fails
	 * @throws XmlException propagated from body extraction
	 * @throws IOException  propagated from body extraction
	 */
	private void extractTable(XWPFTable table, XWPFListManager listManager,
	                          XHTMLContentHandler xhtml)
	        throws SAXException, XmlException, IOException {
	    xhtml.startElement("table");
	    xhtml.startElement("tbody");

	    for (XWPFTableRow tableRow : table.getRows()) {
	        xhtml.startElement("tr");

	        for (ICell tableCell : tableRow.getTableICells()) {
	            xhtml.startElement("td");
	            if (tableCell instanceof XWPFTableCell) {
	                XWPFTableCell bodyCell = (XWPFTableCell) tableCell;
	                extractIBodyText(bodyCell, listManager, xhtml);
	            } else if (tableCell instanceof XWPFSDTCell) {
	                XWPFSDTCell sdtCell = (XWPFSDTCell) tableCell;
	                xhtml.characters(sdtCell.getContent().getText());
	            }
	            xhtml.endElement("td");
	        }

	        xhtml.endElement("tr");
	    }

	    xhtml.endElement("tbody");
	    xhtml.endElement("table");
	}

	/**
	 * Outputs the first-page, even-page and default footers, in that order,
	 * skipping any that the policy reports as {@code null}.
	 *
	 * @param xhtml       handler receiving the generated events
	 * @param hfPolicy    header/footer policy to read the footers from
	 * @param listManager list state passed through to paragraph extraction
	 * @throws SAXException if the handler fails
	 * @throws XmlException propagated from footer extraction
	 * @throws IOException  propagated from footer extraction
	 */
	private void extractFooters(XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy,
	                            XWPFListManager listManager)
	        throws SAXException, XmlException, IOException {
	    XWPFHeaderFooter firstPage = hfPolicy.getFirstPageFooter();
	    if (firstPage != null) {
	        extractHeaderText(xhtml, firstPage, listManager);
	    }

	    XWPFHeaderFooter evenPage = hfPolicy.getEvenPageFooter();
	    if (evenPage != null) {
	        extractHeaderText(xhtml, evenPage, listManager);
	    }

	    XWPFHeaderFooter fallback = hfPolicy.getDefaultFooter();
	    if (fallback != null) {
	        extractHeaderText(xhtml, fallback, listManager);
	    }
	}

	/**
	 * Outputs the first-page, even-page and default headers, in that order,
	 * skipping any that the policy reports as {@code null}. Does nothing when
	 * the policy itself is {@code null}.
	 *
	 * @param xhtml       handler receiving the generated events
	 * @param hfPolicy    header/footer policy to read the headers from; may be null
	 * @param listManager list state passed through to paragraph extraction
	 * @throws SAXException if the handler fails
	 * @throws XmlException propagated from header extraction
	 * @throws IOException  propagated from header extraction
	 */
	private void extractHeaders(XHTMLContentHandler xhtml, XWPFHeaderFooterPolicy hfPolicy,
	                            XWPFListManager listManager)
	        throws SAXException, XmlException, IOException {
	    if (hfPolicy == null) {
	        return;
	    }

	    XWPFHeaderFooter firstPage = hfPolicy.getFirstPageHeader();
	    if (firstPage != null) {
	        extractHeaderText(xhtml, firstPage, listManager);
	    }

	    XWPFHeaderFooter evenPage = hfPolicy.getEvenPageHeader();
	    if (evenPage != null) {
	        extractHeaderText(xhtml, evenPage, listManager);
	    }

	    XWPFHeaderFooter fallback = hfPolicy.getDefaultHeader();
	    if (fallback != null) {
	        extractHeaderText(xhtml, fallback, listManager);
	    }
	}

	/**
	 * Walks the body elements of a header or footer and dispatches each one to
	 * the matching extractor: paragraphs, tables and SDTs are handled; any other
	 * element type is ignored.
	 *
	 * @param xhtml       handler receiving the generated events
	 * @param header      header or footer whose body elements are output
	 * @param listManager list state passed through to paragraph/table extraction
	 * @throws SAXException if the handler fails
	 * @throws XmlException propagated from the element extractors
	 * @throws IOException  propagated from the element extractors
	 */
	private void extractHeaderText(XHTMLContentHandler xhtml, XWPFHeaderFooter header,
	                               XWPFListManager listManager)
	        throws SAXException, XmlException, IOException {
	    for (IBodyElement element : header.getBodyElements()) {
	        if (element instanceof XWPFParagraph) {
	            extractParagraph((XWPFParagraph) element, listManager, xhtml);
	            continue;
	        }
	        if (element instanceof XWPFTable) {
	            extractTable((XWPFTable) element, listManager, xhtml);
	            continue;
	        }
	        if (element instanceof XWPFSDT) {
	            extractSDT((XWPFSDT) element, xhtml);
	        }
	    }
	}

	/**
	 * Returns the document's own package part followed by the parts related to
	 * it through the relation types in {@code MAIN_PART_RELATIONS}, i.e. the
	 * main body plus anything else that can carry an attachment/embedded object.
	 *
	 * @return a new list starting with the main document part
	 */
	@Override
	protected List<PackagePart> getMainDocumentParts() {
	    PackagePart mainPart = document.getPackagePart();

	    List<PackagePart> mainParts = new ArrayList<>();
	    mainParts.add(mainPart);
	    addRelatedParts(mainPart, mainParts);
	    return mainParts;
	}

	/**
	 * Appends to {@code relatedParts} every part that {@code documentPart}
	 * references through one of the relation types in
	 * {@code MAIN_PART_RELATIONS}.
	 * <p>
	 * An {@link InvalidFormatException} raised while handling one relation type
	 * is deliberately ignored: parts already added are kept and processing
	 * moves on to the next relation type.
	 *
	 * @param documentPart part whose relationships are followed
	 * @param relatedParts list that receives the related parts
	 */
	private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
	    for (String relationType : MAIN_PART_RELATIONS) {
	        try {
	            PackageRelationshipCollection relationships =
	                    documentPart.getRelationshipsByType(relationType);
	            if (relationships == null) {
	                continue;
	            }
	            for (int idx = 0; idx < relationships.size(); idx++) {
	                PackagePart related =
	                        documentPart.getRelatedPart(relationships.getRelationship(idx));
	                relatedParts.add(related);
	            }
	        } catch (InvalidFormatException ignored) {
	            // best effort: skip relation types that cannot be resolved
	        }
	    }
	}

	}