blob: bd03dedb567edc7b5cd016b769912b1b23b41ce2 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipException;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xwpf.usermodel.XWPFNumbering;
import org.apache.poi.xwpf.usermodel.XWPFRelation;
import org.apache.xmlbeans.XmlException;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFStylesShim;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.OfflineContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.XMLReaderUtils;
/**
* This is an experimental, alternative extractor for docx files.
* This streams the main document content rather than loading the
* full document into memory.
* <p>
* This will be better for some use cases than the classic docx extractor,
* and it will be worse for others.
* </p>
*
* @since 1.15
*/
public class SXWPFWordExtractorDecorator extends AbstractOOXMLExtractor {

    /**
     * Relationship types, relative to a document part, of all the parts that
     * might have embedded objects.
     */
    private static final String[] MAIN_PART_RELATIONS =
            new String[]{XWPFRelation.HEADER.getRelation(), XWPFRelation.FOOTER.getRelation(),
                    XWPFRelation.FOOTNOTE.getRelation(),
                    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes",
                    "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"};

    /**
     * Content types of the "main story" parts; a docx file should have one of these.
     * They are tried in this order.
     */
    private static final String[] MAIN_STORY_CONTENT_TYPES =
            new String[]{XWPFRelation.DOCUMENT.getContentType(),
                    XWPFRelation.MACRO_DOCUMENT.getContentType(),
                    XWPFRelation.TEMPLATE.getContentType(),
                    XWPFRelation.MACRO_TEMPLATE_DOCUMENT.getContentType()};

    /**
     * Relationship types whose parts are dumped, in this order, after the main
     * story of a document part.
     */
    private static final String[] TRAILING_PART_RELATIONS =
            new String[]{AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
                    XSSFRelation.CHART.getRelation(), XWPFRelation.FOOTNOTE.getRelation(),
                    XWPFRelation.COMMENT.getRelation(), XWPFRelation.FOOTER.getRelation(),
                    XWPFRelation.ENDNOTE.getRelation()};

    private final OPCPackage opcPackage;
    private final ParseContext context;
    private final Metadata metadata;

    /**
     * @param metadata  metadata object that receives exception warnings raised while parsing
     * @param context   parse context handed to the SAX parsing utilities and the styles loader
     * @param extractor extractor whose package is the docx file to process
     */
    public SXWPFWordExtractorDecorator(Metadata metadata, ParseContext context,
                                       XWPFEventBasedWordExtractor extractor) {
        super(context, extractor);
        this.metadata = metadata;
        this.context = context;
        this.opcPackage = extractor.getPackage();
    }

    /**
     * Writes the main story part(s) and then, wrapped in a
     * <code>&lt;div class="glossary"&gt;</code>, any glossary document part(s).
     *
     * @param xhtml handler that receives the extracted content
     */
    @Override
    protected void buildXHTML(XHTMLContentHandler xhtml)
            throws SAXException, XmlException, IOException {
        //handle main document
        for (PackagePart storyPart : getStoryDocumentParts()) {
            //likely only one, but why not...
            handleDocumentPart(storyPart, xhtml);
        }

        //handle glossary document
        List<PackagePart> glossaryParts =
                copyPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
        if (!glossaryParts.isEmpty()) {
            xhtml.startElement("div", "class", "glossary");
            for (PackagePart glossaryPart : glossaryParts) {
                //likely only one, but why not...
                handleDocumentPart(glossaryPart, xhtml);
            }
            xhtml.endElement("div");
        }
    }

    /**
     * Extracts one document part: headers first (if configured), then the part
     * itself, then the parts listed in {@link #TRAILING_PART_RELATIONS}.
     * Failures to load styles or to resolve/read related parts are recorded as
     * exception warnings in the metadata rather than aborting the parse.
     */
    private void handleDocumentPart(PackagePart documentPart, XHTMLContentHandler xhtml)
            throws IOException, SAXException {
        //load the numbering/list manager and styles from the main document part
        XWPFNumbering numbering = loadNumbering(documentPart);
        XWPFListManager listManager = new XWPFListManager(numbering);
        XWPFStylesShim styles = null;
        try {
            styles = loadStyles(documentPart);
        } catch (SecurityException e) {
            throw e;
        } catch (Exception e) {
            //carry on without styles
            recordExceptionWarning(e);
        }

        boolean includeHeadersAndFooters = config.isIncludeHeadersAndFooters();

        //headers
        if (includeHeadersAndFooters) {
            processPartsByRelation(documentPart, XWPFRelation.HEADER.getRelation(), styles,
                    listManager, xhtml);
        }

        //main document
        try {
            handlePart(documentPart, styles, listManager, xhtml);
        } catch (ZipException e) {
            recordExceptionWarning(e);
        }

        //for now, just dump other components at end
        String footerRelation = XWPFRelation.FOOTER.getRelation();
        for (String relation : TRAILING_PART_RELATIONS) {
            //skip footers if we shouldn't extract them
            if (!includeHeadersAndFooters && relation.equals(footerRelation)) {
                continue;
            }
            processPartsByRelation(documentPart, relation, styles, listManager, xhtml);
        }
    }

    /**
     * Runs {@link #handlePart} on every part that <code>documentPart</code> points to
     * through a relationship of the given type. An {@link InvalidFormatException} or
     * {@link ZipException} stops processing of this relation and is recorded as an
     * exception warning; other exceptions propagate to the caller.
     */
    private void processPartsByRelation(PackagePart documentPart, String relation,
                                        XWPFStylesShim styles, XWPFListManager listManager,
                                        XHTMLContentHandler xhtml)
            throws IOException, SAXException {
        try {
            PackageRelationshipCollection relationships =
                    documentPart.getRelationshipsByType(relation);
            if (relationships == null) {
                return;
            }
            for (int i = 0; i < relationships.size(); i++) {
                PackagePart relatedPart =
                        documentPart.getRelatedPart(relationships.getRelationship(i));
                handlePart(relatedPart, styles, listManager, xhtml);
            }
        } catch (InvalidFormatException | ZipException e) {
            recordExceptionWarning(e);
        }
    }

    /**
     * SAX-parses a single part and forwards its text to <code>xhtml</code>.
     * A {@link TikaException} or {@link IOException} raised while reading or parsing
     * the part's stream is recorded as an exception warning instead of being rethrown.
     */
    private void handlePart(PackagePart packagePart, XWPFStylesShim styles,
                            XWPFListManager listManager, XHTMLContentHandler xhtml)
            throws IOException, SAXException {
        Map<String, String> linkedRelationships =
                loadLinkedRelationships(packagePart, true, metadata);
        try (InputStream stream = packagePart.getInputStream()) {
            XMLReaderUtils.parseSAX(new CloseShieldInputStream(stream), new OfflineContentHandler(
                    new EmbeddedContentHandler(new OOXMLWordAndPowerPointTextHandler(
                            new OOXMLTikaBodyPartHandler(xhtml, styles, listManager, config),
                            linkedRelationships, config.isIncludeShapeBasedContent(),
                            config.isConcatenatePhoneticRuns()))), context);
        } catch (TikaException | IOException e) {
            recordExceptionWarning(e);
        }
    }

    /**
     * @return the styles referenced by the first styles relationship of
     * <code>packagePart</code>, or <code>null</code> if there is none
     */
    private XWPFStylesShim loadStyles(PackagePart packagePart)
            throws InvalidFormatException, TikaException, IOException, SAXException {
        PackageRelationshipCollection stylesParts =
                packagePart.getRelationshipsByType(XWPFRelation.STYLES.getRelation());
        if (stylesParts == null || stylesParts.size() == 0) {
            return null;
        }
        PackageRelationship stylesRelationship = stylesParts.getRelationship(0);
        if (stylesRelationship == null) {
            return null;
        }
        PackagePart stylesPart = packagePart.getRelatedPart(stylesRelationship);
        if (stylesPart == null) {
            return null;
        }
        return new XWPFStylesShim(stylesPart, context);
    }

    /**
     * @return the numbering referenced by the first numbering relationship of
     * <code>packagePart</code>, or <code>null</code> if there is none or it could not
     * be loaded; in the latter case the failure is recorded as an exception warning
     */
    private XWPFNumbering loadNumbering(PackagePart packagePart) {
        try {
            PackageRelationshipCollection numberingParts =
                    packagePart.getRelationshipsByType(XWPFRelation.NUMBERING.getRelation());
            if (numberingParts == null || numberingParts.size() == 0) {
                return null;
            }
            PackageRelationship numberingRelationship = numberingParts.getRelationship(0);
            if (numberingRelationship == null) {
                return null;
            }
            PackagePart numberingPart = packagePart.getRelatedPart(numberingRelationship);
            if (numberingPart == null) {
                return null;
            }
            return new XWPFNumberingShim(numberingPart);
        } catch (IOException | OpenXML4JException e) {
            //carry on without numbering, but leave a trace, as is done for styles
            recordExceptionWarning(e);
        }
        return null;
    }

    /**
     * This returns all items that might contain embedded objects:
     * main document, headers, footers, comments, etc.
     * Related parts come first, followed by the story and glossary document parts.
     */
    @Override
    protected List<PackagePart> getMainDocumentParts() {
        List<PackagePart> documentParts = getStoryDocumentParts();
        documentParts.addAll(
                copyPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType()));

        List<PackagePart> allParts = new ArrayList<>();
        for (PackagePart documentPart : documentParts) {
            addRelatedParts(documentPart, allParts);
        }
        allParts.addAll(documentParts);
        return allParts;
    }

    /**
     * Appends to <code>relatedParts</code> every part that <code>documentPart</code>
     * references through one of the {@link #MAIN_PART_RELATIONS}.
     */
    private void addRelatedParts(PackagePart documentPart, List<PackagePart> relatedParts) {
        for (String relation : MAIN_PART_RELATIONS) {
            try {
                PackageRelationshipCollection relationships =
                        documentPart.getRelationshipsByType(relation);
                if (relationships == null) {
                    continue;
                }
                for (int i = 0; i < relationships.size(); i++) {
                    relatedParts.add(
                            documentPart.getRelatedPart(relationships.getRelationship(i)));
                }
            } catch (InvalidFormatException ignored) {
                //deliberately best effort: a malformed relationship only means fewer
                //candidate parts here; handleDocumentPart records a warning when it
                //fails to resolve relations while extracting text
            }
        }
    }

    /**
     * @return the parts of the first main story content type that has any parts,
     * as a list owned by the caller; empty list if no main story is found.
     */
    private List<PackagePart> getStoryDocumentParts() {
        for (String contentType : MAIN_STORY_CONTENT_TYPES) {
            List<PackagePart> parts = copyPartsByContentType(contentType);
            if (!parts.isEmpty()) {
                return parts;
            }
        }
        return new ArrayList<>();
    }

    /**
     * @return a new, modifiable list of the package's parts with the given content
     * type; empty if the package reports none (including a <code>null</code> result)
     */
    private List<PackagePart> copyPartsByContentType(String contentType) {
        List<PackagePart> parts = opcPackage.getPartsByContentType(contentType);
        if (parts == null) {
            return new ArrayList<>();
        }
        //copy so that callers may modify the result without touching a list they don't own
        return new ArrayList<>(parts);
    }

    /**
     * Adds the stack trace of <code>e</code> to the metadata as an exception warning.
     */
    private void recordExceptionWarning(Exception e) {
        metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
                ExceptionUtils.getStackTrace(e));
    }
}