blob: de45e284e56c17485f75fa239ad9ea5261f17226 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.openxml4j.opc.PackagePartName;
import org.apache.poi.openxml4j.opc.PackageRelationship;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.HeaderFooter;
import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
import org.apache.poi.xssf.eventusermodel.XSSFReader;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
import org.apache.poi.xssf.model.CommentsTable;
import org.apache.poi.xssf.model.StylesTable;
import org.apache.poi.xssf.usermodel.XSSFComment;
import org.apache.poi.xssf.usermodel.XSSFRelation;
import org.apache.poi.xssf.usermodel.XSSFShape;
import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.xmlbeans.XmlException;
import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
/**
* Allows access to headers/footers from raw xml strings
*/
protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
protected final DataFormatter formatter;
protected final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
protected final Map<String, String> drawingHyperlinks = new HashMap<>();
protected Metadata metadata;
protected ParseContext parseContext;
/**
 * Wraps an event-based XSSF text extractor.
 *
 * @param context   parse context, remembered for later use by this decorator
 * @param extractor POI extractor; it is cast to {@link XSSFEventBasedExcelExtractor}
 * @param locale    locale handed to the extractor and, when non-null, to the
 *                  data formatter; with {@code null} the no-argument
 *                  {@link TikaExcelDataFormatter} is used
 */
public XSSFExcelExtractorDecorator(
        ParseContext context, POIXMLTextExtractor extractor, Locale locale) {
    super(context, extractor);
    this.parseContext = context;
    this.extractor = (XSSFEventBasedExcelExtractor) extractor;
    // The extractor is configured before the formatter is created, as before.
    configureExtractor(this.extractor, locale);
    formatter = (locale == null)
            ? new TikaExcelDataFormatter()
            : new TikaExcelDataFormatter(locale);
}
/**
 * Applies this decorator's settings to the wrapped extractor: text boxes are
 * included according to the configuration, cell results (not formulas) are
 * requested, and the given locale is set.
 *
 * @param extractor extractor to configure; cast to {@link XSSFEventBasedExcelExtractor}
 * @param locale    locale passed through to the extractor
 */
protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
    XSSFEventBasedExcelExtractor eventExtractor = (XSSFEventBasedExcelExtractor) extractor;
    eventExtractor.setIncludeTextBoxes(config.getIncludeShapeBasedContent());
    eventExtractor.setFormulasNotResults(false);
    eventExtractor.setLocale(locale);
}
/**
 * Remembers the metadata and parse context for the sheet-processing code,
 * initialises the protection flag to {@code "false"}, and then hands over to
 * the superclass implementation.
 *
 * @param handler  receiver of the generated content
 * @param metadata metadata object that is updated during extraction
 * @param context  parse context for this run
 */
@Override
public void getXHTML(
        ContentHandler handler, Metadata metadata, ParseContext context)
        throws SAXException, XmlException, IOException, TikaException {
    this.metadata = metadata;
    this.parseContext = context;

    // Start from "not protected"; processSheet sets it to "true" when a
    // sheetProtection element is encountered.
    this.metadata.set(TikaMetadataKeys.PROTECTED, "false");

    super.getXHTML(handler, metadata, context);
}
/**
 * Emits one {@code div} per worksheet containing, in order: the sheet name as
 * {@code h1}, a table with the cell contents, header and footer text, text
 * held in shapes (when enabled in the configuration) and finally the sheet's
 * hyperlinks.
 *
 * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
 */
@Override
protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
        XmlException, IOException {
    final OPCPackage pkg = extractor.getPackage();

    final XSSFReader reader;
    final StylesTable styleTable;
    final XSSFReader.SheetIterator sheets;
    final ReadOnlySharedStringsTable sharedStrings;
    try {
        reader = new XSSFReader(pkg);
        styleTable = reader.getStylesTable();
        sheets = (XSSFReader.SheetIterator) reader.getSheetsData();
        sharedStrings = new ReadOnlySharedStringsTable(pkg);
    } catch (OpenXML4JException e) {
        // Also covers InvalidFormatException, which is a subclass.
        throw new XmlException(e);
    }

    //temporary workaround for POI-61034
    //remove once POI 3.17-beta1 is released
    Set<String> visitedPartNames = new HashSet<>();

    while (sheets.hasNext()) {
        SheetTextAsHTML sheetHandler = new SheetTextAsHTML(xhtml);
        PackagePart currentPart = null;

        try (InputStream sheetStream = sheets.next()) {
            currentPart = sheets.getSheetPart();
            // Set.add returns false for a part name that was already handled.
            if (!visitedPartNames.add(currentPart.getPartName().toString())) {
                continue;
            }

            addDrawingHyperLinks(currentPart);
            sheetParts.add(currentPart);

            CommentsTable sheetComments = sheets.getSheetComments();

            // Open the sheet container and write its name
            xhtml.startElement("div");
            xhtml.element("h1", sheets.getSheetName());

            // Main sheet contents go into a table
            xhtml.startElement("table");
            xhtml.startElement("tbody");
            processSheet(sheetHandler, sheetComments, styleTable, sharedStrings, sheetStream);
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");

        // Headers and footers are only known once the sheet has been
        // processed, which is why they follow the contents.
        for (String headerText : sheetHandler.headers) {
            extractHeaderFooter(headerText, xhtml);
        }
        for (String footerText : sheetHandler.footers) {
            extractHeaderFooter(footerText, xhtml);
        }

        // Text held in shapes, if required
        if (config.getIncludeShapeBasedContent()) {
            processShapes(sheets.getShapes(), xhtml);
        }

        //for now dump sheet hyperlinks at bottom of page
        //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
        //step 1: extract hyperlink info from bottom of page
        //step 2: process as we do now, but with cached hyperlink relationship info
        extractHyperLinks(currentPart, xhtml);

        // Sheet finished
        xhtml.endElement("div");
    }
}
/**
 * Records, in {@code drawingHyperlinks}, the hyperlink relationships of every
 * internal drawing part related to the given sheet, keyed by relationship id.
 *
 * @param sheetPart sheet whose drawing relationships are inspected
 */
protected void addDrawingHyperLinks(PackagePart sheetPart) {
    try {
        for (PackageRelationship drawingRel
                : sheetPart.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
            if (rel_isExternal(drawingRel)) {
                continue;
            }
            PackagePartName drawingName =
                    PackagingURIHelper.createPartName(drawingRel.getTargetURI());
            PackagePart drawingPart = drawingRel.getPackage().getPart(drawingName);
            //parts can go missing, and Excel quietly ignores missing images -- TIKA-2134
            if (drawingPart == null) {
                continue;
            }
            for (PackageRelationship linkRel : drawingPart
                    .getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                drawingHyperlinks.put(linkRel.getId(), linkRel.getTargetURI().toString());
            }
        }
    } catch (InvalidFormatException ignored) {
        // Deliberately swallowed: a failure while collecting hyperlinks on
        // drawings should not cause a parse failure.
    }
}

/** Returns true when the relationship's target mode is anything but INTERNAL. */
private static boolean rel_isExternal(PackageRelationship rel) {
    return rel.getTargetMode() != TargetMode.INTERNAL;
}
/**
 * Writes every hyperlink relationship of the sheet as an {@code a} element
 * whose {@code href} and text are both the target URI.
 *
 * @param sheetPart sheet whose hyperlink relationships are written
 * @param xhtml     output handler
 * @throws SAXException if writing to the handler fails
 */
private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
    try {
        for (PackageRelationship linkRel
                : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
            String target = linkRel.getTargetURI().toString();
            xhtml.startElement("a", "href", target);
            xhtml.characters(target);
            xhtml.endElement("a");
        }
    } catch (InvalidFormatException ignored) {
        // Swallowed on purpose: unreadable hyperlink relationships are skipped.
    }
}
/**
 * Converts a raw header/footer string to text and, when the result is not
 * empty, writes it as a paragraph.
 *
 * @param hf    raw header or footer string
 * @param xhtml output handler
 * @throws SAXException if writing to the handler fails
 */
protected void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
        throws SAXException {
    HeaderFooterFromString parsed = new HeaderFooterFromString(hf);
    String text = ExcelExtractor._extractHeaderFooter(parsed);
    if (text.isEmpty()) {
        return;
    }
    xhtml.element("p", text);
}
/**
 * Writes the text of every simple shape as a paragraph, followed by any
 * hyperlinks attached to that shape. Other shape types are skipped.
 *
 * @param shapes shapes of the current sheet; may be {@code null}
 * @param xhtml  output handler
 * @throws SAXException if writing to the handler fails
 */
private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
    if (shapes == null) {
        return;
    }
    for (XSSFShape candidate : shapes) {
        if (!(candidate instanceof XSSFSimpleShape)) {
            continue;
        }
        XSSFSimpleShape simpleShape = (XSSFSimpleShape) candidate;
        String shapeText = simpleShape.getText();
        if (shapeText != null && !shapeText.isEmpty()) {
            xhtml.element("p", shapeText);
        }
        extractHyperLinksFromShape(simpleShape.getCTShape(), xhtml);
    }
}
/**
 * Writes the click and hover hyperlinks attached to a shape, if any.
 * <p>
 * The two hyperlinks are handled independently: a shape that only carries a
 * hover hyperlink still has it written (previously a missing click hyperlink
 * ended processing before the hover hyperlink was looked at).
 *
 * @param ctShape shape bean to inspect; {@code null} is ignored
 * @param xhtml   output handler
 * @throws SAXException if writing to the handler fails
 */
private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) throws SAXException {
    if (ctShape == null) {
        return;
    }
    CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
    if (nvSpPR == null) {
        return;
    }
    CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
    if (cNvPr == null) {
        return;
    }
    // Click first, then hover, matching the original output order.
    writeDrawingHyperlink(cNvPr.getHlinkClick(), xhtml);
    writeDrawingHyperlink(cNvPr.getHlinkHover(), xhtml);
}

/**
 * Resolves the hyperlink's relationship id against {@code drawingHyperlinks}
 * and writes the URL as an {@code a} element; does nothing when the hyperlink
 * is {@code null} or its id has no recorded target.
 */
private void writeDrawingHyperlink(CTHyperlink ctHyperlink, XHTMLContentHandler xhtml)
        throws SAXException {
    if (ctHyperlink == null) {
        return;
    }
    String url = drawingHyperlinks.get(ctHyperlink.getId());
    if (url == null) {
        return;
    }
    xhtml.startElement("a", "href", url);
    xhtml.characters(url);
    xhtml.endElement("a");
}
/**
 * Parses one sheet's XML, forwarding cell events to the given contents
 * handler, and sets the protection metadata to {@code "true"} when a sheet
 * protection element was seen.
 *
 * @param sheetContentsExtractor receiver of row/cell/header-footer events
 * @param comments               comments of the sheet, passed to the POI handler
 * @param styles                 styles table, passed to the POI handler
 * @param strings                shared strings table, passed to the POI handler
 * @param sheetInputStream       sheet XML; closed here after a successful parse
 * @throws IOException      if reading or closing the stream fails
 * @throws SAXException     if the XML cannot be parsed
 * @throws RuntimeException if a {@link TikaException} is raised inside the
 *                          parsing step; the original exception is kept as the cause
 */
public void processSheet(
        SheetContentsHandler sheetContentsExtractor,
        CommentsTable comments,
        StylesTable styles,
        ReadOnlySharedStringsTable strings,
        InputStream sheetInputStream)
        throws IOException, SAXException {
    InputSource sheetSource = new InputSource(sheetInputStream);
    try {
        XMLReader sheetParser = parseContext.getXMLReader();
        // Wrap the POI handler so that sheet protection can be noticed while
        // the normal cell processing runs unchanged.
        XSSFSheetInterestingPartsCapturer handler =
                new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
                        styles, comments, strings, sheetContentsExtractor, formatter, false));
        sheetParser.setContentHandler(handler);
        sheetParser.parse(sheetSource);
        sheetInputStream.close();
        if (handler.hasProtection) {
            metadata.set(TikaMetadataKeys.PROTECTED, "true");
        }
    } catch (TikaException e) {
        // Chain the cause so the underlying failure is not lost.
        throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
    }
}
/**
 * In Excel files, sheets have things embedded in them,
 * and sheet drawings which have the images.
 * <p>
 * Returns every processed sheet part, each followed by its internal drawing
 * and VML drawing parts, and finally the parts related to the package by the
 * office-document relationship (so that macros can be extracted). Drawing
 * parts that are referenced but missing from the package are skipped rather
 * than added as {@code null}.
 *
 * @return the parts to scan for embedded content; never contains {@code null}
 *         entries from drawing lookups
 * @throws TikaException if a relationship of a sheet cannot be read
 */
@Override
protected List<PackagePart> getMainDocumentParts() throws TikaException {
    List<PackagePart> parts = new ArrayList<PackagePart>();
    for (PackagePart part : sheetParts) {
        // Add the sheet
        parts.add(part);
        // If it has drawings, return those too
        try {
            addInternalRelationTargets(part, XSSFRelation.DRAWINGS.getRelation(), parts);
            addInternalRelationTargets(part, XSSFRelation.VML_DRAWINGS.getRelation(), parts);
        } catch (InvalidFormatException e) {
            throw new TikaException("Broken OOXML file", e);
        }
    }
    //add main document so that macros can be extracted
    //by AbstractOOXMLExtractor
    for (PackagePart part : extractor.getPackage().
            getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT)) {
        parts.add(part);
    }
    return parts;
}

/**
 * Appends to {@code parts} the target part of every internal relationship of
 * the given type on {@code source}; targets that cannot be found in the
 * package are ignored.
 */
private static void addInternalRelationTargets(
        PackagePart source, String relationType, List<PackagePart> parts)
        throws InvalidFormatException {
    for (PackageRelationship rel : source.getRelationshipsByType(relationType)) {
        if (rel.getTargetMode() != TargetMode.INTERNAL) {
            continue;
        }
        PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
        PackagePart target = rel.getPackage().getPart(relName);
        // parts can go missing (see TIKA-2134); do not hand null to callers
        if (target != null) {
            parts.add(target);
        }
    }
}
/**
 * Turns formatted sheet events into HTML: rows become {@code tr}, cells
 * become {@code td} (with any comment appended after a line break), and
 * header/footer strings are collected for later output.
 */
protected static class SheetTextAsHTML implements SheetContentsHandler {
    private final XHTMLContentHandler xhtml;
    /** Raw header strings reported for the sheet, in arrival order. */
    protected List<String> headers = new ArrayList<String>();
    /** Raw footer strings reported for the sheet, in arrival order. */
    protected List<String> footers = new ArrayList<String>();

    protected SheetTextAsHTML(XHTMLContentHandler xhtml) {
        this.xhtml = xhtml;
    }

    public void startRow(int rowNum) {
        try {
            xhtml.startElement("tr");
        } catch (SAXException ignored) {
            // NOTE(review): write failures are dropped here; presumably because
            // this callback cannot throw SAXException -- confirm.
        }
    }

    public void endRow(int rowNum) {
        try {
            xhtml.endElement("tr");
        } catch (SAXException ignored) {
            // NOTE(review): silently dropped, see startRow.
        }
    }

    public void cell(String cellRef, String formattedValue, XSSFComment comment) {
        try {
            xhtml.startElement("td");

            // Cell value first
            if (formattedValue != null) {
                xhtml.characters(formattedValue);
            }

            // Then the comment as "author: text" on a new line
            if (comment != null) {
                xhtml.startElement("br");
                xhtml.endElement("br");
                xhtml.characters(comment.getAuthor());
                xhtml.characters(": ");
                xhtml.characters(comment.getString().getString());
            }

            xhtml.endElement("td");
        } catch (SAXException ignored) {
            // NOTE(review): silently dropped, see startRow.
        }
    }

    public void headerFooter(String text, boolean isHeader, String tagName) {
        List<String> target = isHeader ? headers : footers;
        target.add(text);
    }
}
/**
 * Read-only {@link HeaderFooter} view over a raw header/footer string. The
 * three sections are obtained from the shared {@code hfHelper}; the setters
 * do nothing.
 */
protected static class HeaderFooterFromString implements HeaderFooter {
    private final String rawText;

    protected HeaderFooterFromString(String text) {
        this.rawText = text;
    }

    public String getLeft() {
        return hfHelper.getLeftSection(rawText);
    }

    public String getCenter() {
        return hfHelper.getCenterSection(rawText);
    }

    public String getRight() {
        return hfHelper.getRightSection(rawText);
    }

    public void setLeft(String paramString) {
        // no-op: this view is read-only
    }

    public void setCenter(String paramString) {
        // no-op: this view is read-only
    }

    public void setRight(String paramString) {
        // no-op: this view is read-only
    }
}
/**
 * Captures information on interesting tags, whilst
 * delegating the main work to the formatting handler.
 * <p>
 * Currently the only thing captured is whether a {@code sheetProtection}
 * element was started. The element is recognised by either its local name or
 * its qualified name, so that a namespace-prefixed element (for example
 * {@code x:sheetProtection}) is detected as well as the unprefixed form.
 * Every event is forwarded unchanged to the delegate.
 */
protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler {
    private static final String SHEET_PROTECTION = "sheetProtection";

    private final ContentHandler delegate;
    private boolean hasProtection = false;

    protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
        this.delegate = delegate;
    }

    @Override
    public void startElement(String uri, String localName, String qName,
                             Attributes atts) throws SAXException {
        // localName covers prefixed elements from namespace-aware parsers;
        // qName covers parsers that do not report local names.
        if (SHEET_PROTECTION.equals(localName) || SHEET_PROTECTION.equals(qName)) {
            hasProtection = true;
        }
        delegate.startElement(uri, localName, qName, atts);
    }

    @Override
    public void characters(char[] ch, int start, int length)
            throws SAXException {
        delegate.characters(ch, start, length);
    }

    @Override
    public void endDocument() throws SAXException {
        delegate.endDocument();
    }

    @Override
    public void endElement(String uri, String localName, String qName)
            throws SAXException {
        delegate.endElement(uri, localName, qName);
    }

    @Override
    public void endPrefixMapping(String prefix) throws SAXException {
        delegate.endPrefixMapping(prefix);
    }

    @Override
    public void ignorableWhitespace(char[] ch, int start, int length)
            throws SAXException {
        delegate.ignorableWhitespace(ch, start, length);
    }

    @Override
    public void processingInstruction(String target, String data)
            throws SAXException {
        delegate.processingInstruction(target, data);
    }

    @Override
    public void setDocumentLocator(Locator locator) {
        delegate.setDocumentLocator(locator);
    }

    @Override
    public void skippedEntity(String name) throws SAXException {
        delegate.skippedEntity(name);
    }

    @Override
    public void startDocument() throws SAXException {
        delegate.startDocument();
    }

    @Override
    public void startPrefixMapping(String prefix, String uri)
            throws SAXException {
        delegate.startPrefixMapping(prefix, uri);
    }
}
}