tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.microsoft.ooxml;

 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;

 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
 import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackagePartName;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.ss.usermodel.DataFormatter;
 import org.apache.poi.ss.usermodel.HeaderFooter;
 import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
 import org.apache.poi.xssf.eventusermodel.XSSFReader;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
 import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
 import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
 import org.apache.poi.xssf.model.CommentsTable;
 import org.apache.poi.xssf.model.StylesTable;
 import org.apache.poi.xssf.usermodel.XSSFComment;
 import org.apache.poi.xssf.usermodel.XSSFRelation;
 import org.apache.poi.xssf.usermodel.XSSFShape;
 import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
 import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.xmlbeans.XmlException;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
 import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
 import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
 import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.InputSource;
 import org.xml.sax.Locator;
 import org.xml.sax.SAXException;
 import org.xml.sax.XMLReader;

 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
     /**
      * Allows access to headers/footers from raw xml strings
      */
     protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
     protected final DataFormatter formatter;
     protected final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
     protected final Map<String, String> drawingHyperlinks = new HashMap<>();
     protected Metadata metadata;
     protected ParseContext parseContext;

     /**
      * Wraps an event-based XSSF extractor so that its content can be emitted as XHTML.
      *
      * @param context   parse context, remembered so an XML reader can be obtained later
      * @param extractor POI extractor to decorate; it is cast to
      *                  {@link XSSFEventBasedExcelExtractor}
      * @param locale    locale handed to the extractor and the cell formatter; when
      *                  {@code null} the formatter is built with its no-arg constructor
      */
     public XSSFExcelExtractorDecorator(
             ParseContext context, POIXMLTextExtractor extractor, Locale locale) {
         super(context, extractor);

         this.parseContext = context;
         this.extractor = (XSSFEventBasedExcelExtractor) extractor;
         configureExtractor(this.extractor, locale);

         // Choose the formatter constructor that matches whether a locale was given.
         this.formatter = (locale == null)
                 ? new TikaExcelDataFormatter()
                 : new TikaExcelDataFormatter(locale);
     }

     /**
      * Pushes this decorator's settings onto the underlying extractor: text boxes
      * follow the shape-based-content config flag, cached formula results are
      * requested rather than formulas, and the given locale is applied.
      *
      * @param extractor extractor to configure; must be an {@link XSSFEventBasedExcelExtractor}
      * @param locale    locale passed straight through to the extractor
      */
     protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
         XSSFEventBasedExcelExtractor eventExtractor = (XSSFEventBasedExcelExtractor) extractor;
         eventExtractor.setIncludeTextBoxes(config.getIncludeShapeBasedContent());
         eventExtractor.setFormulasNotResults(false);
         eventExtractor.setLocale(locale);
     }

     /**
      * Remembers the metadata and parse context for this run, initialises the
      * {@code PROTECTED} metadata entry to "false" and then lets the superclass
      * drive the extraction.
      */
     @Override
     public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
             throws SAXException, XmlException, IOException, TikaException {
         this.parseContext = context;
         this.metadata = metadata;

         // Start from "not protected"; sheet parsing overwrites this when it
         // encounters a protection element.
         metadata.set(TikaMetadataKeys.PROTECTED, "false");

         super.getXHTML(handler, metadata, context);
     }

     /**
      * Writes every sheet of the workbook as a {@code div} containing the sheet
      * name, a table with the cell contents, the headers and footers, optional
      * shape text and finally the sheet-level hyperlinks.
      *
      * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
      */
     @Override
     protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
             XmlException, IOException {
         OPCPackage container = extractor.getPackage();

         StylesTable styles;
         ReadOnlySharedStringsTable strings;
         XSSFReader.SheetIterator iter;
         try {
             XSSFReader xssfReader = new XSSFReader(container);
             styles = xssfReader.getStylesTable();

             iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
             strings = new ReadOnlySharedStringsTable(container);
         } catch (OpenXML4JException e) {
             // InvalidFormatException is an OpenXML4JException as well, so this
             // single handler wraps both the same way.
             throw new XmlException(e);
         }

         //temporary workaround for POI-61034
         //remove once POI 3.17-beta1 is released
         Set<String> seen = new HashSet<>();

         while (iter.hasNext()) {

             SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
             PackagePart sheetPart = null;
             try (InputStream stream = iter.next()) {
                 sheetPart = iter.getSheetPart();
                 // add() returns false when the part name was already recorded
                 if (!seen.add(sheetPart.getPartName().toString())) {
                     continue;
                 }

                 addDrawingHyperLinks(sheetPart);
                 sheetParts.add(sheetPart);

                 CommentsTable sheetComments = iter.getSheetComments();

                 // Open the sheet container and emit its name
                 xhtml.startElement("div");
                 xhtml.element("h1", iter.getSheetName());

                 // Cell contents go into a table
                 xhtml.startElement("table");
                 xhtml.startElement("tbody");

                 processSheet(sheetExtractor, sheetComments, styles, strings, stream);
             }
             xhtml.endElement("tbody");
             xhtml.endElement("table");

             // Headers and footers are only known once the sheet has been
             // parsed, which is why they follow the table.
             for (String headerText : sheetExtractor.headers) {
                 extractHeaderFooter(headerText, xhtml);
             }
             for (String footerText : sheetExtractor.footers) {
                 extractHeaderFooter(footerText, xhtml);
             }

             // Text held in shapes, if configured
             if (config.getIncludeShapeBasedContent()) {
                 processShapes(iter.getShapes(), xhtml);
             }

             //for now dump sheet hyperlinks at bottom of page
             //consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
             //step 1: extract hyperlink info from bottom of page
             //step 2: process as we do now, but with cached hyperlink relationship info
             extractHyperLinks(sheetPart, xhtml);

             // Close the sheet container
             xhtml.endElement("div");
         }
     }

     /**
      * Records the hyperlink relationships of every internal drawing attached
      * to the sheet in {@code drawingHyperlinks}, keyed by relationship id and
      * mapped to the target URI string.
      *
      * @param sheetPart sheet whose drawing relationships are inspected
      */
     protected void addDrawingHyperLinks(PackagePart sheetPart) {
         try {
             for (PackageRelationship drawingRel
                     : sheetPart.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
                 if (drawingRel.getTargetMode() != TargetMode.INTERNAL) {
                     continue;
                 }
                 PackagePartName drawingName =
                         PackagingURIHelper.createPartName(drawingRel.getTargetURI());
                 PackagePart drawingPart = drawingRel.getPackage().getPart(drawingName);
                 //parts can go missing, and Excel quietly ignores missing images -- TIKA-2134
                 if (drawingPart == null) {
                     continue;
                 }
                 for (PackageRelationship linkRel : drawingPart.getRelationshipsByType(
                         XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                     drawingHyperlinks.put(linkRel.getId(), linkRel.getTargetURI().toString());
                 }
             }
         } catch (InvalidFormatException e) {
             //swallow
             //an exception trying to extract
             //hyperlinks on drawings should not cause a parse failure
         }

     }


     /**
      * Emits one {@code <a href>} element per hyperlink relationship of the
      * sheet, using the target URI both as the href and as the link text.
      */
     private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
         try {
             for (PackageRelationship linkRel
                     : sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
                 String target = linkRel.getTargetURI().toString();
                 xhtml.startElement("a", "href", target);
                 xhtml.characters(target);
                 xhtml.endElement("a");
             }
         } catch (InvalidFormatException e) {
             //swallow
         }
     }

     /**
      * Converts a raw header/footer string to text and, when that text is not
      * empty, writes it as a paragraph.
      *
      * @param hf    raw header or footer string as collected from the sheet
      * @param xhtml handler receiving the paragraph
      */
     protected void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
             throws SAXException {
         HeaderFooter parsed = new HeaderFooterFromString(hf);
         String content = ExcelExtractor._extractHeaderFooter(parsed);
         if (!content.isEmpty()) {
             xhtml.element("p", content);
         }
     }

     /**
      * Writes the text of every simple shape as a paragraph and then emits
      * the hyperlinks attached to that shape. Other shape types are skipped.
      */
     private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
         if (shapes == null) {
             return;
         }
         for (XSSFShape candidate : shapes) {
             if (!(candidate instanceof XSSFSimpleShape)) {
                 continue;
             }
             XSSFSimpleShape simpleShape = (XSSFSimpleShape) candidate;
             String shapeText = simpleShape.getText();
             if (shapeText != null && !shapeText.isEmpty()) {
                 xhtml.element("p", shapeText);
             }
             extractHyperLinksFromShape(simpleShape.getCTShape(), xhtml);
         }
     }

     /**
      * Emits the click and hover hyperlinks of a shape, when present, as
      * {@code <a href>} elements.
      * <p>
      * The two links are handled independently: a shape that only carries a
      * hover link still gets that link written out.
      *
      * @param ctShape shape bean to inspect; {@code null} is ignored
      * @param xhtml   handler receiving the anchor elements
      */
     private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) throws SAXException {
         if (ctShape == null) {
             return;
         }

         CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
         if (nvSpPR == null) {
             return;
         }

         CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
         if (cNvPr == null) {
             return;
         }

         // A missing click link must not stop the hover link from being
         // looked up, so each one is processed on its own.
         writeDrawingHyperlink(cNvPr.getHlinkClick(), xhtml);
         writeDrawingHyperlink(cNvPr.getHlinkHover(), xhtml);
     }

     /**
      * Resolves a drawing hyperlink through {@code drawingHyperlinks} and
      * writes it as an anchor; does nothing when the link is {@code null}
      * or its id has no recorded target.
      */
     private void writeDrawingHyperlink(CTHyperlink ctHyperlink, XHTMLContentHandler xhtml)
             throws SAXException {
         if (ctHyperlink == null) {
             return;
         }
         String url = drawingHyperlinks.get(ctHyperlink.getId());
         if (url != null) {
             xhtml.startElement("a", "href", url);
             xhtml.characters(url);
             xhtml.endElement("a");
         }
     }

     /**
      * Parses the XML of a single sheet, forwarding cell, row and header/footer
      * events to the supplied contents handler. If a sheet protection element
      * is seen during the parse, the {@code PROTECTED} metadata entry is set
      * to "true". The stream is closed after a successful parse.
      *
      * @param sheetContentsExtractor receiver of the formatted sheet events
      * @param comments               comments of the sheet, passed to the POI handler
      * @param styles                 workbook styles, passed to the POI handler
      * @param strings                shared strings table, passed to the POI handler
      * @param sheetInputStream       raw sheet XML
      * @throws IOException      if reading or closing the stream fails
      * @throws SAXException     if the sheet XML cannot be parsed
      * @throws RuntimeException if a {@link TikaException} is raised inside the
      *                          parsing block; the original exception is kept as the cause
      */
     public void processSheet(
             SheetContentsHandler sheetContentsExtractor,
             CommentsTable comments,
             StylesTable styles,
             ReadOnlySharedStringsTable strings,
             InputStream sheetInputStream)
             throws IOException, SAXException {
         InputSource sheetSource = new InputSource(sheetInputStream);
         try {
             XMLReader sheetParser = parseContext.getXMLReader();
             // Wrap the POI handler so sheet protection can be noticed while
             // all events are still delivered to it.
             XSSFSheetInterestingPartsCapturer handler =
                     new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
                             styles, comments, strings, sheetContentsExtractor, formatter, false));
             sheetParser.setContentHandler(handler);
             sheetParser.parse(sheetSource);
             sheetInputStream.close();

             if (handler.hasProtection) {
                 metadata.set(TikaMetadataKeys.PROTECTED, "true");
             }
         } catch (TikaException e) {
             // Keep the original exception as the cause so its stack trace is not lost.
             throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
         }
     }

     /**
      * In Excel files, sheets have things embedded in them,
      * and sheet drawings which have the images.
      * <p>
      * Returns every processed sheet followed by its internal drawing and VML
      * drawing parts, and finally the parts related by
      * {@code RELATION_OFFICE_DOCUMENT}. Relationship targets that are absent
      * from the package are skipped instead of being added as {@code null}.
      *
      * @return the parts to scan for embedded content
      * @throws TikaException if a relationship of a sheet cannot be read
      */
     @Override
     protected List<PackagePart> getMainDocumentParts() throws TikaException {
         List<PackagePart> parts = new ArrayList<PackagePart>();
         for (PackagePart sheet : sheetParts) {
             // Add the sheet
             parts.add(sheet);

             // If it has drawings, return those too
             try {
                 addInternalTargets(sheet, XSSFRelation.DRAWINGS.getRelation(), parts);
                 addInternalTargets(sheet, XSSFRelation.VML_DRAWINGS.getRelation(), parts);
             } catch (InvalidFormatException e) {
                 throw new TikaException("Broken OOXML file", e);
             }
         }

         //add main document so that macros can be extracted
         //by AbstractOOXMLExtractor
         parts.addAll(extractor.getPackage().
                 getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT));

         return parts;
     }

     /**
      * Appends the internal targets of all relationships of the given type
      * to {@code parts}, leaving out targets that are missing from the package.
      */
     private static void addInternalTargets(
             PackagePart source, String relationType, List<PackagePart> parts)
             throws InvalidFormatException {
         for (PackageRelationship rel : source.getRelationshipsByType(relationType)) {
             if (rel.getTargetMode() != TargetMode.INTERNAL) {
                 continue;
             }
             PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
             PackagePart target = rel.getPackage().getPart(relName);
             //parts can go missing, and Excel quietly ignores missing images -- TIKA-2134
             if (target != null) {
                 parts.add(target);
             }
         }
     }

     /**
      * Turns formatted sheet events into HTML.
      * <p>
      * Rows and cells are written to the XHTML handler as the events arrive;
      * header and footer strings are only collected in {@code headers} and
      * {@code footers}.
      */
     protected static class SheetTextAsHTML implements SheetContentsHandler {
         private XHTMLContentHandler xhtml;
         protected List<String> headers;
         protected List<String> footers;

         protected SheetTextAsHTML(XHTMLContentHandler xhtml) {
             this.headers = new ArrayList<String>();
             this.footers = new ArrayList<String>();
             this.xhtml = xhtml;
         }

         /** Opens a table row. */
         public void startRow(int rowNum) {
             try {
                 xhtml.startElement("tr");
             } catch (SAXException ignored) {
                 // NOTE(review): the failure is dropped; this callback declares
                 // no checked exception.
             }
         }

         /** Closes the current table row. */
         public void endRow(int rowNum) {
             try {
                 xhtml.endElement("tr");
             } catch (SAXException ignored) {
                 // NOTE(review): the failure is dropped; this callback declares
                 // no checked exception.
             }
         }

         /**
          * Writes a table cell holding the formatted value, followed by the
          * cell comment (if any) on a new line.
          */
         public void cell(String cellRef, String formattedValue, XSSFComment comment) {
             try {
                 xhtml.startElement("td");

                 if (formattedValue != null) {
                     xhtml.characters(formattedValue);
                 }
                 if (comment != null) {
                     writeComment(comment);
                 }

                 xhtml.endElement("td");
             } catch (SAXException ignored) {
                 // NOTE(review): the failure is dropped; this callback declares
                 // no checked exception.
             }
         }

         /** Writes a line break and then "author: comment text". */
         private void writeComment(XSSFComment comment) throws SAXException {
             xhtml.startElement("br");
             xhtml.endElement("br");
             xhtml.characters(comment.getAuthor());
             xhtml.characters(": ");
             xhtml.characters(comment.getString().getString());
         }

         /** Stores the raw header or footer text for later output. */
         public void headerFooter(String text, boolean isHeader, String tagName) {
             List<String> target = isHeader ? headers : footers;
             target.add(text);
         }
     }

     /**
      * Read-only {@link HeaderFooter} view over a raw header/footer string.
      * The three sections are extracted on demand through {@code hfHelper};
      * the setters do nothing.
      */
     protected static class HeaderFooterFromString implements HeaderFooter {
         private String text;

         protected HeaderFooterFromString(String text) {
             this.text = text;
         }

         /** Returns the left section of the wrapped string. */
         public String getLeft() {
             return hfHelper.getLeftSection(text);
         }

         /** Returns the center section of the wrapped string. */
         public String getCenter() {
             return hfHelper.getCenterSection(text);
         }

         /** Returns the right section of the wrapped string. */
         public String getRight() {
             return hfHelper.getRightSection(text);
         }

         /** No-op: the wrapped string is never modified. */
         public void setLeft(String paramString) {
         }

         /** No-op: the wrapped string is never modified. */
         public void setCenter(String paramString) {
         }

         /** No-op: the wrapped string is never modified. */
         public void setRight(String paramString) {
         }
     }

     /**
      * Captures information on interesting tags, whilst
      * delegating the main work to the formatting handler.
      * <p>
      * Every SAX event is forwarded unchanged to the wrapped handler. The only
      * information recorded is whether a {@code sheetProtection} element
      * was started.
      */
     protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler {
         private static final String SHEET_PROTECTION = "sheetProtection";

         private ContentHandler delegate;
         private boolean hasProtection = false;

         /**
          * @param delegate handler that receives every event
          */
         protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
             this.delegate = delegate;
         }

         public void startElement(String uri, String localName, String qName,
                                  Attributes atts) throws SAXException {
             // Compare the local name as well as the qualified name, so that a
             // namespace-prefixed element (e.g. "x:sheetProtection") is detected too.
             if (SHEET_PROTECTION.equals(localName) || SHEET_PROTECTION.equals(qName)) {
                 hasProtection = true;
             }
             delegate.startElement(uri, localName, qName, atts);
         }

         public void characters(char[] ch, int start, int length)
                 throws SAXException {
             delegate.characters(ch, start, length);
         }

         public void endDocument() throws SAXException {
             delegate.endDocument();
         }

         public void endElement(String uri, String localName, String qName)
                 throws SAXException {
             delegate.endElement(uri, localName, qName);
         }

         public void endPrefixMapping(String prefix) throws SAXException {
             delegate.endPrefixMapping(prefix);
         }

         public void ignorableWhitespace(char[] ch, int start, int length)
                 throws SAXException {
             delegate.ignorableWhitespace(ch, start, length);
         }

         public void processingInstruction(String target, String data)
                 throws SAXException {
             delegate.processingInstruction(target, data);
         }

         public void setDocumentLocator(Locator locator) {
             delegate.setDocumentLocator(locator);
         }

         public void skippedEntity(String name) throws SAXException {
             delegate.skippedEntity(name);
         }

         public void startDocument() throws SAXException {
             delegate.startDocument();
         }

         public void startPrefixMapping(String prefix, String uri)
                 throws SAXException {
             delegate.startPrefixMapping(prefix, uri);
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.microsoft.ooxml;

	import java.io.IOException;
	import java.io.InputStream;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Locale;
	import java.util.Map;
	import java.util.Set;

	import org.apache.poi.POIXMLTextExtractor;
	import org.apache.poi.hssf.extractor.ExcelExtractor;
	import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
	import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
	import org.apache.poi.openxml4j.opc.OPCPackage;
	import org.apache.poi.openxml4j.opc.PackagePart;
	import org.apache.poi.openxml4j.opc.PackagePartName;
	import org.apache.poi.openxml4j.opc.PackageRelationship;
	import org.apache.poi.openxml4j.opc.PackagingURIHelper;
	import org.apache.poi.openxml4j.opc.TargetMode;
	import org.apache.poi.ss.usermodel.DataFormatter;
	import org.apache.poi.ss.usermodel.HeaderFooter;
	import org.apache.poi.xssf.eventusermodel.ReadOnlySharedStringsTable;
	import org.apache.poi.xssf.eventusermodel.XSSFReader;
	import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler;
	import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler;
	import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor;
	import org.apache.poi.xssf.model.CommentsTable;
	import org.apache.poi.xssf.model.StylesTable;
	import org.apache.poi.xssf.usermodel.XSSFComment;
	import org.apache.poi.xssf.usermodel.XSSFRelation;
	import org.apache.poi.xssf.usermodel.XSSFShape;
	import org.apache.poi.xssf.usermodel.XSSFSimpleShape;
	import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.metadata.TikaMetadataKeys;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
	import org.apache.tika.sax.XHTMLContentHandler;
	import org.apache.xmlbeans.XmlException;
	import org.openxmlformats.schemas.drawingml.x2006.main.CTHyperlink;
	import org.openxmlformats.schemas.drawingml.x2006.main.CTNonVisualDrawingProps;
	import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShape;
	import org.openxmlformats.schemas.drawingml.x2006.spreadsheetDrawing.CTShapeNonVisual;
	import org.xml.sax.Attributes;
	import org.xml.sax.ContentHandler;
	import org.xml.sax.InputSource;
	import org.xml.sax.Locator;
	import org.xml.sax.SAXException;
	import org.xml.sax.XMLReader;

	public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
	/**
	* Allows access to headers/footers from raw xml strings
	*/
	protected static HeaderFooterHelper hfHelper = new HeaderFooterHelper();
	protected final DataFormatter formatter;
	protected final List<PackagePart> sheetParts = new ArrayList<PackagePart>();
	protected final Map<String, String> drawingHyperlinks = new HashMap<>();
	protected Metadata metadata;
	protected ParseContext parseContext;

	/**
	 * Wraps an event-based XSSF extractor so that its content can be emitted as XHTML.
	 *
	 * @param context   parse context, remembered so an XML reader can be obtained later
	 * @param extractor POI extractor to decorate; it is cast to
	 *                  {@link XSSFEventBasedExcelExtractor}
	 * @param locale    locale handed to the extractor and the cell formatter; when
	 *                  {@code null} the formatter is built with its no-arg constructor
	 */
	public XSSFExcelExtractorDecorator(
			ParseContext context, POIXMLTextExtractor extractor, Locale locale) {
		super(context, extractor);

		this.parseContext = context;
		this.extractor = (XSSFEventBasedExcelExtractor) extractor;
		configureExtractor(this.extractor, locale);

		// Choose the formatter constructor that matches whether a locale was given.
		this.formatter = (locale == null)
				? new TikaExcelDataFormatter()
				: new TikaExcelDataFormatter(locale);
	}

	/**
	 * Pushes this decorator's settings onto the underlying extractor: text boxes
	 * follow the shape-based-content config flag, cached formula results are
	 * requested rather than formulas, and the given locale is applied.
	 *
	 * @param extractor extractor to configure; must be an {@link XSSFEventBasedExcelExtractor}
	 * @param locale    locale passed straight through to the extractor
	 */
	protected void configureExtractor(POIXMLTextExtractor extractor, Locale locale) {
		XSSFEventBasedExcelExtractor eventExtractor = (XSSFEventBasedExcelExtractor) extractor;
		eventExtractor.setIncludeTextBoxes(config.getIncludeShapeBasedContent());
		eventExtractor.setFormulasNotResults(false);
		eventExtractor.setLocale(locale);
	}

	/**
	 * Remembers the metadata and parse context for this run, initialises the
	 * {@code PROTECTED} metadata entry to "false" and then lets the superclass
	 * drive the extraction.
	 */
	@Override
	public void getXHTML(ContentHandler handler, Metadata metadata, ParseContext context)
			throws SAXException, XmlException, IOException, TikaException {
		this.parseContext = context;
		this.metadata = metadata;

		// Start from "not protected"; sheet parsing overwrites this when it
		// encounters a protection element.
		metadata.set(TikaMetadataKeys.PROTECTED, "false");

		super.getXHTML(handler, metadata, context);
	}

	/**
	 * Writes every sheet of the workbook as a {@code div} containing the sheet
	 * name, a table with the cell contents, the headers and footers, optional
	 * shape text and finally the sheet-level hyperlinks.
	 *
	 * @see org.apache.poi.xssf.extractor.XSSFExcelExtractor#getText()
	 */
	@Override
	protected void buildXHTML(XHTMLContentHandler xhtml) throws SAXException,
			XmlException, IOException {
		OPCPackage container = extractor.getPackage();

		StylesTable styles;
		ReadOnlySharedStringsTable strings;
		XSSFReader.SheetIterator iter;
		try {
			XSSFReader xssfReader = new XSSFReader(container);
			styles = xssfReader.getStylesTable();

			iter = (XSSFReader.SheetIterator) xssfReader.getSheetsData();
			strings = new ReadOnlySharedStringsTable(container);
		} catch (OpenXML4JException e) {
			// InvalidFormatException is an OpenXML4JException as well, so this
			// single handler wraps both the same way.
			throw new XmlException(e);
		}

		//temporary workaround for POI-61034
		//remove once POI 3.17-beta1 is released
		Set<String> seen = new HashSet<>();

		while (iter.hasNext()) {

			SheetTextAsHTML sheetExtractor = new SheetTextAsHTML(xhtml);
			PackagePart sheetPart = null;
			try (InputStream stream = iter.next()) {
				sheetPart = iter.getSheetPart();
				// add() returns false when the part name was already recorded
				if (!seen.add(sheetPart.getPartName().toString())) {
					continue;
				}

				addDrawingHyperLinks(sheetPart);
				sheetParts.add(sheetPart);

				CommentsTable sheetComments = iter.getSheetComments();

				// Open the sheet container and emit its name
				xhtml.startElement("div");
				xhtml.element("h1", iter.getSheetName());

				// Cell contents go into a table
				xhtml.startElement("table");
				xhtml.startElement("tbody");

				processSheet(sheetExtractor, sheetComments, styles, strings, stream);
			}
			xhtml.endElement("tbody");
			xhtml.endElement("table");

			// Headers and footers are only known once the sheet has been
			// parsed, which is why they follow the table.
			for (String headerText : sheetExtractor.headers) {
				extractHeaderFooter(headerText, xhtml);
			}
			for (String footerText : sheetExtractor.footers) {
				extractHeaderFooter(footerText, xhtml);
			}

			// Text held in shapes, if configured
			if (config.getIncludeShapeBasedContent()) {
				processShapes(iter.getShapes(), xhtml);
			}

			//for now dump sheet hyperlinks at bottom of page
			//consider a double-pass of the inputstream to reunite hyperlinks with cells/textboxes
			//step 1: extract hyperlink info from bottom of page
			//step 2: process as we do now, but with cached hyperlink relationship info
			extractHyperLinks(sheetPart, xhtml);

			// Close the sheet container
			xhtml.endElement("div");
		}
	}

	/**
	 * Records the hyperlink relationships of every internal drawing attached
	 * to the sheet in {@code drawingHyperlinks}, keyed by relationship id and
	 * mapped to the target URI string.
	 *
	 * @param sheetPart sheet whose drawing relationships are inspected
	 */
	protected void addDrawingHyperLinks(PackagePart sheetPart) {
		try {
			for (PackageRelationship drawingRel
					: sheetPart.getRelationshipsByType(XSSFRelation.DRAWINGS.getRelation())) {
				if (drawingRel.getTargetMode() != TargetMode.INTERNAL) {
					continue;
				}
				PackagePartName drawingName =
						PackagingURIHelper.createPartName(drawingRel.getTargetURI());
				PackagePart drawingPart = drawingRel.getPackage().getPart(drawingName);
				//parts can go missing, and Excel quietly ignores missing images -- TIKA-2134
				if (drawingPart == null) {
					continue;
				}
				for (PackageRelationship linkRel : drawingPart.getRelationshipsByType(
						XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
					drawingHyperlinks.put(linkRel.getId(), linkRel.getTargetURI().toString());
				}
			}
		} catch (InvalidFormatException e) {
			//swallow
			//an exception trying to extract
			//hyperlinks on drawings should not cause a parse failure
		}

	}


	/**
	 * Emits one {@code <a href>} element per hyperlink relationship of the
	 * sheet, using the target URI both as the href and as the link text.
	 */
	private void extractHyperLinks(PackagePart sheetPart, XHTMLContentHandler xhtml) throws SAXException {
		try {
			for (PackageRelationship linkRel
					: sheetPart.getRelationshipsByType(XSSFRelation.SHEET_HYPERLINKS.getRelation())) {
				String target = linkRel.getTargetURI().toString();
				xhtml.startElement("a", "href", target);
				xhtml.characters(target);
				xhtml.endElement("a");
			}
		} catch (InvalidFormatException e) {
			//swallow
		}
	}

	/**
	 * Converts a raw header/footer string to text and, when that text is not
	 * empty, writes it as a paragraph.
	 *
	 * @param hf    raw header or footer string as collected from the sheet
	 * @param xhtml handler receiving the paragraph
	 */
	protected void extractHeaderFooter(String hf, XHTMLContentHandler xhtml)
			throws SAXException {
		HeaderFooter parsed = new HeaderFooterFromString(hf);
		String content = ExcelExtractor._extractHeaderFooter(parsed);
		if (!content.isEmpty()) {
			xhtml.element("p", content);
		}
	}

	/**
	 * Writes the text of every simple shape as a paragraph and then emits
	 * the hyperlinks attached to that shape. Other shape types are skipped.
	 */
	private void processShapes(List<XSSFShape> shapes, XHTMLContentHandler xhtml) throws SAXException {
		if (shapes == null) {
			return;
		}
		for (XSSFShape candidate : shapes) {
			if (!(candidate instanceof XSSFSimpleShape)) {
				continue;
			}
			XSSFSimpleShape simpleShape = (XSSFSimpleShape) candidate;
			String shapeText = simpleShape.getText();
			if (shapeText != null && !shapeText.isEmpty()) {
				xhtml.element("p", shapeText);
			}
			extractHyperLinksFromShape(simpleShape.getCTShape(), xhtml);
		}
	}

	/**
	 * Emits the click and hover hyperlinks of a shape, when present, as
	 * {@code <a href>} elements.
	 * <p>
	 * The two links are handled independently: a shape that only carries a
	 * hover link still gets that link written out.
	 *
	 * @param ctShape shape bean to inspect; {@code null} is ignored
	 * @param xhtml   handler receiving the anchor elements
	 */
	private void extractHyperLinksFromShape(CTShape ctShape, XHTMLContentHandler xhtml) throws SAXException {
		if (ctShape == null) {
			return;
		}

		CTShapeNonVisual nvSpPR = ctShape.getNvSpPr();
		if (nvSpPR == null) {
			return;
		}

		CTNonVisualDrawingProps cNvPr = nvSpPR.getCNvPr();
		if (cNvPr == null) {
			return;
		}

		// A missing click link must not stop the hover link from being
		// looked up, so each one is processed on its own.
		writeDrawingHyperlink(cNvPr.getHlinkClick(), xhtml);
		writeDrawingHyperlink(cNvPr.getHlinkHover(), xhtml);
	}

	/**
	 * Resolves a drawing hyperlink through {@code drawingHyperlinks} and
	 * writes it as an anchor; does nothing when the link is {@code null}
	 * or its id has no recorded target.
	 */
	private void writeDrawingHyperlink(CTHyperlink ctHyperlink, XHTMLContentHandler xhtml)
			throws SAXException {
		if (ctHyperlink == null) {
			return;
		}
		String url = drawingHyperlinks.get(ctHyperlink.getId());
		if (url != null) {
			xhtml.startElement("a", "href", url);
			xhtml.characters(url);
			xhtml.endElement("a");
		}
	}

	/**
	 * Parses the XML of a single sheet, forwarding cell, row and header/footer
	 * events to the supplied contents handler. If a sheet protection element
	 * is seen during the parse, the {@code PROTECTED} metadata entry is set
	 * to "true". The stream is closed after a successful parse.
	 *
	 * @param sheetContentsExtractor receiver of the formatted sheet events
	 * @param comments               comments of the sheet, passed to the POI handler
	 * @param styles                 workbook styles, passed to the POI handler
	 * @param strings                shared strings table, passed to the POI handler
	 * @param sheetInputStream       raw sheet XML
	 * @throws IOException      if reading or closing the stream fails
	 * @throws SAXException     if the sheet XML cannot be parsed
	 * @throws RuntimeException if a {@link TikaException} is raised inside the
	 *                          parsing block; the original exception is kept as the cause
	 */
	public void processSheet(
			SheetContentsHandler sheetContentsExtractor,
			CommentsTable comments,
			StylesTable styles,
			ReadOnlySharedStringsTable strings,
			InputStream sheetInputStream)
			throws IOException, SAXException {
		InputSource sheetSource = new InputSource(sheetInputStream);
		try {
			XMLReader sheetParser = parseContext.getXMLReader();
			// Wrap the POI handler so sheet protection can be noticed while
			// all events are still delivered to it.
			XSSFSheetInterestingPartsCapturer handler =
					new XSSFSheetInterestingPartsCapturer(new XSSFSheetXMLHandler(
							styles, comments, strings, sheetContentsExtractor, formatter, false));
			sheetParser.setContentHandler(handler);
			sheetParser.parse(sheetSource);
			sheetInputStream.close();

			if (handler.hasProtection) {
				metadata.set(TikaMetadataKeys.PROTECTED, "true");
			}
		} catch (TikaException e) {
			// Keep the original exception as the cause so its stack trace is not lost.
			throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage(), e);
		}
	}

	/**
	 * In Excel files, sheets have things embedded in them,
	 * and sheet drawings which have the images.
	 * <p>
	 * Returns every processed sheet followed by its internal drawing and VML
	 * drawing parts, and finally the parts related by
	 * {@code RELATION_OFFICE_DOCUMENT}. Relationship targets that are absent
	 * from the package are skipped instead of being added as {@code null}.
	 *
	 * @return the parts to scan for embedded content
	 * @throws TikaException if a relationship of a sheet cannot be read
	 */
	@Override
	protected List<PackagePart> getMainDocumentParts() throws TikaException {
		List<PackagePart> parts = new ArrayList<PackagePart>();
		for (PackagePart sheet : sheetParts) {
			// Add the sheet
			parts.add(sheet);

			// If it has drawings, return those too
			try {
				addInternalTargets(sheet, XSSFRelation.DRAWINGS.getRelation(), parts);
				addInternalTargets(sheet, XSSFRelation.VML_DRAWINGS.getRelation(), parts);
			} catch (InvalidFormatException e) {
				throw new TikaException("Broken OOXML file", e);
			}
		}

		//add main document so that macros can be extracted
		//by AbstractOOXMLExtractor
		parts.addAll(extractor.getPackage().
				getPartsByRelationshipType(RELATION_OFFICE_DOCUMENT));

		return parts;
	}

	/**
	 * Appends the internal targets of all relationships of the given type
	 * to {@code parts}, leaving out targets that are missing from the package.
	 */
	private static void addInternalTargets(
			PackagePart source, String relationType, List<PackagePart> parts)
			throws InvalidFormatException {
		for (PackageRelationship rel : source.getRelationshipsByType(relationType)) {
			if (rel.getTargetMode() != TargetMode.INTERNAL) {
				continue;
			}
			PackagePartName relName = PackagingURIHelper.createPartName(rel.getTargetURI());
			PackagePart target = rel.getPackage().getPart(relName);
			//parts can go missing, and Excel quietly ignores missing images -- TIKA-2134
			if (target != null) {
				parts.add(target);
			}
		}
	}

	/**
	 * Turns formatted sheet events into HTML.
	 * <p>
	 * Rows and cells are written to the XHTML handler as the events arrive;
	 * header and footer strings are only collected in {@code headers} and
	 * {@code footers}.
	 */
	protected static class SheetTextAsHTML implements SheetContentsHandler {
		private XHTMLContentHandler xhtml;
		protected List<String> headers;
		protected List<String> footers;

		protected SheetTextAsHTML(XHTMLContentHandler xhtml) {
			this.headers = new ArrayList<String>();
			this.footers = new ArrayList<String>();
			this.xhtml = xhtml;
		}

		/** Opens a table row. */
		public void startRow(int rowNum) {
			try {
				xhtml.startElement("tr");
			} catch (SAXException ignored) {
				// NOTE(review): the failure is dropped; this callback declares
				// no checked exception.
			}
		}

		/** Closes the current table row. */
		public void endRow(int rowNum) {
			try {
				xhtml.endElement("tr");
			} catch (SAXException ignored) {
				// NOTE(review): the failure is dropped; this callback declares
				// no checked exception.
			}
		}

		/**
		 * Writes a table cell holding the formatted value, followed by the
		 * cell comment (if any) on a new line.
		 */
		public void cell(String cellRef, String formattedValue, XSSFComment comment) {
			try {
				xhtml.startElement("td");

				if (formattedValue != null) {
					xhtml.characters(formattedValue);
				}
				if (comment != null) {
					writeComment(comment);
				}

				xhtml.endElement("td");
			} catch (SAXException ignored) {
				// NOTE(review): the failure is dropped; this callback declares
				// no checked exception.
			}
		}

		/** Writes a line break and then "author: comment text". */
		private void writeComment(XSSFComment comment) throws SAXException {
			xhtml.startElement("br");
			xhtml.endElement("br");
			xhtml.characters(comment.getAuthor());
			xhtml.characters(": ");
			xhtml.characters(comment.getString().getString());
		}

		/** Stores the raw header or footer text for later output. */
		public void headerFooter(String text, boolean isHeader, String tagName) {
			List<String> target = isHeader ? headers : footers;
			target.add(text);
		}
	}

	/**
	 * Read-only {@link HeaderFooter} view over a raw header/footer string.
	 * The three sections are extracted on demand through {@code hfHelper};
	 * the setters do nothing.
	 */
	protected static class HeaderFooterFromString implements HeaderFooter {
		private String text;

		protected HeaderFooterFromString(String text) {
			this.text = text;
		}

		/** Returns the left section of the wrapped string. */
		public String getLeft() {
			return hfHelper.getLeftSection(text);
		}

		/** Returns the center section of the wrapped string. */
		public String getCenter() {
			return hfHelper.getCenterSection(text);
		}

		/** Returns the right section of the wrapped string. */
		public String getRight() {
			return hfHelper.getRightSection(text);
		}

		/** No-op: the wrapped string is never modified. */
		public void setLeft(String paramString) {
		}

		/** No-op: the wrapped string is never modified. */
		public void setCenter(String paramString) {
		}

		/** No-op: the wrapped string is never modified. */
		public void setRight(String paramString) {
		}
	}

	/**
	 * Captures information on interesting tags, whilst
	 * delegating the main work to the formatting handler.
	 * <p>
	 * Every SAX event is forwarded unchanged to the wrapped handler. The only
	 * information recorded is whether a {@code sheetProtection} element
	 * was started.
	 */
	protected static class XSSFSheetInterestingPartsCapturer implements ContentHandler {
		private static final String SHEET_PROTECTION = "sheetProtection";

		private ContentHandler delegate;
		private boolean hasProtection = false;

		/**
		 * @param delegate handler that receives every event
		 */
		protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
			this.delegate = delegate;
		}

		public void startElement(String uri, String localName, String qName,
				Attributes atts) throws SAXException {
			// Compare the local name as well as the qualified name, so that a
			// namespace-prefixed element (e.g. "x:sheetProtection") is detected too.
			if (SHEET_PROTECTION.equals(localName) || SHEET_PROTECTION.equals(qName)) {
				hasProtection = true;
			}
			delegate.startElement(uri, localName, qName, atts);
		}

		public void characters(char[] ch, int start, int length)
				throws SAXException {
			delegate.characters(ch, start, length);
		}

		public void endDocument() throws SAXException {
			delegate.endDocument();
		}

		public void endElement(String uri, String localName, String qName)
				throws SAXException {
			delegate.endElement(uri, localName, qName);
		}

		public void endPrefixMapping(String prefix) throws SAXException {
			delegate.endPrefixMapping(prefix);
		}

		public void ignorableWhitespace(char[] ch, int start, int length)
				throws SAXException {
			delegate.ignorableWhitespace(ch, start, length);
		}

		public void processingInstruction(String target, String data)
				throws SAXException {
			delegate.processingInstruction(target, data);
		}

		public void setDocumentLocator(Locator locator) {
			delegate.setDocumentLocator(locator);
		}

		public void skippedEntity(String name) throws SAXException {
			delegate.skippedEntity(name);
		}

		public void startDocument() throws SAXException {
			delegate.startDocument();
		}

		public void startPrefixMapping(String prefix, String uri)
				throws SAXException {
			delegate.startPrefixMapping(prefix, uri);
		}
	}
	}