tika-parsers/src/main/java/org/apache/tika/parser/epub/EpubParser.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.epub;

 import static java.nio.charset.StandardCharsets.UTF_8;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.net.URLDecoder;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import java.util.zip.ZipException;

 import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
 import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
 import org.apache.commons.compress.archivers.zip.ZipFile;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.tika.config.Field;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.utils.ZipSalvager;
 import org.apache.tika.parser.xml.DcXMLParser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.OfflineContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.ParserUtils;
 import org.apache.tika.utils.XMLReaderUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;

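 /**
  * Parser for EPUB ({@code application/epub+zip}) and iBooks
  * ({@code application/x-ibooks+zip}) zip containers. Package metadata is
  * handed to a metadata parser and the content documents to a content
  * parser; both delegates can be replaced through the setters.
  */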
 public class EpubParser extends AbstractParser {

     /** Serialization version identifier. */
     private static final long serialVersionUID = 215176772484050550L;

     /** Media types claimed by this parser: EPUB and iBooks zip containers. */
     private static final Set<MediaType> SUPPORTED_TYPES =
             Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
                     MediaType.application("epub+zip"),
                     MediaType.application("x-ibooks+zip"))));

     /** Delegate used for the metadata entries ({@code metadata.xml} and the OPF package file). */
     private Parser meta = new DcXMLParser();

     /** Delegate used for the individual content documents. */
     private Parser content = new EpubContentParser();

     /**
      * @return the parser currently used for the metadata entries
      */
     public Parser getMetaParser() {
         return this.meta;
     }

     /**
      * Replaces the parser used for the metadata entries.
      *
      * @param meta parser to use from now on
      */
     public void setMetaParser(Parser meta) {
         this.meta = meta;
     }

     /**
      * @return the parser currently used for the content documents
      */
     public Parser getContentParser() {
         return this.content;
     }

     /**
      * Replaces the parser used for the content documents.
      *
      * @param content parser to use from now on
      */
     public void setContentParser(Parser content) {
         this.content = content;
     }

     /**
      * Returns the fixed, unmodifiable set of media types handled by this parser.
      *
      * @param context parse context (not consulted)
      * @return the supported media types
      */
     @Override
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }

     /**
      * When {@code true}, {@code parse} walks the archive sequentially as a zip
      * stream; when {@code false} (the default) it opens the archive as a
      * {@code ZipFile} and follows the package's reading order.
      * NOTE(review): presumably injectable through Tika configuration via
      * {@code @Field} -- confirm.
      */
     @Field
     boolean streaming = false;

     /**
      * Parses an EPUB container and writes its content as a single XHTML document.
      *
      * @param stream   the EPUB archive
      * @param handler  receiver of the XHTML SAX events
      * @param metadata metadata object to populate
      * @param context  parse context passed on to the delegate parsers
      * @throws IOException   if reading the archive fails; the XHTML document is
      *                       still ended before the exception is rethrown
      * @throws SAXException  if the handler or an XML parse fails
      * @throws TikaException if a delegate parser fails
      */
     @Override
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
         // Because an EPub file is often made up of multiple XHTML files,
         //  we need explicit control over the start and end of the document
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
         ContentHandler childHandler = new EmbeddedContentHandler(
                 new BodyContentHandler(xhtml));

         // Remember an I/O failure so the document can be closed before rethrowing
         IOException caughtException = null;
         try {
             if (streaming) {
                 streamingParse(stream, childHandler, metadata, context);
             } else {
                 bufferedParse(stream, childHandler, xhtml, metadata, context);
             }
         } catch (IOException e) {
             caughtException = e;
         }

         // Finish everything
         xhtml.endDocument();
         if (caughtException != null) {
             throw caughtException;
         }
     }

     /**
      * Reads the archive sequentially and dispatches each entry by name: the
      * {@code mimetype} entry updates the content type, {@code metadata.xml} and
      * {@code *.opf} go to the metadata parser, and (x)html/xml entries go to the
      * content parser. Extensions are matched case-insensitively, in line with
      * the OPF lookup in {@code getRoot}.
      *
      * @param stream      the zip data to read
      * @param bodyHandler receiver of the content documents' SAX events
      * @param metadata    metadata object to populate
      * @param context     parse context passed on to the delegate parsers
      */
     private void streamingParse(InputStream stream, ContentHandler bodyHandler,
                                 Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
         // Not closed here: closing the zip stream would also close the caller's stream
         ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);

         for (ZipArchiveEntry entry = zip.getNextZipEntry(); entry != null;
                 entry = zip.getNextZipEntry()) {
             String name = entry.getName();
             String lcName = name.toLowerCase(Locale.US);
             if ("mimetype".equals(name)) {
                 updateMimeType(zip, metadata);
             } else if ("metadata.xml".equals(name) || lcName.endsWith(".opf")) {
                 meta.parse(zip, new DefaultHandler(), metadata, context);
             } else if (lcName.endsWith(".htm") ||
                     lcName.endsWith(".html") ||
                     lcName.endsWith(".xhtml") ||
                     lcName.endsWith(".xml")) {
                 content.parse(zip, bodyHandler, metadata, context);
             }
         }
     }

     /**
      * Reads the whole stream as UTF-8 text and stores the trimmed value as the
      * content type of {@code metadata}.
      *
      * @param is       stream positioned on the {@code mimetype} entry
      * @param metadata metadata object whose content type is overwritten
      * @throws IOException if the stream cannot be read
      */
     private void updateMimeType(InputStream is, Metadata metadata) throws IOException {
         String raw = IOUtils.toString(is, UTF_8);
         // the entry frequently carries trailing newlines
         String contentType = (raw == null) ? null : raw.trim();
         metadata.set(Metadata.CONTENT_TYPE, contentType);
     }

     /**
      * Parses the archive through a random-access {@code ZipFile}. An already
      * open {@code ZipFile} container on a {@code TikaInputStream} is reused;
      * otherwise the stream's backing file is opened. If the file cannot be
      * opened as a zip, the failure is recorded in {@code metadata} and a
      * salvage attempt is made.
      *
      * @param stream      the EPUB archive
      * @param bodyHandler receiver of the content documents' SAX events
      * @param xhtml       the enclosing XHTML document, used for embedded files
      * @param metadata    metadata object to populate
      * @param context     parse context passed on to the delegate parsers
      */
     private void bufferedParse(InputStream stream,
                                ContentHandler bodyHandler, XHTMLContentHandler xhtml,
                                Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
         TikaInputStream tis;
         TemporaryResources temporaryResources = null;
         if (TikaInputStream.isTikaInputStream(stream)) {
             tis = TikaInputStream.cast(stream);
             Object openContainer = tis.getOpenContainer();
             if (openContainer instanceof ZipFile) {
                 bufferedParseZipFile(
                         (ZipFile) openContainer,
                         bodyHandler, xhtml, metadata, context, true);
                 return;
             }
         } else {
             temporaryResources = new TemporaryResources();
             tis = TikaInputStream.get(new CloseShieldInputStream(stream), temporaryResources);
         }
         try {
             ZipFile zipFile;
             try {
                 zipFile = new ZipFile(tis.getPath().toFile());
             } catch (ZipException e) {
                 ParserUtils.recordParserFailure(this, e, metadata);
                 trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
                 return;
             }
             try {
                 bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true);
             } finally {
                 zipFile.close();
             }
         } finally {
             // Only close the wrapper we created ourselves, and only once the zip
             // file opened on its path is no longer in use (closing presumably
             // releases the spooled temporary file -- confirm against TikaInputStream).
             if (temporaryResources != null) {
                 tis.close();
             }
         }
     }

     /**
      * Attempts to recover content from a zip that could not be opened: the
      * archive is copied through {@code ZipSalvager} into a temporary file, which
      * is then parsed non-strictly as a {@code ZipFile}. If that parse reports
      * failure, the salvaged file is parsed in streaming mode instead.
      *
      * @param brokenZip   path of the unreadable archive
      * @param bodyHandler receiver of the content documents' SAX events
      * @param xhtml       the enclosing XHTML document, used for embedded files
      * @param metadata    metadata object to populate
      * @param context     parse context passed on to the delegate parsers
      */
     private void trySalvage(Path brokenZip, ContentHandler bodyHandler,
                                XHTMLContentHandler xhtml,
                                Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
         try (TemporaryResources resources = new TemporaryResources()) {
             Path salvaged = resources.createTempFile();
             ZipSalvager.salvageCopy(brokenZip.toFile(), salvaged.toFile());
             boolean success;
             try (ZipFile zipFile = new ZipFile(salvaged.toFile())) {
                 success = bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false);
             }
             if (!success) {
                 try (InputStream is = TikaInputStream.get(salvaged)) {
                     // Use the body-filtering handler, exactly as parse() does for
                     // streaming mode, instead of the raw document handler.
                     streamingParse(is, bodyHandler, metadata, context);
                 }
             }
         }
     }

     /**
      * Parses an opened EPUB archive: locates the root OPF, feeds it to the
      * metadata parser, reads the manifest and spine, parses the spine's
      * (x)html/xml items in order through the content parser, and finally hands
      * the remaining manifest items to the embedded document extractor.
      *
      * @param zipFile     the opened archive
      * @param bodyHandler receiver of the content documents' SAX events
      * @param xhtml       the enclosing XHTML document, used for embedded files
      * @param metadata    metadata object to populate
      * @param context     parse context passed on to the delegate parsers
      * @param isStrict    if {@code true}, every spine item must resolve to a
      *                    readable entry or nothing is parsed
      * @return {@code false} if no readable root OPF or no spine items were found,
      *         or if the strict check failed; {@code true} otherwise
      */
     private boolean bufferedParseZipFile(ZipFile zipFile,
                                          ContentHandler bodyHandler, XHTMLContentHandler xhtml,
                                          Metadata metadata, ParseContext context,
                                          boolean isStrict) throws IOException, TikaException, SAXException {
         String rootOPF = getRoot(zipFile, context);
         if (rootOPF == null) {
             return false;
         }
         ZipArchiveEntry opfEntry = zipFile.getEntry(rootOPF);
         if (opfEntry == null || !zipFile.canReadEntryData(opfEntry)) {
             return false;
         }
         // The OPF is read twice: once for metadata, once for manifest/spine
         try (InputStream is = zipFile.getInputStream(opfEntry)) {
             meta.parse(is, new DefaultHandler(), metadata, context);
         }

         ContentOrderScraper contentOrderScraper = new ContentOrderScraper();
         try (InputStream is = zipFile.getInputStream(opfEntry)) {
             XMLReaderUtils.parseSAX(is,
                     new OfflineContentHandler(contentOrderScraper), context);
         }
         //if no content items, false
         if (contentOrderScraper.contentItems.isEmpty()) {
             return false;
         }
         // hrefs in the manifest are resolved against the OPF's directory
         String relativePath = "";
         int lastSlash = rootOPF.lastIndexOf('/');
         if (lastSlash > -1) {
             relativePath = rootOPF.substring(0, lastSlash + 1);
         }

         if (isStrict) {
             int found = 0;
             for (String id : contentOrderScraper.contentItems) {
                 HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
                 if (hRefMediaPair != null && hRefMediaPair.href != null) {
                     ZipArchiveEntry candidate = zipFile.getEntry(relativePath + hRefMediaPair.href);
                     if (candidate != null && zipFile.canReadEntryData(candidate)) {
                         found++;
                     }
                 }
             }
             //if not perfect match btwn items and readable items
             //return false
             if (found != contentOrderScraper.contentItems.size()) {
                 return false;
             }
         }

         extractMetadata(zipFile, metadata, context);
         Set<String> processed = new HashSet<>();
         for (String id : contentOrderScraper.contentItems) {
             HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
             if (hRefMediaPair == null || hRefMediaPair.href == null) {
                 continue;
             }
             //we need to test for xhtml/xml because the content parser
             //expects that.
             boolean shouldParse;
             if (hRefMediaPair.media != null) {
                 shouldParse = hRefMediaPair.media.toLowerCase(Locale.US).contains("html");
             } else {
                 String href = hRefMediaPair.href.toLowerCase(Locale.US);
                 shouldParse = href.endsWith("htm") || href.endsWith("html") || href.endsWith(".xml");
             }
             if (!shouldParse) {
                 continue;
             }
             ZipArchiveEntry contentEntry = zipFile.getEntry(relativePath + hRefMediaPair.href);
             // skip entries whose data cannot be read (relevant in non-strict mode)
             if (contentEntry != null && zipFile.canReadEntryData(contentEntry)) {
                 try (InputStream is = zipFile.getInputStream(contentEntry)) {
                     content.parse(is, bodyHandler, metadata, context);
                     processed.add(id);
                 }
             }
         }

         //now handle embedded files
         EmbeddedDocumentExtractor embeddedDocumentExtractor =
                 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
         for (Map.Entry<String, HRefMediaPair> item : contentOrderScraper.locationMap.entrySet()) {
             if (processed.contains(item.getKey())) {
                 continue;
             }
             HRefMediaPair hRefMediaPair = item.getValue();
             if (shouldHandleEmbedded(hRefMediaPair.media)) {
                 handleEmbedded(zipFile, relativePath,
                         hRefMediaPair, embeddedDocumentExtractor, xhtml, metadata);
             }
         }
         return true;
     }

     /**
      * Decides whether a manifest item should be passed to the embedded document
      * extractor, based on its declared media type. Items without a media type
      * are handled; css, svg, plain xml, ibooks and NCX types are skipped.
      *
      * @param media declared media type, may be {@code null}
      * @return {@code true} if the item should be handled as an embedded file
      */
     private boolean shouldHandleEmbedded(String media) {
         if (media == null) {
             return true;
         }
         String lc = media.toLowerCase(Locale.US);
         boolean excluded = lc.contains("css")
                 || lc.contains("svg")
                 || lc.endsWith("/xml")
                 || lc.contains("x-ibooks")
                 || lc.equals("application/x-dtbncx+xml");
         return !excluded;
     }

     /**
      * Passes one manifest item to the embedded document extractor, wrapped in a
      * {@code <div class="embedded">} element. Items without an href, without a
      * readable zip entry, or rejected by the extractor are skipped silently; a
      * failure to open the entry is recorded in the parent metadata.
      *
      * @param zipFile                   the opened archive
      * @param relativePath              directory prefix the href is resolved against
      * @param hRefMediaPair             href and media type of the item
      * @param embeddedDocumentExtractor extractor that parses the item
      * @param xhtml                     the enclosing XHTML document
      * @param parentMetadata            metadata of the container
      */
     private void handleEmbedded(ZipFile zipFile, String relativePath,
                                 HRefMediaPair hRefMediaPair,
                                 EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                 XHTMLContentHandler xhtml, Metadata parentMetadata) throws IOException, SAXException {
         String href = hRefMediaPair.href;
         if (href == null) {
             return;
         }

         ZipArchiveEntry entry = zipFile.getEntry(relativePath + href);
         boolean readable = entry != null && zipFile.canReadEntryData(entry);
         if (!readable) {
             return;
         }

         Metadata embeddedMetadata = new Metadata();
         String media = hRefMediaPair.media;
         if (StringUtils.isNotBlank(media)) {
             embeddedMetadata.set(Metadata.CONTENT_TYPE, media);
         }
         if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
             return;
         }

         TikaInputStream embeddedStream;
         try {
             embeddedStream = TikaInputStream.get(zipFile.getInputStream(entry));
         } catch (IOException e) {
             //store this exception in the parent's metadata
             EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
             return;
         }

         xhtml.startElement("div", "class", "embedded");
         try {
             embeddedDocumentExtractor.parseEmbedded(
                     embeddedStream,
                     new EmbeddedContentHandler(xhtml),
                     embeddedMetadata, false);
         } finally {
             IOUtils.closeQuietly(embeddedStream);
         }
         xhtml.endElement("div");
     }

     /**
      * Reads the optional top-level {@code mimetype} and {@code metadata.xml}
      * entries: the former sets the content type, the latter is passed to the
      * metadata parser. Missing or unreadable entries are ignored.
      *
      * @param zipFile  the opened archive
      * @param metadata metadata object to populate
      * @param context  parse context passed on to the metadata parser
      */
     private void extractMetadata(ZipFile zipFile, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
         ZipArchiveEntry mimeEntry = zipFile.getEntry("mimetype");
         if (mimeEntry != null && zipFile.canReadEntryData(mimeEntry)) {
             try (InputStream mimeStream = zipFile.getInputStream(mimeEntry)) {
                 updateMimeType(mimeStream, metadata);
             }
         }

         ZipArchiveEntry metaEntry = zipFile.getEntry("metadata.xml");
         if (metaEntry != null && zipFile.canReadEntryData(metaEntry)) {
             try (InputStream metaStream = zipFile.getInputStream(metaEntry)) {
                 meta.parse(metaStream, new DefaultHandler(), metadata, context);
             }
         }
     }

     /**
      * Determines the path of the root OPF package file. If
      * {@code META-INF/container.xml} exists, the path is whatever its
      * {@code rootfile} element declares (possibly {@code null}); otherwise the
      * first readable entry whose name ends in {@code .opf} (case-insensitive)
      * is used.
      *
      * @param zipFile the opened archive
      * @param context parse context used for the SAX parse of the container file
      * @return the root OPF path, or {@code null} if none could be determined
      */
     private String getRoot(ZipFile zipFile, ParseContext context) throws IOException, TikaException, SAXException {
         ZipArchiveEntry container = zipFile.getEntry("META-INF/container.xml");
         if (container == null) {
             // no container descriptor: fall back to scanning the entry names
             Enumeration<ZipArchiveEntry> entries = zipFile.getEntries();
             while (entries.hasMoreElements()) {
                 ZipArchiveEntry candidate = entries.nextElement();
                 boolean isOpf = candidate.getName().toLowerCase(Locale.US).endsWith(".opf");
                 if (isOpf && zipFile.canReadEntryData(candidate)) {
                     return candidate.getName();
                 }
             }
             return null;
         }

         RootFinder rootFinder = new RootFinder();
         try (InputStream containerStream = zipFile.getInputStream(container)) {
             XMLReaderUtils.parseSAX(containerStream, new OfflineContentHandler(rootFinder), context);
         }
         return rootFinder.root;
     }

     /**
      * SAX handler that records the {@code full-path} attribute of the first
      * {@code rootfile} element that supplies one. Later {@code rootfile}
      * elements (additional renditions) no longer overwrite it, because the OCF
      * container format treats the first as the default rendition.
      */
     private static class RootFinder extends DefaultHandler {
         /** Path of the root package file, or {@code null} if none was seen. */
         String root = null;

         @Override
         public void startElement(
                 String uri, String localName, String name, Attributes atts)
                 throws SAXException {
             if (root == null && "rootfile".equalsIgnoreCase(localName)) {
                 root = XMLReaderUtils.getAttrValue("full-path", atts);
             }
         }
     }

     /**
      * SAX handler for the OPF package file. It collects the manifest items
      * (id to href/media-type) and the spine's {@code idref}s in document order.
      */
     private static class ContentOrderScraper extends DefaultHandler {

         /** Manifest items keyed by their id. */
         Map<String, HRefMediaPair> locationMap = new HashMap<>();
         /** Spine idrefs in the order they appear. */
         List<String> contentItems = new ArrayList<>();
         boolean inManifest = false;
         boolean inSpine = false;

         @Override
         public void startElement(
                 String uri, String localName, String name, Attributes atts)
                 throws SAXException {
             if ("manifest".equalsIgnoreCase(localName)) {
                 inManifest = true;
             } else if ("spine".equalsIgnoreCase(localName)) {
                 inSpine = true;
             }
             if (inManifest && "item".equalsIgnoreCase(localName)) {
                 String id = XMLReaderUtils.getAttrValue("id", atts);
                 String href = XMLReaderUtils.getAttrValue("href", atts);
                 String mime = XMLReaderUtils.getAttrValue("media-type", atts);
                 if (id != null && href != null) {
                     locationMap.put(id, new HRefMediaPair(decodeHref(href), mime));
                 }
             }
             if (inSpine && "itemRef".equalsIgnoreCase(localName)) {
                 String id = XMLReaderUtils.getAttrValue("idref", atts);
                 if (id != null) {
                     contentItems.add(id);
                 }
             }
         }

         @Override
         public void endElement(
                 String uri, String localName, String name)
                 throws SAXException {
             if ("manifest".equalsIgnoreCase(localName)) {
                 inManifest = false;
             } else if ("spine".equalsIgnoreCase(localName)) {
                 inSpine = false;
             }
         }

         /**
          * Percent-decodes a manifest href so it can be matched against zip
          * entry names. Falls back to the raw href if it cannot be decoded.
          */
         private static String decodeHref(String href) {
             try {
                 // URLDecoder implements form decoding, where '+' means a space.
                 // Escape literal plus signs first so that they survive decoding.
                 return URLDecoder.decode(href.replace("+", "%2B"), UTF_8.name());
             } catch (UnsupportedEncodingException | IllegalArgumentException e) {
                 // UTF-8 is always available; a malformed %-escape is kept verbatim
                 return href;
             }
         }
     }
     /** Immutable pair of a manifest item's href and its declared media type. */
     private static class HRefMediaPair {
         private final String href;
         private final String media;

         HRefMediaPair(String href, String media) {
             this.href = href;
             this.media = media;
         }

         /** Renders both values for debugging; {@code null} values print as "null". */
         @Override
         public String toString() {
             StringBuilder sb = new StringBuilder("HRefMediaPair{");
             sb.append("href='").append(href).append('\'');
             sb.append(", media='").append(media).append('\'');
             sb.append('}');
             return sb.toString();
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.epub;

	import static java.nio.charset.StandardCharsets.UTF_8;

	import java.io.IOException;
	import java.io.InputStream;
	import java.io.UnsupportedEncodingException;
	import java.net.URLDecoder;
	import java.nio.file.Path;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.Enumeration;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Locale;
	import java.util.Map;
	import java.util.Set;
	import java.util.zip.ZipException;

	import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
	import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
	import org.apache.commons.compress.archivers.zip.ZipFile;
	import org.apache.commons.io.IOUtils;
	import org.apache.commons.io.input.CloseShieldInputStream;
	import org.apache.commons.lang3.StringUtils;
	import org.apache.tika.config.Field;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.extractor.EmbeddedDocumentExtractor;
	import org.apache.tika.extractor.EmbeddedDocumentUtil;
	import org.apache.tika.io.TemporaryResources;
	import org.apache.tika.io.TikaInputStream;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.mime.MediaType;
	import org.apache.tika.parser.AbstractParser;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.Parser;
	import org.apache.tika.parser.utils.ZipSalvager;
	import org.apache.tika.parser.xml.DcXMLParser;
	import org.apache.tika.sax.BodyContentHandler;
	import org.apache.tika.sax.EmbeddedContentHandler;
	import org.apache.tika.sax.OfflineContentHandler;
	import org.apache.tika.sax.XHTMLContentHandler;
	import org.apache.tika.utils.ParserUtils;
	import org.apache.tika.utils.XMLReaderUtils;
	import org.xml.sax.Attributes;
	import org.xml.sax.ContentHandler;
	import org.xml.sax.SAXException;
	import org.xml.sax.helpers.DefaultHandler;

	/**
	* Epub parser
	*/
	public class EpubParser extends AbstractParser {

	/** Serial version UID */
	private static final long serialVersionUID = 215176772484050550L;

	private static final Set<MediaType> SUPPORTED_TYPES =
	Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
	MediaType.application("epub+zip"),
	MediaType.application("x-ibooks+zip")
	)));

	private Parser meta = new DcXMLParser();

	private Parser content = new EpubContentParser();

	public Parser getMetaParser() {
	return meta;
	}

	public void setMetaParser(Parser meta) {
	this.meta = meta;
	}

	public Parser getContentParser() {
	return content;
	}

	public void setContentParser(Parser content) {
	this.content = content;
	}

	public Set<MediaType> getSupportedTypes(ParseContext context) {
	return SUPPORTED_TYPES;
	}

	@Field
	boolean streaming = false;

	public void parse(
	InputStream stream, ContentHandler handler,
	Metadata metadata, ParseContext context)
	throws IOException, SAXException, TikaException {
	// Because an EPub file is often made up of multiple XHTML files,
	// we need explicit control over the start and end of the document
	XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
	xhtml.startDocument();
	IOException caughtException = null;
	ContentHandler childHandler = new EmbeddedContentHandler(
	new BodyContentHandler(xhtml));
	if (streaming) {
	try {
	streamingParse(stream, childHandler, metadata, context);
	} catch (IOException e) {
	caughtException = e;
	}
	} else {
	try {
	bufferedParse(stream, childHandler, xhtml, metadata, context);
	} catch (IOException e) {
	caughtException = e;
	}
	}
	// Finish everything
	xhtml.endDocument();
	if (caughtException != null) {
	throw caughtException;
	}
	}

	private void streamingParse(InputStream stream, ContentHandler bodyHandler,
	Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
	ZipArchiveInputStream zip = new ZipArchiveInputStream(stream);

	ZipArchiveEntry entry = zip.getNextZipEntry();
	while (entry != null) {
	if (entry.getName().equals("mimetype")) {
	updateMimeType(zip, metadata);
	} else if (entry.getName().equals("metadata.xml")) {
	meta.parse(zip, new DefaultHandler(), metadata, context);
	} else if (entry.getName().endsWith(".opf")) {
	meta.parse(zip, new DefaultHandler(), metadata, context);
	} else if (entry.getName().endsWith(".htm") \|\|
	entry.getName().endsWith(".html") \|\|
	entry.getName().endsWith(".xhtml") \|\|
	entry.getName().endsWith(".xml")) {
	content.parse(zip, bodyHandler, metadata, context);
	}
	entry = zip.getNextZipEntry();
	}
	}

	private void updateMimeType(InputStream is, Metadata metadata) throws IOException {
	String type = IOUtils.toString(is, UTF_8);
	//often has trailing new lines
	if (type != null) {
	type = type.trim();
	}
	metadata.set(Metadata.CONTENT_TYPE, type);

	}

	private void bufferedParse(InputStream stream,
	ContentHandler bodyHandler, XHTMLContentHandler xhtml,
	Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
	TikaInputStream tis;
	TemporaryResources temporaryResources = null;
	if (TikaInputStream.isTikaInputStream(stream)) {
	tis = TikaInputStream.cast(stream);
	if (tis.getOpenContainer() instanceof ZipFile) {
	bufferedParseZipFile(
	(ZipFile)tis.getOpenContainer(),
	bodyHandler, xhtml, metadata, context, true);
	return;
	}
	} else {
	temporaryResources = new TemporaryResources();
	tis = TikaInputStream.get(new CloseShieldInputStream(stream), temporaryResources);
	}
	ZipFile zipFile = null;
	try {
	zipFile = new ZipFile(tis.getPath().toFile());
	} catch (ZipException e) {
	ParserUtils.recordParserFailure(this, e, metadata);
	trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
	return;
	} finally {
	//if we had to wrap tis
	if (temporaryResources != null) {
	tis.close();
	}
	}
	try {
	bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true);
	} finally {
	zipFile.close();
	}
	}

	private void trySalvage(Path brokenZip, ContentHandler bodyHandler,
	XHTMLContentHandler xhtml,
	Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
	TemporaryResources resources = new TemporaryResources();
	try {
	Path salvaged = resources.createTempFile();
	ZipSalvager.salvageCopy(brokenZip.toFile(), salvaged.toFile());
	boolean success = false;
	try (ZipFile zipFile = new ZipFile(salvaged.toFile())) {
	success = bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false);
	}
	if (! success) {
	try (InputStream is = TikaInputStream.get(salvaged)) {
	streamingParse(is, xhtml, metadata, context);
	}
	}
	} finally {
	resources.close();
	}
	}

	private boolean bufferedParseZipFile(ZipFile zipFile,
	ContentHandler bodyHandler, XHTMLContentHandler xhtml,
	Metadata metadata, ParseContext context,
	boolean isStrict) throws IOException, TikaException, SAXException {
	String rootOPF = getRoot(zipFile, context);
	if (rootOPF == null) {
	return false;
	}
	ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
	if (zae == null \|\| !zipFile.canReadEntryData(zae)) {
	return false;
	}
	meta.parse(zipFile.getInputStream(zae), new DefaultHandler(), metadata, context);

	ContentOrderScraper contentOrderScraper = new ContentOrderScraper();
	try (InputStream is = zipFile.getInputStream(zae)) {
	XMLReaderUtils.parseSAX(is,
	new OfflineContentHandler(contentOrderScraper), context);
	}
	//if no content items, false
	if (contentOrderScraper.contentItems.size() == 0) {
	return false;
	}
	String relativePath = "";
	if (rootOPF.lastIndexOf("/") > -1) {
	relativePath = rootOPF.substring(0, rootOPF.lastIndexOf("/") + 1);
	}

	if (isStrict) {
	int found = 0;
	for (String id : contentOrderScraper.contentItems) {
	HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
	if (hRefMediaPair != null && hRefMediaPair.href != null) {
	zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
	if (zae != null && zipFile.canReadEntryData(zae)) {
	found++;
	}
	}
	}
	//if not perfect match btwn items and readable items
	//return false
	if (found != contentOrderScraper.contentItems.size()) {
	return false;
	}
	}

	extractMetadata(zipFile, metadata, context);
	Set<String> processed = new HashSet<>();
	for (String id : contentOrderScraper.contentItems) {
	HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
	if (hRefMediaPair != null && hRefMediaPair.href != null) {
	//we need to test for xhtml/xml because the content parser
	//expects that.
	boolean shouldParse = false;
	String href = hRefMediaPair.href.toLowerCase(Locale.US);
	if (hRefMediaPair.media != null) {
	String mediaType = hRefMediaPair.media.toLowerCase(Locale.US);
	if (mediaType.contains("html")) {
	shouldParse = true;
	}
	} else if (href.endsWith("htm") \|\| href.endsWith("html") \|\| href.endsWith(".xml")) {
	shouldParse = true;
	}
	if (shouldParse) {
	zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
	if (zae != null) {
	try (InputStream is = zipFile.getInputStream(zae)) {
	content.parse(is, bodyHandler, metadata, context);
	processed.add(id);
	}
	}
	}
	}
	}

	//now handle embedded files
	EmbeddedDocumentExtractor embeddedDocumentExtractor =
	EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
	for (String id : contentOrderScraper.locationMap.keySet()) {
	if (! processed.contains(id)) {
	HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id);
	if (shouldHandleEmbedded(hRefMediaPair.media)) {
	handleEmbedded(zipFile, relativePath,
	hRefMediaPair, embeddedDocumentExtractor, xhtml, metadata);
	}
	}
	}
	return true;
	}

	private boolean shouldHandleEmbedded(String media) {
	if (media == null) {
	return true;
	}
	String lc = media.toLowerCase(Locale.US);
	if (lc.contains("css")) {
	return false;
	} else if (lc.contains("svg")) {
	return false;
	} else if (lc.endsWith("/xml")) {
	return false;
	} else if (lc.contains("x-ibooks")) {
	return false;
	} else if (lc.equals("application/x-dtbncx+xml")) {
	return false;
	}
	return true;
	}

	private void handleEmbedded(ZipFile zipFile, String relativePath,
	HRefMediaPair hRefMediaPair,
	EmbeddedDocumentExtractor embeddedDocumentExtractor,
	XHTMLContentHandler xhtml, Metadata parentMetadata) throws IOException, SAXException {
	if (hRefMediaPair.href == null) {
	return;
	}
	String fullPath = relativePath + hRefMediaPair.href;

	ZipArchiveEntry ze = zipFile.getEntry(fullPath);
	if (ze == null \|\| !zipFile.canReadEntryData(ze)) {
	return;
	}
	Metadata embeddedMetadata = new Metadata();
	if (!StringUtils.isBlank(hRefMediaPair.media)) {
	embeddedMetadata.set(Metadata.CONTENT_TYPE, hRefMediaPair.media);
	}
	if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
	return;
	}

	TikaInputStream stream = null;
	try {
	stream = TikaInputStream.get(zipFile.getInputStream(ze));
	} catch (IOException e) {
	//store this exception in the parent's metadata
	EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
	return;
	}

	xhtml.startElement("div", "class", "embedded");
	try {
	embeddedDocumentExtractor.parseEmbedded(
	stream,
	new EmbeddedContentHandler(xhtml),
	embeddedMetadata, false);

	} finally {
	IOUtils.closeQuietly(stream);
	}
	xhtml.endElement("div");
	}

	private void extractMetadata(ZipFile zipFile, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
	ZipArchiveEntry zae = zipFile.getEntry("mimetype");
	if (zae != null && zipFile.canReadEntryData(zae)) {
	try (InputStream is = zipFile.getInputStream(zae)) {
	updateMimeType(is, metadata);
	}
	}
	zae = zipFile.getEntry("metadata.xml");
	if (zae != null && zipFile.canReadEntryData(zae)) {
	try (InputStream is = zipFile.getInputStream(zae)) {
	meta.parse(is, new DefaultHandler(), metadata, context);
	}
	}
	}

	private String getRoot(ZipFile zipFile, ParseContext context) throws IOException, TikaException, SAXException {
	ZipArchiveEntry container = zipFile.getEntry("META-INF/container.xml");
	if (container != null) {
	RootFinder rootFinder = new RootFinder();
	try (InputStream is = zipFile.getInputStream(container)) {
	XMLReaderUtils.parseSAX(is, new OfflineContentHandler(rootFinder), context);
	}
	return rootFinder.root;
	} else {
	Enumeration<ZipArchiveEntry> entryEnum = zipFile.getEntries();
	while (entryEnum.hasMoreElements()) {
	ZipArchiveEntry ze = entryEnum.nextElement();
	if (ze.getName().toLowerCase(Locale.US).endsWith(".opf") &&
	zipFile.canReadEntryData(ze)) {
	return ze.getName();
	}
	}
	return null;
	}
	}

	private static class RootFinder extends DefaultHandler {
	String root = null;
	@Override
	public void startElement(
	String uri, String localName, String name, Attributes atts)
	throws SAXException {
	if ("rootfile".equalsIgnoreCase(localName)) {
	root = XMLReaderUtils.getAttrValue("full-path", atts);
	}
	}
	}

	private static class ContentOrderScraper extends DefaultHandler {

	Map<String, HRefMediaPair> locationMap = new HashMap<>();
	List<String> contentItems = new ArrayList<>();
	boolean inManifest = false;
	boolean inSpine = false;

	@Override
	public void startElement(
	String uri, String localName, String name, Attributes atts)
	throws SAXException {
	if ("manifest".equalsIgnoreCase(localName)) {
	inManifest = true;
	} else if ("spine".equalsIgnoreCase(localName)) {
	inSpine = true;
	}
	if (inManifest) {
	if ("item".equalsIgnoreCase(localName)) {
	String id = XMLReaderUtils.getAttrValue("id", atts);
	String href = XMLReaderUtils.getAttrValue("href", atts);
	String mime = XMLReaderUtils.getAttrValue("media-type", atts);
	if (id != null && href != null) {
	try {
	href = URLDecoder.decode(href, UTF_8.name());
	} catch (UnsupportedEncodingException e) {
	}
	locationMap.put(id, new HRefMediaPair(href, mime));
	}
	}
	}
	if (inSpine) {
	if ("itemRef".equalsIgnoreCase(localName)) {
	String id = XMLReaderUtils.getAttrValue("idref", atts);
	if (id != null) {
	contentItems.add(id);
	}
	}
	}
	}


	@Override
	public void endElement(
	String uri, String localName, String name)
	throws SAXException {
	if ("manifest".equalsIgnoreCase(localName)) {
	inManifest = false;
	} else if ("spine".equalsIgnoreCase(localName)) {
	inSpine = false;
	}
	}
	}
	private static class HRefMediaPair {
	private final String href;
	private final String media;

	HRefMediaPair(String href, String media) {
	this.href = href;
	this.media = media;
	}

	@Override
	public String toString() {
	return "HRefMediaPair{" +
	"href='" + href + '\'' +
	", media='" + media + '\'' +
	'}';
	}
	}
	}