| /* ==================================================================== |
| Licensed to the Apache Software Foundation (ASF) under one or more |
| contributor license agreements. See the NOTICE file distributed with |
| this work for additional information regarding copyright ownership. |
| The ASF licenses this file to You under the Apache License, Version 2.0 |
| (the "License"); you may not use this file except in compliance with |
| the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, software |
| distributed under the License is distributed on an "AS IS" BASIS, |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| See the License for the specific language governing permissions and |
| limitations under the License. |
| ==================================================================== */ |
| package org.apache.poi.extractor; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.File; |
| import java.io.FileNotFoundException; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| |
| import org.apache.poi.EncryptedDocumentException; |
| import org.apache.poi.POIOLE2TextExtractor; |
| import org.apache.poi.POITextExtractor; |
| import org.apache.poi.POIXMLTextExtractor; |
| import org.apache.poi.hsmf.MAPIMessage; |
| import org.apache.poi.hsmf.datatypes.AttachmentChunks; |
| import org.apache.poi.hsmf.extractor.OutlookTextExtactor; |
| import org.apache.poi.hssf.extractor.ExcelExtractor; |
| import org.apache.poi.hssf.record.crypto.Biff8EncryptionKey; |
| import org.apache.poi.hwpf.extractor.WordExtractor; |
| import org.apache.poi.openxml4j.exceptions.OpenXML4JException; |
| import org.apache.poi.openxml4j.opc.OPCPackage; |
| import org.apache.poi.openxml4j.opc.PackageAccess; |
| import org.apache.poi.openxml4j.opc.PackagePart; |
| import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; |
| import org.apache.poi.openxml4j.opc.PackageRelationshipTypes; |
| import org.apache.poi.poifs.crypt.Decryptor; |
| import org.apache.poi.poifs.crypt.EncryptionInfo; |
| import org.apache.poi.poifs.filesystem.DirectoryEntry; |
| import org.apache.poi.poifs.filesystem.DirectoryNode; |
| import org.apache.poi.poifs.filesystem.Entry; |
| import org.apache.poi.poifs.filesystem.FileMagic; |
| import org.apache.poi.poifs.filesystem.NPOIFSFileSystem; |
| import org.apache.poi.poifs.filesystem.NotOLE2FileException; |
| import org.apache.poi.poifs.filesystem.OPOIFSFileSystem; |
| import org.apache.poi.poifs.filesystem.OfficeXmlFileException; |
| import org.apache.poi.poifs.filesystem.POIFSFileSystem; |
| import org.apache.poi.util.IOUtils; |
| import org.apache.poi.util.NotImplemented; |
| import org.apache.poi.util.POILogFactory; |
| import org.apache.poi.util.POILogger; |
| import org.apache.poi.xdgf.extractor.XDGFVisioExtractor; |
| import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; |
| import org.apache.poi.xslf.usermodel.XSLFRelation; |
| import org.apache.poi.xslf.usermodel.XSLFSlideShow; |
| import org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor; |
| import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; |
| import org.apache.poi.xssf.extractor.XSSFExcelExtractor; |
| import org.apache.poi.xssf.usermodel.XSSFRelation; |
| import org.apache.poi.xwpf.extractor.XWPFWordExtractor; |
| import org.apache.poi.xwpf.usermodel.XWPFRelation; |
| import org.apache.xmlbeans.XmlException; |
| |
| /** |
| * Figures out the correct POITextExtractor for your supplied |
| * document, and returns it. |
| * |
| * <p>Note 1 - will fail for many file formats if the POI Scratchpad jar is |
| * not present on the runtime classpath</p> |
| * <p>Note 2 - rather than using this, for most cases you would be better |
| * off switching to <a href="http://tika.apache.org">Apache Tika</a> instead!</p> |
| */ |
| @SuppressWarnings("WeakerAccess") |
| public class ExtractorFactory { |
| private static final POILogger logger = POILogFactory.getLogger(ExtractorFactory.class); |
| |
| public static final String CORE_DOCUMENT_REL = PackageRelationshipTypes.CORE_DOCUMENT; |
| protected static final String VISIO_DOCUMENT_REL = PackageRelationshipTypes.VISIO_CORE_DOCUMENT; |
| protected static final String STRICT_DOCUMENT_REL = PackageRelationshipTypes.STRICT_CORE_DOCUMENT; |
| |
| /** |
| * Should this thread prefer event based over usermodel based extractors? |
| * (usermodel extractors tend to be more accurate, but use more memory) |
| * Default is false. |
| */ |
| public static boolean getThreadPrefersEventExtractors() { |
| return OLE2ExtractorFactory.getThreadPrefersEventExtractors(); |
| } |
| |
| /** |
| * Should all threads prefer event based over usermodel based extractors? |
| * (usermodel extractors tend to be more accurate, but use more memory) |
| * Default is to use the thread level setting, which defaults to false. |
| */ |
| public static Boolean getAllThreadsPreferEventExtractors() { |
| return OLE2ExtractorFactory.getAllThreadsPreferEventExtractors(); |
| } |
| |
| /** |
| * Should this thread prefer event based over usermodel based extractors? |
| * Will only be used if the All Threads setting is null. |
| */ |
| public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) { |
| OLE2ExtractorFactory.setThreadPrefersEventExtractors(preferEventExtractors); |
| } |
| |
| /** |
| * Should all threads prefer event based over usermodel based extractors? |
| * If set, will take preference over the Thread level setting. |
| */ |
| public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) { |
| OLE2ExtractorFactory.setAllThreadsPreferEventExtractors(preferEventExtractors); |
| } |
| |
| /** |
| * Should this thread use event based extractors is available? |
| * Checks the all-threads one first, then thread specific. |
| */ |
| protected static boolean getPreferEventExtractor() { |
| return OLE2ExtractorFactory.getPreferEventExtractor(); |
| } |
| |
| public static POITextExtractor createExtractor(File f) throws IOException, OpenXML4JException, XmlException { |
| NPOIFSFileSystem fs = null; |
| try { |
| fs = new NPOIFSFileSystem(f); |
| if (fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY)) { |
| return createEncyptedOOXMLExtractor(fs); |
| } |
| POIOLE2TextExtractor extractor = createExtractor(fs); |
| extractor.setFilesystem(fs); |
| return extractor; |
| |
| } catch (OfficeXmlFileException e) { |
| // ensure file-handle release |
| IOUtils.closeQuietly(fs); |
| return createExtractor(OPCPackage.open(f.toString(), PackageAccess.READ)); |
| |
| } catch (NotOLE2FileException ne) { |
| // ensure file-handle release |
| IOUtils.closeQuietly(fs); |
| throw new IllegalArgumentException("Your File was neither an OLE2 file, nor an OOXML file"); |
| |
| } catch (OpenXML4JException e) { |
| // ensure file-handle release |
| IOUtils.closeQuietly(fs); |
| throw e; |
| |
| } catch (XmlException e) { |
| // ensure file-handle release |
| IOUtils.closeQuietly(fs); |
| throw e; |
| |
| } catch (IOException e) { |
| // ensure file-handle release |
| IOUtils.closeQuietly(fs); |
| throw e; |
| |
| } catch (RuntimeException e) { |
| // ensure file-handle release |
| IOUtils.closeQuietly(fs); |
| throw e; |
| } catch (Error e) { |
| // ensure file-handle release |
| IOUtils.closeQuietly(fs); |
| throw e; |
| } |
| } |
| |
| public static POITextExtractor createExtractor(InputStream inp) throws IOException, OpenXML4JException, XmlException { |
| InputStream is = FileMagic.prepareToCheckMagic(inp); |
| |
| FileMagic fm = FileMagic.valueOf(is); |
| |
| switch (fm) { |
| case OLE2: |
| NPOIFSFileSystem fs = new NPOIFSFileSystem(is); |
| boolean isEncrypted = fs.getRoot().hasEntry(Decryptor.DEFAULT_POIFS_ENTRY); |
| return isEncrypted ? createEncyptedOOXMLExtractor(fs) : createExtractor(fs); |
| case OOXML: |
| return createExtractor(OPCPackage.open(is)); |
| default: |
| throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream"); |
| } |
| } |
| |
| /** |
| * Tries to determine the actual type of file and produces a matching text-extractor for it. |
| * |
| * @param pkg An {@link OPCPackage}. |
| * @return A {@link POIXMLTextExtractor} for the given file. |
| * @throws IOException If an error occurs while reading the file |
| * @throws OpenXML4JException If an error parsing the OpenXML file format is found. |
| * @throws XmlException If an XML parsing error occurs. |
| * @throws IllegalArgumentException If no matching file type could be found. |
| */ |
| public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException { |
| try { |
| // Check for the normal Office core document |
| PackageRelationshipCollection core; |
| core = pkg.getRelationshipsByType(CORE_DOCUMENT_REL); |
| |
| // If nothing was found, try some of the other OOXML-based core types |
| if (core.size() == 0) { |
| // Could it be an OOXML-Strict one? |
| core = pkg.getRelationshipsByType(STRICT_DOCUMENT_REL); |
| } |
| if (core.size() == 0) { |
| // Could it be a visio one? |
| core = pkg.getRelationshipsByType(VISIO_DOCUMENT_REL); |
| if (core.size() == 1) |
| return new XDGFVisioExtractor(pkg); |
| } |
| |
| // Should just be a single core document, complain if not |
| if (core.size() != 1) { |
| throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size()); |
| } |
| |
| // Grab the core document part, and try to identify from that |
| final PackagePart corePart = pkg.getPart(core.getRelationship(0)); |
| final String contentType = corePart.getContentType(); |
| |
| // Is it XSSF? |
| for (XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) { |
| if ( rel.getContentType().equals( contentType ) ) { |
| if (getPreferEventExtractor()) { |
| return new XSSFEventBasedExcelExtractor(pkg); |
| } |
| return new XSSFExcelExtractor(pkg); |
| } |
| } |
| |
| // Is it XWPF? |
| for (XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) { |
| if ( rel.getContentType().equals( contentType ) ) { |
| return new XWPFWordExtractor(pkg); |
| } |
| } |
| |
| // Is it XSLF? |
| for (XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) { |
| if ( rel.getContentType().equals( contentType ) ) { |
| return new XSLFPowerPointExtractor(pkg); |
| } |
| } |
| |
| // special handling for SlideShow-Theme-files, |
| if (XSLFRelation.THEME_MANAGER.getContentType().equals(contentType)) { |
| return new XSLFPowerPointExtractor(new XSLFSlideShow(pkg)); |
| } |
| |
| // How about xlsb? |
| for (XSSFRelation rel : XSSFBEventBasedExcelExtractor.SUPPORTED_TYPES) { |
| if (rel.getContentType().equals(contentType)) { |
| return new XSSFBEventBasedExcelExtractor(pkg); |
| } |
| } |
| |
| throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+contentType+")"); |
| |
| } catch (IOException e) { |
| // ensure that we close the package again if there is an error opening it, however |
| // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! |
| pkg.revert(); |
| throw e; |
| } catch (OpenXML4JException e) { |
| // ensure that we close the package again if there is an error opening it, however |
| // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! |
| pkg.revert(); |
| throw e; |
| } catch (XmlException e) { |
| // ensure that we close the package again if there is an error opening it, however |
| // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! |
| pkg.revert(); |
| throw e; |
| } catch (RuntimeException e) { |
| // ensure that we close the package again if there is an error opening it, however |
| // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! |
| pkg.revert(); |
| throw e; |
| } catch (Error e) { |
| // ensure that we close the package again if there is an error opening it, however |
| // we need to revert the package to not re-write the file via close(), which is very likely not wanted for a TextExtractor! |
| pkg.revert(); |
| throw e; |
| } |
| } |
| |
| public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { |
| return OLE2ExtractorFactory.createExtractor(fs); |
| } |
| public static POIOLE2TextExtractor createExtractor(NPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { |
| return OLE2ExtractorFactory.createExtractor(fs); |
| } |
| public static POIOLE2TextExtractor createExtractor(OPOIFSFileSystem fs) throws IOException, OpenXML4JException, XmlException { |
| return OLE2ExtractorFactory.createExtractor(fs); |
| } |
| |
| public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException, OpenXML4JException, XmlException |
| { |
| // First, check for OOXML |
| for (String entryName : poifsDir.getEntryNames()) { |
| if (entryName.equals("Package")) { |
| OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package")); |
| return createExtractor(pkg); |
| } |
| } |
| |
| // If not, ask the OLE2 code to check, with Scratchpad if possible |
| return OLE2ExtractorFactory.createExtractor(poifsDir); |
| } |
| |
| /** |
| * Returns an array of text extractors, one for each of |
| * the embedded documents in the file (if there are any). |
| * If there are no embedded documents, you'll get back an |
| * empty array. Otherwise, you'll get one open |
| * {@link POITextExtractor} for each embedded file. |
| */ |
| public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException { |
| // All the embedded directories we spotted |
| ArrayList<Entry> dirs = new ArrayList<Entry>(); |
| // For anything else not directly held in as a POIFS directory |
| ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>(); |
| |
| // Find all the embedded directories |
| DirectoryEntry root = ext.getRoot(); |
| if (root == null) { |
| throw new IllegalStateException("The extractor didn't know which POIFS it came from!"); |
| } |
| |
| if (ext instanceof ExcelExtractor) { |
| // These are in MBD... under the root |
| Iterator<Entry> it = root.getEntries(); |
| while (it.hasNext()) { |
| Entry entry = it.next(); |
| if (entry.getName().startsWith("MBD")) { |
| dirs.add(entry); |
| } |
| } |
| } else if (ext instanceof WordExtractor) { |
| // These are in ObjectPool -> _... under the root |
| try { |
| DirectoryEntry op = (DirectoryEntry) root.getEntry("ObjectPool"); |
| Iterator<Entry> it = op.getEntries(); |
| while (it.hasNext()) { |
| Entry entry = it.next(); |
| if (entry.getName().startsWith("_")) { |
| dirs.add(entry); |
| } |
| } |
| } catch (FileNotFoundException e) { |
| logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage()); |
| // ignored here |
| } |
| //} else if(ext instanceof PowerPointExtractor) { |
| // Tricky, not stored directly in poifs |
| // TODO |
| } else if (ext instanceof OutlookTextExtactor) { |
| // Stored in the Attachment blocks |
| MAPIMessage msg = ((OutlookTextExtactor)ext).getMAPIMessage(); |
| for (AttachmentChunks attachment : msg.getAttachmentFiles()) { |
| if (attachment.getAttachData() != null) { |
| byte[] data = attachment.getAttachData().getValue(); |
| nonPOIFS.add( new ByteArrayInputStream(data) ); |
| } else if (attachment.getAttachmentDirectory() != null) { |
| dirs.add(attachment.getAttachmentDirectory().getDirectory()); |
| } |
| } |
| } |
| |
| // Create the extractors |
| if (dirs.size() == 0 && nonPOIFS.size() == 0){ |
| return new POITextExtractor[0]; |
| } |
| |
| ArrayList<POITextExtractor> textExtractors = new ArrayList<POITextExtractor>(); |
| for (Entry dir : dirs) { |
| textExtractors.add(createExtractor((DirectoryNode) dir)); |
| } |
| for (InputStream nonPOIF : nonPOIFS) { |
| try { |
| textExtractors.add(createExtractor(nonPOIF)); |
| } catch (IllegalArgumentException e) { |
| // Ignore, just means it didn't contain |
| // a format we support as yet |
| logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage()); |
| } catch (XmlException e) { |
| throw new IOException(e.getMessage(), e); |
| } catch (OpenXML4JException e) { |
| throw new IOException(e.getMessage(), e); |
| } |
| } |
| return textExtractors.toArray(new POITextExtractor[textExtractors.size()]); |
| } |
| |
| /** |
| * Returns an array of text extractors, one for each of |
| * the embedded documents in the file (if there are any). |
| * If there are no embedded documents, you'll get back an |
| * empty array. Otherwise, you'll get one open |
| * {@link POITextExtractor} for each embedded file. |
| */ |
| @NotImplemented |
| @SuppressWarnings("UnusedParameters") |
| public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) { |
| throw new IllegalStateException("Not yet supported"); |
| } |
| |
| private static POIXMLTextExtractor createEncyptedOOXMLExtractor(NPOIFSFileSystem fs) |
| throws IOException { |
| String pass = Biff8EncryptionKey.getCurrentUserPassword(); |
| if (pass == null) { |
| pass = Decryptor.DEFAULT_PASSWORD; |
| } |
| |
| EncryptionInfo ei = new EncryptionInfo(fs); |
| Decryptor dec = ei.getDecryptor(); |
| InputStream is = null; |
| try { |
| if (!dec.verifyPassword(pass)) { |
| throw new EncryptedDocumentException("Invalid password specified - use Biff8EncryptionKey.setCurrentUserPassword() before calling extractor"); |
| } |
| is = dec.getDataStream(fs); |
| return createExtractor(OPCPackage.open(is)); |
| } catch (IOException e) { |
| throw e; |
| } catch (Exception e) { |
| throw new EncryptedDocumentException(e); |
| } finally { |
| IOUtils.closeQuietly(is); |
| } |
| } |
| } |