| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.microsoft; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.nio.charset.StandardCharsets; |
| import java.security.GeneralSecurityException; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.HashSet; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.commons.io.input.CloseShieldInputStream; |
| import org.apache.poi.hdgf.extractor.VisioTextExtractor; |
| import org.apache.poi.hpbf.extractor.PublisherTextExtractor; |
| import org.apache.poi.poifs.crypt.Decryptor; |
| import org.apache.poi.poifs.crypt.EncryptionInfo; |
| import org.apache.poi.poifs.filesystem.DirectoryEntry; |
| import org.apache.poi.poifs.filesystem.DirectoryNode; |
| import org.apache.poi.poifs.filesystem.Entry; |
| import org.apache.poi.poifs.filesystem.POIFSFileSystem; |
| import org.apache.poi.poifs.macros.VBAMacroReader; |
| import org.apache.poi.util.IOUtils; |
| import org.apache.poi.util.LocaleUtil; |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.detect.microsoft.POIFSContainerDetector; |
| import org.apache.tika.exception.EncryptedDocumentException; |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.extractor.EmbeddedDocumentExtractor; |
| import org.apache.tika.extractor.EmbeddedDocumentUtil; |
| import org.apache.tika.io.TikaInputStream; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.TikaCoreProperties; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.parser.PasswordProvider; |
| import org.apache.tika.parser.microsoft.ooxml.OOXMLParser; |
| import org.apache.tika.sax.BodyContentHandler; |
| import org.apache.tika.sax.EmbeddedContentHandler; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| |
| /** |
| * Defines a Microsoft document content extractor. |
| */ |
| public class OfficeParser extends AbstractOfficeParser { |
| |
| /** |
| * Serial version UID |
| */ |
| private static final long serialVersionUID = 7393462244028653479L; |
| |
| private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( |
| new HashSet<>(Arrays.asList(POIFSDocumentType.WORKBOOK.type, |
| POIFSDocumentType.OLE10_NATIVE.type, POIFSDocumentType.WORDDOCUMENT.type, |
| POIFSDocumentType.UNKNOWN.type, POIFSDocumentType.ENCRYPTED.type, |
| POIFSDocumentType.POWERPOINT.type, POIFSDocumentType.PUBLISHER.type, |
| POIFSDocumentType.PROJECT.type, POIFSDocumentType.VISIO.type, |
| // Works isn't supported |
| POIFSDocumentType.XLR.type, // but Works 7.0 Spreadsheet is |
| POIFSDocumentType.OUTLOOK.type, POIFSDocumentType.SOLIDWORKS_PART.type, |
| POIFSDocumentType.SOLIDWORKS_ASSEMBLY.type, |
| POIFSDocumentType.SOLIDWORKS_DRAWING.type))); |
| |
| /** |
| * Helper to extract macros from an NPOIFS/vbaProject.bin |
| * <p> |
| * As of POI-3.15-final, there are still some bugs in VBAMacroReader. |
| * For now, we are swallowing NPE and other runtime exceptions |
| * |
| * @param fs NPOIFS to extract from |
| * @param xhtml SAX writer |
| * @param embeddedDocumentExtractor extractor for embedded documents |
| * @throws IOException on IOException if it occurs during the extraction of the embedded doc |
| * @throws SAXException on SAXException for writing to xhtml |
| */ |
| public static void extractMacros(POIFSFileSystem fs, ContentHandler xhtml, |
| EmbeddedDocumentExtractor embeddedDocumentExtractor) |
| throws IOException, SAXException { |
| |
| VBAMacroReader reader = null; |
| Map<String, String> macros = null; |
| try { |
| reader = new VBAMacroReader(fs); |
| macros = reader.readMacros(); |
| } catch (SecurityException e) { |
| throw e; |
| } catch (Exception e) { |
| Metadata m = new Metadata(); |
| m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, |
| TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); |
| m.set(Metadata.CONTENT_TYPE, "text/x-vbasic"); |
| EmbeddedDocumentUtil.recordException(e, m); |
| if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { |
| embeddedDocumentExtractor.parseEmbedded( |
| //pass in space character so that we don't trigger a zero-byte exception |
| new ByteArrayInputStream(new byte[]{'\u0020'}), xhtml, m, true); |
| } |
| return; |
| } |
| for (Map.Entry<String, String> e : macros.entrySet()) { |
| Metadata m = new Metadata(); |
| m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, |
| TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); |
| m.set(Metadata.CONTENT_TYPE, "text/x-vbasic"); |
| if (embeddedDocumentExtractor.shouldParseEmbedded(m)) { |
| embeddedDocumentExtractor.parseEmbedded( |
| new ByteArrayInputStream(e.getValue().getBytes(StandardCharsets.UTF_8)), |
| xhtml, m, true); |
| } |
| } |
| } |
| |
| public Set<MediaType> getSupportedTypes(ParseContext context) { |
| return SUPPORTED_TYPES; |
| } |
| |
| /** |
| * Extracts properties and text from an MS Document input stream |
| */ |
| public void parse(InputStream stream, ContentHandler handler, Metadata metadata, |
| ParseContext context) throws IOException, SAXException, TikaException { |
| |
| configure(context); |
| XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); |
| xhtml.startDocument(); |
| |
| final DirectoryNode root; |
| TikaInputStream tstream = TikaInputStream.cast(stream); |
| POIFSFileSystem mustCloseFs = null; |
| try { |
| if (tstream == null) { |
| mustCloseFs = new POIFSFileSystem(new CloseShieldInputStream(stream)); |
| root = mustCloseFs.getRoot(); |
| } else { |
| final Object container = tstream.getOpenContainer(); |
| if (container instanceof POIFSFileSystem) { |
| root = ((POIFSFileSystem) container).getRoot(); |
| } else if (container instanceof DirectoryNode) { |
| root = (DirectoryNode) container; |
| } else { |
| POIFSFileSystem fs = null; |
| if (tstream.hasFile()) { |
| fs = new POIFSFileSystem(tstream.getFile(), true); |
| } else { |
| fs = new POIFSFileSystem(new CloseShieldInputStream(tstream)); |
| } |
| //tstream will close the fs, no need to close this below |
| tstream.setOpenContainer(fs); |
| root = fs.getRoot(); |
| |
| } |
| } |
| parse(root, context, metadata, xhtml); |
| OfficeParserConfig officeParserConfig = context.get(OfficeParserConfig.class); |
| |
| if (officeParserConfig.isExtractMacros()) { |
| //now try to get macros. |
| //Note that macros are handled separately for ppt in HSLFExtractor. |
| |
| //We might consider not bothering to check for macros in root, |
| //if we know we're processing ppt based on content-type identified in metadata |
| extractMacros(root.getFileSystem(), xhtml, |
| EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context)); |
| |
| } |
| } finally { |
| IOUtils.closeQuietly(mustCloseFs); |
| } |
| xhtml.endDocument(); |
| } |
| |
| protected void parse(DirectoryNode root, ParseContext context, Metadata metadata, |
| XHTMLContentHandler xhtml) |
| throws IOException, SAXException, TikaException { |
| |
| // Parse summary entries first, to make metadata available early |
| new SummaryExtractor(metadata).parseSummaries(root); |
| |
| // Parse remaining document entries |
| POIFSDocumentType type = POIFSDocumentType.detectType(root); |
| |
| if (type != POIFSDocumentType.UNKNOWN) { |
| setType(metadata, type.getType()); |
| } |
| |
| switch (type) { |
| case SOLIDWORKS_PART: |
| case SOLIDWORKS_ASSEMBLY: |
| case SOLIDWORKS_DRAWING: |
| break; |
| case PUBLISHER: |
| PublisherTextExtractor publisherTextExtractor = new PublisherTextExtractor(root); |
| xhtml.element("p", publisherTextExtractor.getText()); |
| break; |
| case WORDDOCUMENT: |
| new WordExtractor(context, metadata).parse(root, xhtml); |
| break; |
| case POWERPOINT: |
| new HSLFExtractor(context, metadata).parse(root, xhtml); |
| break; |
| case WORKBOOK: |
| case XLR: |
| Locale locale = context.get(Locale.class, LocaleUtil.getUserLocale()); |
| new ExcelExtractor(context, metadata).parse(root, xhtml, locale); |
| break; |
| case PROJECT: |
| // We currently can't do anything beyond the metadata |
| break; |
| case VISIO: |
| VisioTextExtractor visioTextExtractor = new VisioTextExtractor(root); |
| for (String text : visioTextExtractor.getAllText()) { |
| xhtml.element("p", text); |
| } |
| break; |
| case OUTLOOK: |
| OutlookExtractor extractor = new OutlookExtractor(root, context); |
| |
| extractor.parse(xhtml, metadata); |
| break; |
| case ENCRYPTED: |
| EncryptionInfo info = new EncryptionInfo(root); |
| Decryptor d = Decryptor.getInstance(info); |
| |
| try { |
| // By default, use the default Office Password |
| String password = Decryptor.DEFAULT_PASSWORD; |
| |
| // If they supplied a Password Provider, ask that for the password, |
| // and use the provider given one if available (stick with default if not) |
| PasswordProvider passwordProvider = context.get(PasswordProvider.class); |
| if (passwordProvider != null) { |
| String suppliedPassword = passwordProvider.getPassword(metadata); |
| if (suppliedPassword != null) { |
| password = suppliedPassword; |
| } |
| } |
| |
| // Check if we've the right password or not |
| if (!d.verifyPassword(password)) { |
| throw new EncryptedDocumentException(); |
| } |
| |
| // Decrypt the OLE2 stream, and delegate the resulting OOXML |
| // file to the regular OOXML parser for normal handling |
| OOXMLParser parser = new OOXMLParser(); |
| try (TikaInputStream tis = TikaInputStream.get(d.getDataStream(root))) { |
| parser.parse(tis, new EmbeddedContentHandler(new BodyContentHandler(xhtml)), |
| metadata, context); |
| } |
| } catch (GeneralSecurityException ex) { |
| throw new EncryptedDocumentException(ex); |
| } |
| default: |
| // For unsupported / unhandled types, just the metadata |
| // is extracted, which happened above |
| break; |
| } |
| } |
| |
| private void setType(Metadata metadata, MediaType type) { |
| metadata.set(Metadata.CONTENT_TYPE, type.toString()); |
| } |
| |
| public enum POIFSDocumentType { |
| WORKBOOK("xls", MediaType.application("vnd.ms-excel")), |
| OLE10_NATIVE("ole", POIFSContainerDetector.OLE10_NATIVE), |
| COMP_OBJ("ole", POIFSContainerDetector.COMP_OBJ), |
| WORDDOCUMENT("doc", MediaType.application("msword")), |
| UNKNOWN("unknown", MediaType.application("x-tika-msoffice")), |
| ENCRYPTED("ole", MediaType.application("x-tika-ooxml-protected")), |
| POWERPOINT("ppt", MediaType.application("vnd.ms-powerpoint")), |
| PUBLISHER("pub", MediaType.application("x-mspublisher")), |
| PROJECT("mpp", MediaType.application("vnd.ms-project")), |
| VISIO("vsd", MediaType.application("vnd.visio")), |
| WORKS("wps", MediaType.application("vnd.ms-works")), |
| XLR("xlr", MediaType.application("x-tika-msworks-spreadsheet")), |
| OUTLOOK("msg", MediaType.application("vnd.ms-outlook")), |
| SOLIDWORKS_PART("sldprt", MediaType.application("sldworks")), |
| SOLIDWORKS_ASSEMBLY("sldasm", MediaType.application("sldworks")), |
| SOLIDWORKS_DRAWING("slddrw", MediaType.application("sldworks")), |
| GRAPH("", MediaType.application("vnd.ms-graph")); |
| |
| private final String extension; |
| private final MediaType type; |
| |
| POIFSDocumentType(String extension, MediaType type) { |
| this.extension = extension; |
| this.type = type; |
| } |
| |
| public static POIFSDocumentType detectType(POIFSFileSystem fs) { |
| return detectType(fs.getRoot()); |
| } |
| |
| public static POIFSDocumentType detectType(DirectoryEntry node) { |
| Set<String> names = new HashSet<>(); |
| for (Entry entry : node) { |
| names.add(entry.getName()); |
| } |
| MediaType type = POIFSContainerDetector.detect(names, node); |
| for (POIFSDocumentType poifsType : values()) { |
| if (type.equals(poifsType.type)) { |
| return poifsType; |
| } |
| } |
| return UNKNOWN; |
| } |
| |
| public String getExtension() { |
| return extension; |
| } |
| |
| public MediaType getType() { |
| return type; |
| } |
| } |
| |
| } |