/* ====================================================================
   Licensed to the Apache Software Foundation (ASF) under one or more
   contributor license agreements.  See the NOTICE file distributed with
   this work for additional information regarding copyright ownership.
   The ASF licenses this file to You under the Apache License, Version 2.0
   (the "License"); you may not use this file except in compliance with
   the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
==================================================================== */
| |
| package org.apache.poi.ss.extractor; |
| |
| import static org.apache.poi.util.StringUtil.endsWithIgnoreCase; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.Iterator; |
| import java.util.List; |
| |
| import org.apache.logging.log4j.LogManager; |
| import org.apache.logging.log4j.Logger; |
| import org.apache.poi.hpsf.ClassID; |
| import org.apache.poi.hpsf.ClassIDPredefined; |
| import org.apache.poi.poifs.filesystem.DirectoryNode; |
| import org.apache.poi.poifs.filesystem.DocumentInputStream; |
| import org.apache.poi.poifs.filesystem.Entry; |
| import org.apache.poi.poifs.filesystem.Ole10Native; |
| import org.apache.poi.poifs.filesystem.Ole10NativeException; |
| import org.apache.poi.poifs.filesystem.POIFSFileSystem; |
| import org.apache.poi.ss.usermodel.Drawing; |
| import org.apache.poi.ss.usermodel.ObjectData; |
| import org.apache.poi.ss.usermodel.Picture; |
| import org.apache.poi.ss.usermodel.PictureData; |
| import org.apache.poi.ss.usermodel.Shape; |
| import org.apache.poi.ss.usermodel.ShapeContainer; |
| import org.apache.poi.ss.usermodel.Sheet; |
| import org.apache.poi.ss.usermodel.Workbook; |
| import org.apache.poi.util.Beta; |
| import org.apache.poi.util.IOUtils; |
| import org.apache.poi.util.LocaleUtil; |
| |
| /** |
| * This extractor class tries to identify various embedded documents within Excel files |
| * and provide them via a common interface, i.e. the EmbeddedData instances |
| */ |
| @Beta |
| public class EmbeddedExtractor implements Iterable<EmbeddedExtractor> { |
| private static final Logger LOG = LogManager.getLogger(EmbeddedExtractor.class); |
| //arbitrarily selected; may need to increase |
| private static final int MAX_RECORD_LENGTH = 1_000_000; |
| |
| // contentType |
| private static final String CONTENT_TYPE_BYTES = "binary/octet-stream"; |
| private static final String CONTENT_TYPE_PDF = "application/pdf"; |
| private static final String CONTENT_TYPE_DOC = "application/msword"; |
| private static final String CONTENT_TYPE_XLS = "application/vnd.ms-excel"; |
| |
| /** |
| * @return the list of known extractors, if you provide custom extractors, override this method |
| */ |
| @Override |
| public Iterator<EmbeddedExtractor> iterator() { |
| EmbeddedExtractor[] ee = { |
| new Ole10Extractor(), new PdfExtractor(), new BiffExtractor(), new OOXMLExtractor(), new FsExtractor() |
| }; |
| return Arrays.asList(ee).iterator(); |
| } |
| |
| public EmbeddedData extractOne(DirectoryNode src) throws IOException { |
| for (EmbeddedExtractor ee : this) { |
| if (ee.canExtract(src)) { |
| return ee.extract(src); |
| } |
| } |
| return null; |
| } |
| |
| public EmbeddedData extractOne(Picture src) throws IOException { |
| for (EmbeddedExtractor ee : this) { |
| if (ee.canExtract(src)) { |
| return ee.extract(src); |
| } |
| } |
| return null; |
| } |
| |
| public List<EmbeddedData> extractAll(Sheet sheet) throws IOException { |
| Drawing<?> patriarch = sheet.getDrawingPatriarch(); |
| if (null == patriarch){ |
| return Collections.emptyList(); |
| } |
| List<EmbeddedData> embeddings = new ArrayList<>(); |
| extractAll(patriarch, embeddings); |
| return embeddings; |
| } |
| |
| protected void extractAll(ShapeContainer<?> parent, List<EmbeddedData> embeddings) throws IOException { |
| for (Shape shape : parent) { |
| EmbeddedData data = null; |
| if (shape instanceof ObjectData) { |
| ObjectData od = (ObjectData)shape; |
| try { |
| if (od.hasDirectoryEntry()) { |
| data = extractOne((DirectoryNode)od.getDirectory()); |
| } else { |
| data = new EmbeddedData(od.getFileName(), od.getObjectData(), od.getContentType()); |
| } |
| } catch (Exception e) { |
| LOG.atWarn().withThrowable(e).log("Entry not found / readable - ignoring OLE embedding"); |
| } |
| } else if (shape instanceof Picture) { |
| data = extractOne((Picture)shape); |
| } else if (shape instanceof ShapeContainer) { |
| extractAll((ShapeContainer<?>)shape, embeddings); |
| } |
| |
| if (data == null) { |
| continue; |
| } |
| |
| data.setShape(shape); |
| String filename = data.getFilename(); |
| String extension = (filename == null || filename.lastIndexOf('.') == -1) ? ".bin" : filename.substring(filename.lastIndexOf('.')); |
| |
| // try to find an alternative name |
| if (filename == null || filename.isEmpty() || filename.startsWith("MBD") || filename.startsWith("Root Entry")) { |
| filename = shape.getShapeName(); |
| if (filename != null) { |
| filename += extension; |
| } |
| } |
| // default to dummy name |
| if (filename == null || filename.isEmpty()) { |
| filename = "picture_" + embeddings.size() + extension; |
| } |
| filename = filename.trim(); |
| data.setFilename(filename); |
| |
| embeddings.add(data); |
| } |
| } |
| |
| |
| public boolean canExtract(DirectoryNode source) { |
| return false; |
| } |
| |
| public boolean canExtract(Picture source) { |
| return false; |
| } |
| |
| protected EmbeddedData extract(DirectoryNode dn) throws IOException { |
| assert(canExtract(dn)); |
| ByteArrayOutputStream bos = new ByteArrayOutputStream(20000); |
| try (POIFSFileSystem dest = new POIFSFileSystem()) { |
| copyNodes(dn, dest.getRoot()); |
| // start with a reasonable big size |
| dest.writeFilesystem(bos); |
| } |
| |
| return new EmbeddedData(dn.getName(), bos.toByteArray(), CONTENT_TYPE_BYTES); |
| } |
| |
| protected EmbeddedData extract(Picture source) throws IOException { |
| return null; |
| } |
| |
| public static class Ole10Extractor extends EmbeddedExtractor { |
| @Override |
| public boolean canExtract(DirectoryNode dn) { |
| ClassID clsId = dn.getStorageClsid(); |
| return ClassIDPredefined.lookup(clsId) == ClassIDPredefined.OLE_V1_PACKAGE; |
| } |
| |
| @Override |
| public EmbeddedData extract(DirectoryNode dn) throws IOException { |
| try { |
| // TODO: inspect the CompObj record for more details, i.e. the content type |
| Ole10Native ole10 = Ole10Native.createFromEmbeddedOleObject(dn); |
| return new EmbeddedData(ole10.getFileName(), ole10.getDataBuffer(), CONTENT_TYPE_BYTES); |
| } catch (Ole10NativeException e) { |
| throw new IOException(e); |
| } |
| } |
| } |
| |
| static class PdfExtractor extends EmbeddedExtractor { |
| @Override |
| public boolean canExtract(DirectoryNode dn) { |
| ClassID clsId = dn.getStorageClsid(); |
| return (ClassIDPredefined.PDF.equals(clsId) || dn.hasEntry("CONTENTS")); |
| } |
| |
| @Override |
| public EmbeddedData extract(DirectoryNode dn) throws IOException { |
| try(ByteArrayOutputStream bos = new ByteArrayOutputStream(); |
| InputStream is = dn.createDocumentInputStream("CONTENTS")) { |
| IOUtils.copy(is, bos); |
| return new EmbeddedData(dn.getName() + ".pdf", bos.toByteArray(), CONTENT_TYPE_PDF); |
| } |
| } |
| |
| @Override |
| public boolean canExtract(Picture source) { |
| PictureData pd = source.getPictureData(); |
| return (pd != null && pd.getPictureType() == Workbook.PICTURE_TYPE_EMF); |
| } |
| |
| /** |
| * Mac Office encodes embedded objects inside the picture, e.g. PDF is part of an EMF. |
| * If an embedded stream is inside an EMF picture, this method extracts the payload. |
| * |
| * @return the embedded data in an EMF picture or null if none is found |
| */ |
| @Override |
| protected EmbeddedData extract(Picture source) throws IOException { |
| // check for emf+ embedded pdf (poor mans style :( ) |
| // Mac Excel 2011 embeds pdf files with this method. |
| PictureData pd = source.getPictureData(); |
| if (pd == null || pd.getPictureType() != Workbook.PICTURE_TYPE_EMF) { |
| return null; |
| } |
| |
| // TODO: investigate if this is just an EMF-hack or if other formats are also embedded in EMF |
| byte[] pictureBytes = pd.getData(); |
| int idxStart = indexOf(pictureBytes, 0, "%PDF-".getBytes(LocaleUtil.CHARSET_1252)); |
| if (idxStart == -1) { |
| return null; |
| } |
| |
| int idxEnd = indexOf(pictureBytes, idxStart, "%%EOF".getBytes(LocaleUtil.CHARSET_1252)); |
| if (idxEnd == -1) { |
| return null; |
| } |
| |
| int pictureBytesLen = idxEnd-idxStart+6; |
| byte[] pdfBytes = IOUtils.safelyClone(pictureBytes, idxStart, pictureBytesLen, MAX_RECORD_LENGTH); |
| String filename = source.getShapeName().trim(); |
| if (!endsWithIgnoreCase(filename, ".pdf")) { |
| filename += ".pdf"; |
| } |
| return new EmbeddedData(filename, pdfBytes, CONTENT_TYPE_PDF); |
| } |
| |
| |
| } |
| |
| static class OOXMLExtractor extends EmbeddedExtractor { |
| @Override |
| public boolean canExtract(DirectoryNode dn) { |
| return dn.hasEntry("package"); |
| } |
| |
| @Override |
| public EmbeddedData extract(DirectoryNode dn) throws IOException { |
| |
| ClassIDPredefined clsId = ClassIDPredefined.lookup(dn.getStorageClsid()); |
| |
| String contentType = null; |
| String ext = null; |
| |
| if (clsId != null) { |
| contentType = clsId.getContentType(); |
| ext = clsId.getFileExtension(); |
| } |
| |
| if (contentType == null || ext == null) { |
| contentType = "application/zip"; |
| ext = ".zip"; |
| } |
| |
| DocumentInputStream dis = dn.createDocumentInputStream("package"); |
| byte[] data = IOUtils.toByteArray(dis); |
| dis.close(); |
| |
| return new EmbeddedData(dn.getName()+ext, data, contentType); |
| } |
| } |
| |
| static class BiffExtractor extends EmbeddedExtractor { |
| @Override |
| public boolean canExtract(DirectoryNode dn) { |
| return canExtractExcel(dn) || canExtractWord(dn); |
| } |
| |
| protected boolean canExtractExcel(DirectoryNode dn) { |
| ClassIDPredefined clsId = ClassIDPredefined.lookup(dn.getStorageClsid()); |
| return (ClassIDPredefined.EXCEL_V7 == clsId |
| || ClassIDPredefined.EXCEL_V8 == clsId |
| || dn.hasEntry("Workbook") /*...*/); |
| } |
| |
| protected boolean canExtractWord(DirectoryNode dn) { |
| ClassIDPredefined clsId = ClassIDPredefined.lookup(dn.getStorageClsid()); |
| return (ClassIDPredefined.WORD_V7 == clsId |
| || ClassIDPredefined.WORD_V8 == clsId |
| || dn.hasEntry("WordDocument")); |
| } |
| |
| @Override |
| public EmbeddedData extract(DirectoryNode dn) throws IOException { |
| EmbeddedData ed = super.extract(dn); |
| if (canExtractExcel(dn)) { |
| ed.setFilename(dn.getName() + ".xls"); |
| ed.setContentType(CONTENT_TYPE_XLS); |
| } else if (canExtractWord(dn)) { |
| ed.setFilename(dn.getName() + ".doc"); |
| ed.setContentType(CONTENT_TYPE_DOC); |
| } |
| |
| return ed; |
| } |
| } |
| |
| static class FsExtractor extends EmbeddedExtractor { |
| @Override |
| public boolean canExtract(DirectoryNode dn) { |
| return true; |
| } |
| @Override |
| public EmbeddedData extract(DirectoryNode dn) throws IOException { |
| EmbeddedData ed = super.extract(dn); |
| ed.setFilename(dn.getName() + ".ole"); |
| // TODO: read the content type from CombObj stream |
| return ed; |
| } |
| } |
| |
| protected static void copyNodes(DirectoryNode src, DirectoryNode dest) throws IOException { |
| for (Entry e : src) { |
| if (e instanceof DirectoryNode) { |
| DirectoryNode srcDir = (DirectoryNode)e; |
| DirectoryNode destDir = (DirectoryNode)dest.createDirectory(srcDir.getName()); |
| destDir.setStorageClsid(srcDir.getStorageClsid()); |
| copyNodes(srcDir, destDir); |
| } else { |
| try (InputStream is = src.createDocumentInputStream(e)) { |
| dest.createDocument(e.getName(), is); |
| } |
| } |
| } |
| } |
| |
| |
| |
| /** |
| * Knuth-Morris-Pratt Algorithm for Pattern Matching |
| * Finds the first occurrence of the pattern in the text. |
| */ |
| private static int indexOf(byte[] data, int offset, byte[] pattern) { |
| int[] failure = computeFailure(pattern); |
| |
| int j = 0; |
| if (data.length == 0) { |
| return -1; |
| } |
| |
| for (int i = offset; i < data.length; i++) { |
| while (j > 0 && pattern[j] != data[i]) { |
| j = failure[j - 1]; |
| } |
| if (pattern[j] == data[i]) { j++; } |
| if (j == pattern.length) { |
| return i - pattern.length + 1; |
| } |
| } |
| return -1; |
| } |
| |
| /** |
| * Computes the failure function using a boot-strapping process, |
| * where the pattern is matched against itself. |
| */ |
| private static int[] computeFailure(byte[] pattern) { |
| int[] failure = new int[pattern.length]; |
| |
| int j = 0; |
| for (int i = 1; i < pattern.length; i++) { |
| while (j > 0 && pattern[j] != pattern[i]) { |
| j = failure[j - 1]; |
| } |
| if (pattern[j] == pattern[i]) { |
| j++; |
| } |
| failure[i] = j; |
| } |
| |
| return failure; |
| } |
| |
| |
| } |