| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.pdf; |
| |
| import java.awt.geom.Point2D; |
| import java.awt.image.BufferedImage; |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.OutputStream; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.concurrent.atomic.AtomicInteger; |
| |
| import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine; |
| import org.apache.pdfbox.cos.COSName; |
| import org.apache.pdfbox.cos.COSStream; |
| import org.apache.pdfbox.filter.MissingImageReaderException; |
| import org.apache.pdfbox.io.IOUtils; |
| import org.apache.pdfbox.pdmodel.PDPage; |
| import org.apache.pdfbox.pdmodel.PDResources; |
| import org.apache.pdfbox.pdmodel.font.PDFont; |
| import org.apache.pdfbox.pdmodel.graphics.color.PDColor; |
| import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray; |
| import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB; |
| import org.apache.pdfbox.pdmodel.graphics.color.PDPattern; |
| import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup; |
| import org.apache.pdfbox.pdmodel.graphics.image.PDImage; |
| import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject; |
| import org.apache.pdfbox.pdmodel.graphics.pattern.PDAbstractPattern; |
| import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern; |
| import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState; |
| import org.apache.pdfbox.pdmodel.graphics.state.PDSoftMask; |
| import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode; |
| import org.apache.pdfbox.tools.imageio.ImageIOUtil; |
| import org.apache.pdfbox.util.Matrix; |
| import org.apache.pdfbox.util.Vector; |
| import org.xml.sax.SAXException; |
| import org.xml.sax.helpers.AttributesImpl; |
| |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.exception.TikaMemoryLimitException; |
| import org.apache.tika.exception.ZeroByteFileException; |
| import org.apache.tika.extractor.EmbeddedDocumentExtractor; |
| import org.apache.tika.extractor.EmbeddedDocumentUtil; |
| import org.apache.tika.io.BoundedInputStream; |
| import org.apache.tika.io.TikaInputStream; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.TikaCoreProperties; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.sax.EmbeddedContentHandler; |
| import org.apache.tika.sax.XHTMLContentHandler; |
| |
| /** |
| * Copied nearly verbatim from PDFBox |
| */ |
| class ImageGraphicsEngine extends PDFGraphicsStreamEngine { |
| |
| //We're currently copying images to byte[]. We should |
| //limit the length to avoid OOM on crafted files. |
| private static final long MAX_IMAGE_LENGTH_BYTES = 100 * 1024 * 1024; |
| |
| private static final List<String> JPEG = |
| Arrays.asList(COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName()); |
| |
| |
| private static final List<String> JP2 = Collections.singletonList(COSName.JPX_DECODE.getName()); |
| |
| private static final List<String> JB2 = Collections.singletonList(COSName.JBIG2_DECODE.getName()); |
| final List<IOException> exceptions = new ArrayList<>(); |
| private final EmbeddedDocumentExtractor embeddedDocumentExtractor; |
| private final PDFParserConfig pdfParserConfig; |
| private final Map<COSStream, Integer> processedInlineImages; |
| private final AtomicInteger imageCounter; |
| private final Metadata parentMetadata; |
| private final XHTMLContentHandler xhtml; |
| private final ParseContext parseContext; |
| private final boolean extractInlineImageMetadataOnly; |
| //TODO: parameterize this ? |
| private boolean useDirectJPEG = false; |
| |
| //TODO: this is an embarrassment of an initializer...fix |
| protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor, |
| PDFParserConfig pdfParserConfig, |
| Map<COSStream, Integer> processedInlineImages, |
| AtomicInteger imageCounter, XHTMLContentHandler xhtml, |
| Metadata parentMetadata, ParseContext parseContext) { |
| super(page); |
| this.embeddedDocumentExtractor = embeddedDocumentExtractor; |
| this.pdfParserConfig = pdfParserConfig; |
| this.processedInlineImages = processedInlineImages; |
| this.imageCounter = imageCounter; |
| this.xhtml = xhtml; |
| this.parentMetadata = parentMetadata; |
| this.parseContext = parseContext; |
| this.extractInlineImageMetadataOnly = pdfParserConfig.isExtractInlineImageMetadataOnly(); |
| } |
| |
| //nearly directly copied from PDFBox ExtractImages |
| private static void writeToBuffer(PDImage pdImage, String suffix, boolean directJPEG, |
| OutputStream out) throws IOException, TikaException { |
| |
| if ("jpg".equals(suffix)) { |
| |
| String colorSpaceName = pdImage.getColorSpace().getName(); |
| if (directJPEG || (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) || |
| PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName))) { |
| // RGB or Gray colorspace: get and write the unmodified JPEG stream |
| InputStream data = pdImage.createInputStream(JPEG); |
| try { |
| copyUpToMaxLength(data, out); |
| } finally { |
| IOUtils.closeQuietly(data); |
| } |
| } else { |
| BufferedImage image = pdImage.getImage(); |
| if (image != null) { |
| // for CMYK and other "unusual" colorspaces, the JPEG will be converted |
| ImageIOUtil.writeImage(image, suffix, out); |
| } |
| } |
| } else if ("jp2".equals(suffix)) { |
| String colorSpaceName = pdImage.getColorSpace().getName(); |
| if (directJPEG || !hasMasks(pdImage) && |
| (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) || |
| PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName))) { |
| // RGB or Gray colorspace: get and write the unmodified JPEG2000 stream |
| InputStream data = pdImage.createInputStream(JP2); |
| try { |
| copyUpToMaxLength(data, out); |
| } finally { |
| IOUtils.closeQuietly(data); |
| } |
| } else { |
| // for CMYK and other "unusual" colorspaces, the image will be converted |
| BufferedImage image = pdImage.getImage(); |
| if (image != null) { |
| // for CMYK and other "unusual" colorspaces, the JPEG will be converted |
| ImageIOUtil.writeImage(image, "jpeg2000", out); |
| } |
| } |
| } else if ("tif".equals(suffix) && pdImage.getColorSpace().equals(PDDeviceGray.INSTANCE)) { |
| BufferedImage image = pdImage.getImage(); |
| if (image == null) { |
| return; |
| } |
| // CCITT compressed images can have a different colorspace, but this one is B/W |
| // This is a bitonal image, so copy to TYPE_BYTE_BINARY |
| // so that a G4 compressed TIFF image is created by ImageIOUtil.writeImage() |
| int w = image.getWidth(); |
| int h = image.getHeight(); |
| BufferedImage bitonalImage = new BufferedImage(w, h, BufferedImage.TYPE_BYTE_BINARY); |
| // copy image the old fashioned way - ColorConvertOp is slower! |
| for (int y = 0; y < h; y++) { |
| for (int x = 0; x < w; x++) { |
| bitonalImage.setRGB(x, y, image.getRGB(x, y)); |
| } |
| } |
| ImageIOUtil.writeImage(bitonalImage, suffix, out); |
| } else if ("jb2".equals(suffix)) { |
| InputStream data = pdImage.createInputStream(JB2); |
| try { |
| copyUpToMaxLength(data, out); |
| } finally { |
| IOUtils.closeQuietly(data); |
| } |
| } else { |
| BufferedImage image = pdImage.getImage(); |
| if (image == null) { |
| return; |
| } |
| ImageIOUtil.writeImage(image, suffix, out); |
| } |
| |
| out.flush(); |
| } |
| |
| private static void copyUpToMaxLength(InputStream is, OutputStream os) |
| throws IOException, TikaException { |
| BoundedInputStream bis = new BoundedInputStream(MAX_IMAGE_LENGTH_BYTES, is); |
| IOUtils.copy(bis, os); |
| if (bis.hasHitBound()) { |
| throw new TikaMemoryLimitException( |
| "Image size is larger than allowed (" + MAX_IMAGE_LENGTH_BYTES + ")"); |
| } |
| |
| } |
| |
| private static boolean hasMasks(PDImage pdImage) throws IOException { |
| if (pdImage instanceof PDImageXObject) { |
| PDImageXObject ximg = (PDImageXObject) pdImage; |
| return ximg.getMask() != null || ximg.getSoftMask() != null; |
| } |
| return false; |
| } |
| |
| void run() throws IOException { |
| PDPage page = getPage(); |
| |
| //TODO: is there a better way to do this rather than reprocessing the page |
| //can we process the text and images in one go? |
| processPage(page); |
| PDResources res = page.getResources(); |
| if (res == null) { |
| return; |
| } |
| |
| for (COSName name : res.getExtGStateNames()) { |
| PDExtendedGraphicsState extendedGraphicsState = res.getExtGState(name); |
| if (extendedGraphicsState != null) { |
| PDSoftMask softMask = extendedGraphicsState.getSoftMask(); |
| |
| if (softMask != null) { |
| try { |
| PDTransparencyGroup group = softMask.getGroup(); |
| |
| if (group != null) { |
| // PDFBOX-4327: without this line NPEs will occur |
| res.getExtGState(name).copyIntoGraphicsState(getGraphicsState()); |
| |
| processSoftMask(group); |
| } |
| } catch (IOException e) { |
| handleCatchableIOE(e); |
| } |
| } |
| } |
| } |
| } |
| |
| @Override |
| public void drawImage(PDImage pdImage) throws IOException { |
| int imageNumber = 0; |
| if (pdImage instanceof PDImageXObject) { |
| if (pdImage.isStencil()) { |
| processColor(getGraphicsState().getNonStrokingColor()); |
| } |
| |
| PDImageXObject xobject = (PDImageXObject) pdImage; |
| Integer cachedNumber = processedInlineImages.get(xobject.getCOSObject()); |
| if (cachedNumber != null && pdfParserConfig.isExtractUniqueInlineImagesOnly()) { |
| // skip duplicate image |
| return; |
| } |
| if (cachedNumber == null) { |
| imageNumber = imageCounter.getAndIncrement(); |
| processedInlineImages.put(xobject.getCOSObject(), imageNumber); |
| } |
| } else { |
| imageNumber = imageCounter.getAndIncrement(); |
| } |
| //TODO: should we use the hash of the PDImage to check for seen |
| //For now, we're relying on the cosobject, but this could lead to |
| //duplicates if the pdImage is not a PDImageXObject? |
| try { |
| processImage(pdImage, imageNumber); |
| } catch (TikaException | SAXException e) { |
| throw new IOException(e); |
| } catch (IOException e) { |
| handleCatchableIOE(e); |
| } |
| } |
| |
| @Override |
| public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException { |
| |
| } |
| |
| @Override |
| public void clip(int windingRule) throws IOException { |
| |
| } |
| |
| @Override |
| public void moveTo(float x, float y) throws IOException { |
| |
| } |
| |
| @Override |
| public void lineTo(float x, float y) throws IOException { |
| |
| } |
| |
| @Override |
| public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) |
| throws IOException { |
| |
| } |
| |
| @Override |
| public Point2D getCurrentPoint() throws IOException { |
| return new Point2D.Float(0, 0); |
| } |
| |
| @Override |
| public void closePath() throws IOException { |
| |
| } |
| |
| @Override |
| public void endPath() throws IOException { |
| |
| } |
| |
| @Override |
| protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode, |
| Vector displacement) throws IOException { |
| |
| RenderingMode renderingMode = getGraphicsState().getTextState().getRenderingMode(); |
| if (renderingMode.isFill()) { |
| processColor(getGraphicsState().getNonStrokingColor()); |
| } |
| |
| if (renderingMode.isStroke()) { |
| processColor(getGraphicsState().getStrokingColor()); |
| } |
| } |
| |
| @Override |
| public void strokePath() throws IOException { |
| processColor(getGraphicsState().getStrokingColor()); |
| } |
| |
| @Override |
| public void fillPath(int windingRule) throws IOException { |
| processColor(getGraphicsState().getNonStrokingColor()); |
| } |
| |
| @Override |
| public void fillAndStrokePath(int windingRule) throws IOException { |
| processColor(getGraphicsState().getNonStrokingColor()); |
| } |
| |
| @Override |
| public void shadingFill(COSName shadingName) throws IOException { |
| |
| } |
| |
| // find out if it is a tiling pattern, then process that one |
| private void processColor(PDColor color) throws IOException { |
| if (color.getColorSpace() instanceof PDPattern) { |
| PDPattern pattern = (PDPattern) color.getColorSpace(); |
| PDAbstractPattern abstractPattern = pattern.getPattern(color); |
| |
| if (abstractPattern instanceof PDTilingPattern) { |
| processTilingPattern((PDTilingPattern) abstractPattern, null, null); |
| } |
| } |
| } |
| |
| private void processImage(PDImage pdImage, int imageNumber) |
| throws IOException, TikaException, SAXException { |
| //this is the metadata for this particular image |
| Metadata metadata = new Metadata(); |
| String suffix = getSuffix(pdImage, metadata); |
| String fileName = "image" + imageNumber + "." + suffix; |
| |
| |
| AttributesImpl attr = new AttributesImpl(); |
| attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName); |
| attr.addAttribute("", "alt", "alt", "CDATA", fileName); |
| xhtml.startElement("img", attr); |
| xhtml.endElement("img"); |
| |
| |
| metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName); |
| metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, |
| TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); |
| |
| if (extractInlineImageMetadataOnly) { |
| extractInlineImageMetadataOnly(pdImage, metadata); |
| return; |
| } |
| |
| if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) { |
| ByteArrayOutputStream buffer = new ByteArrayOutputStream(); |
| if (pdImage instanceof PDImageXObject) { |
| PDMetadataExtractor |
| .extract(((PDImageXObject) pdImage).getMetadata(), metadata, parseContext); |
| } |
| //extract the metadata contained outside of the image |
| try { |
| writeToBuffer(pdImage, suffix, useDirectJPEG, buffer); |
| } catch (MissingImageReaderException e) { |
| EmbeddedDocumentUtil.recordException(e, parentMetadata); |
| return; |
| } catch (IOException e) { |
| EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata); |
| return; |
| } |
| try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) { |
| embeddedDocumentExtractor |
| .parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), metadata, |
| false); |
| } |
| } |
| |
| } |
| |
| private void extractInlineImageMetadataOnly(PDImage pdImage, Metadata metadata) |
| throws IOException, SAXException { |
| if (pdImage instanceof PDImageXObject) { |
| PDMetadataExtractor |
| .extract(((PDImageXObject) pdImage).getMetadata(), metadata, parseContext); |
| } |
| metadata.set(Metadata.IMAGE_WIDTH, pdImage.getWidth()); |
| metadata.set(Metadata.IMAGE_LENGTH, pdImage.getHeight()); |
| //TODO: what else can we extract from the PDImage without rendering? |
| ZeroByteFileException.IgnoreZeroByteFileException before = |
| parseContext.get(ZeroByteFileException.IgnoreZeroByteFileException.class); |
| try { |
| parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, |
| ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION); |
| embeddedDocumentExtractor.parseEmbedded(TikaInputStream.get(new byte[0]), |
| new EmbeddedContentHandler(xhtml), metadata, false); |
| } finally { |
| //replace whatever was there before |
| parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, before); |
| } |
| } |
| |
| private String getSuffix(PDImage pdImage, Metadata metadata) throws IOException { |
| String suffix = pdImage.getSuffix(); |
| |
| if (suffix == null || suffix.equals("png")) { |
| metadata.set(Metadata.CONTENT_TYPE, "image/png"); |
| suffix = "png"; |
| } else if (suffix.equals("jpg")) { |
| metadata.set(Metadata.CONTENT_TYPE, "image/jpeg"); |
| } else if (suffix.equals("tiff")) { |
| metadata.set(Metadata.CONTENT_TYPE, "image/tiff"); |
| suffix = "tif"; |
| } else if (suffix.equals("jpx")) { |
| metadata.set(Metadata.CONTENT_TYPE, "image/jp2"); |
| // use jp2 suffix for file because jpx not known by windows |
| suffix = "jp2"; |
| } else if (suffix.equals("jb2")) { |
| //PDFBox resets suffix to png when image's suffix == jb2 |
| metadata.set(Metadata.CONTENT_TYPE, "image/x-jbig2"); |
| } else { |
| //TODO: determine if we need to add more image types |
| // throw new RuntimeException("EXTEN:" + extension); |
| } |
| if (hasMasks(pdImage)) { |
| // TIKA-3040, PDFBOX-4771: can't save ARGB as JPEG |
| suffix = "png"; |
| } |
| return suffix; |
| } |
| |
| void handleCatchableIOE(IOException e) throws IOException { |
| if (pdfParserConfig.isCatchIntermediateIOExceptions()) { |
| if (e.getCause() instanceof SAXException && e.getCause().getMessage() != null && |
| e.getCause().getMessage().contains("Your document contained more than")) { |
| //TODO -- is there a cleaner way of checking for: |
| // WriteOutContentHandler.WriteLimitReachedException? |
| throw e; |
| } |
| |
| String msg = e.getMessage(); |
| if (msg == null) { |
| msg = "IOException, no message"; |
| } |
| parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg); |
| exceptions.add(e); |
| } else { |
| throw e; |
| } |
| } |
| |
| List<IOException> getExceptions() { |
| return exceptions; |
| } |
| } |