tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.pdf;

 import java.awt.geom.Point2D;
 import java.awt.image.BufferedImage;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.atomic.AtomicInteger;

 import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.cos.COSStream;
 import org.apache.pdfbox.filter.MissingImageReaderException;
 import org.apache.pdfbox.io.IOUtils;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDResources;
 import org.apache.pdfbox.pdmodel.font.PDFont;
 import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
 import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
 import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
 import org.apache.pdfbox.pdmodel.graphics.color.PDPattern;
 import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 import org.apache.pdfbox.pdmodel.graphics.pattern.PDAbstractPattern;
 import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
 import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
 import org.apache.pdfbox.pdmodel.graphics.state.PDSoftMask;
 import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
 import org.apache.pdfbox.tools.imageio.ImageIOUtil;
 import org.apache.pdfbox.util.Matrix;
 import org.apache.pdfbox.util.Vector;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;

 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.TikaMemoryLimitException;
 import org.apache.tika.exception.ZeroByteFileException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
 import org.apache.tika.io.BoundedInputStream;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;

 /**
  * Graphics stream engine that visits the images drawn on a PDF page. For each image it
  * writes an {@code <img>} element to the XHTML output and hands the image bytes (or,
  * when configured, only its metadata) to the {@link EmbeddedDocumentExtractor}.
  * <p>
  * Copied nearly verbatim from PDFBox (ExtractImages).
  */
 class ImageGraphicsEngine extends PDFGraphicsStreamEngine {

     //We're currently copying images to byte[].  We should
     //limit the length to avoid OOM on crafted files.
     private static final long MAX_IMAGE_LENGTH_BYTES = 100 * 1024 * 1024;

     //filter names handed to PDImage#createInputStream so that the
     //still-encoded JPEG / JPEG2000 / JBIG2 stream can be copied unmodified
     private static final List<String> JPEG =
             Arrays.asList(COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName());

     private static final List<String> JP2 = Collections.singletonList(COSName.JPX_DECODE.getName());

     private static final List<String> JB2 = Collections.singletonList(COSName.JBIG2_DECODE.getName());

     //IOExceptions that were recorded rather than rethrown; see handleCatchableIOE
     final List<IOException> exceptions = new ArrayList<>();
     private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
     private final PDFParserConfig pdfParserConfig;
     //maps an already seen image XObject stream to the image number it was given
     private final Map<COSStream, Integer> processedInlineImages;
     private final AtomicInteger imageCounter;
     private final Metadata parentMetadata;
     private final XHTMLContentHandler xhtml;
     private final ParseContext parseContext;
     private final boolean extractInlineImageMetadataOnly;
     //TODO: parameterize this ?
     private boolean useDirectJPEG = false;

     /**
      * Creates an engine for a single page.
      *
      * @param page                      page whose content stream will be processed
      * @param embeddedDocumentExtractor extractor that receives each image
      * @param pdfParserConfig           configuration consulted for inline image handling and
      *                                  for whether intermediate IOExceptions are caught
      * @param processedInlineImages     map of image XObject streams to assigned image numbers
      * @param imageCounter              counter that supplies the next image number
      * @param xhtml                     handler that receives the {@code <img>} elements
      * @param parentMetadata            metadata on which warnings and exceptions are recorded
      * @param parseContext              parse context passed on to metadata extraction
      */
     //TODO: this is an embarrassment of an initializer...fix
     protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
                                   PDFParserConfig pdfParserConfig,
                                   Map<COSStream, Integer> processedInlineImages,
                                   AtomicInteger imageCounter, XHTMLContentHandler xhtml,
                                   Metadata parentMetadata, ParseContext parseContext) {
         super(page);
         this.embeddedDocumentExtractor = embeddedDocumentExtractor;
         this.pdfParserConfig = pdfParserConfig;
         this.processedInlineImages = processedInlineImages;
         this.imageCounter = imageCounter;
         this.xhtml = xhtml;
         this.parentMetadata = parentMetadata;
         this.parseContext = parseContext;
         this.extractInlineImageMetadataOnly = pdfParserConfig.isExtractInlineImageMetadataOnly();
     }

     /**
      * Writes the image to {@code out} in the format implied by {@code suffix}.
      * Nearly directly copied from PDFBox ExtractImages.
      * <p>
      * JPEG and JPEG2000 data in a gray or RGB colorspace (or any colorspace when
      * {@code directJPEG} is set) and JBIG2 data are copied in their encoded form; everything
      * else is rendered to a {@link BufferedImage} and re-encoded. If the image cannot be
      * rendered ({@code getImage()} returns {@code null}) nothing is written.
      *
      * @param pdImage    image to write
      * @param suffix     target format: "jpg", "jp2", "tif", "jb2" or any other ImageIO format name
      * @param directJPEG if true, copy JPEG/JPEG2000 streams unmodified regardless of colorspace
      * @param out        destination stream; flushed but not closed
      * @throws IOException   if reading, rendering or writing the image fails
      * @throws TikaException if an encoded stream exceeds {@link #MAX_IMAGE_LENGTH_BYTES}
      */
     private static void writeToBuffer(PDImage pdImage, String suffix, boolean directJPEG,
                                       OutputStream out) throws IOException, TikaException {

         if ("jpg".equals(suffix)) {
             String colorSpaceName = pdImage.getColorSpace().getName();
             if (directJPEG || isGrayOrRgb(colorSpaceName)) {
                 // RGB or Gray colorspace: get and write the unmodified JPEG stream
                 copyEncodedStream(pdImage, JPEG, out);
             } else {
                 BufferedImage image = pdImage.getImage();
                 if (image != null) {
                     // for CMYK and other "unusual" colorspaces, the JPEG will be converted
                     ImageIOUtil.writeImage(image, suffix, out);
                 }
             }
         } else if ("jp2".equals(suffix)) {
             String colorSpaceName = pdImage.getColorSpace().getName();
             if (directJPEG || (!hasMasks(pdImage) && isGrayOrRgb(colorSpaceName))) {
                 // RGB or Gray colorspace: get and write the unmodified JPEG2000 stream
                 copyEncodedStream(pdImage, JP2, out);
             } else {
                 // for CMYK and other "unusual" colorspaces, the image will be converted
                 BufferedImage image = pdImage.getImage();
                 if (image != null) {
                     ImageIOUtil.writeImage(image, "jpeg2000", out);
                 }
             }
         } else if ("tif".equals(suffix) && pdImage.getColorSpace().equals(PDDeviceGray.INSTANCE)) {
             BufferedImage image = pdImage.getImage();
             if (image == null) {
                 return;
             }
             // CCITT compressed images can have a different colorspace, but this one is B/W
             // This is a bitonal image, so copy to TYPE_BYTE_BINARY
             // so that a G4 compressed TIFF image is created by ImageIOUtil.writeImage()
             int w = image.getWidth();
             int h = image.getHeight();
             BufferedImage bitonalImage = new BufferedImage(w, h, BufferedImage.TYPE_BYTE_BINARY);
             // copy image the old fashioned way - ColorConvertOp is slower!
             for (int y = 0; y < h; y++) {
                 for (int x = 0; x < w; x++) {
                     bitonalImage.setRGB(x, y, image.getRGB(x, y));
                 }
             }
             ImageIOUtil.writeImage(bitonalImage, suffix, out);
         } else if ("jb2".equals(suffix)) {
             copyEncodedStream(pdImage, JB2, out);
         } else {
             BufferedImage image = pdImage.getImage();
             if (image == null) {
                 return;
             }
             ImageIOUtil.writeImage(image, suffix, out);
         }

         out.flush();
     }

     /**
      * Returns true if the colorspace name is that of DeviceGray or DeviceRGB.
      */
     private static boolean isGrayOrRgb(String colorSpaceName) {
         return PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) ||
                 PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName);
     }

     /**
      * Copies the image's stream, obtained with the given filter names, to {@code out},
      * enforcing the length limit and always closing the source stream quietly.
      */
     private static void copyEncodedStream(PDImage pdImage, List<String> stopFilters,
                                           OutputStream out) throws IOException, TikaException {
         InputStream data = pdImage.createInputStream(stopFilters);
         try {
             copyUpToMaxLength(data, out);
         } finally {
             IOUtils.closeQuietly(data);
         }
     }

     /**
      * Copies at most {@link #MAX_IMAGE_LENGTH_BYTES} bytes from {@code is} to {@code os}.
      *
      * @throws TikaMemoryLimitException if the bound was hit while copying
      */
     private static void copyUpToMaxLength(InputStream is, OutputStream os)
             throws IOException, TikaException {
         BoundedInputStream bis = new BoundedInputStream(MAX_IMAGE_LENGTH_BYTES, is);
         IOUtils.copy(bis, os);
         if (bis.hasHitBound()) {
             throw new TikaMemoryLimitException(
                     "Image size is larger than allowed (" + MAX_IMAGE_LENGTH_BYTES + ")");
         }

     }

     /**
      * Returns true if the image is an image XObject that carries a mask or a soft mask.
      */
     private static boolean hasMasks(PDImage pdImage) throws IOException {
         if (pdImage instanceof PDImageXObject) {
             PDImageXObject ximg = (PDImageXObject) pdImage;
             return ximg.getMask() != null || ximg.getSoftMask() != null;
         }
         return false;
     }

     /**
      * Processes the page and then the transparency group of every soft mask found in the
      * page's extended graphics states.
      *
      * @throws IOException if processing fails and the exception is not caught by
      *                     {@link #handleCatchableIOE(IOException)}
      */
     void run() throws IOException {
         PDPage page = getPage();

         //TODO: is there a better way to do this rather than reprocessing the page
         //can we process the text and images in one go?
         processPage(page);
         PDResources res = page.getResources();
         if (res == null) {
             return;
         }

         for (COSName name : res.getExtGStateNames()) {
             PDExtendedGraphicsState extendedGraphicsState = res.getExtGState(name);
             if (extendedGraphicsState != null) {
                 PDSoftMask softMask = extendedGraphicsState.getSoftMask();

                 if (softMask != null) {
                     try {
                         PDTransparencyGroup group = softMask.getGroup();

                         if (group != null) {
                             // PDFBOX-4327: without this line NPEs will occur
                             res.getExtGState(name).copyIntoGraphicsState(getGraphicsState());

                             processSoftMask(group);
                         }
                     } catch (IOException e) {
                         handleCatchableIOE(e);
                     }
                 }
             }
         }
     }

     /**
      * Assigns the image a number and processes it.
      * <p>
      * An image XObject that was seen before is skipped when only unique inline images are
      * requested; otherwise it is processed again under the number it was given the first
      * time. Images that are not XObjects always get a fresh number.
      */
     @Override
     public void drawImage(PDImage pdImage) throws IOException {
         int imageNumber;
         if (pdImage instanceof PDImageXObject) {
             if (pdImage.isStencil()) {
                 processColor(getGraphicsState().getNonStrokingColor());
             }

             PDImageXObject xobject = (PDImageXObject) pdImage;
             Integer cachedNumber = processedInlineImages.get(xobject.getCOSObject());
             if (cachedNumber == null) {
                 imageNumber = imageCounter.getAndIncrement();
                 processedInlineImages.put(xobject.getCOSObject(), imageNumber);
             } else if (pdfParserConfig.isExtractUniqueInlineImagesOnly()) {
                 // skip duplicate image
                 return;
             } else {
                 // repeated image: keep the number of its first occurrence
                 // instead of falling back to 0
                 imageNumber = cachedNumber;
             }
         } else {
             imageNumber = imageCounter.getAndIncrement();
         }
         //TODO: should we use the hash of the PDImage to check for seen
         //For now, we're relying on the cosobject, but this could lead to
         //duplicates if the pdImage is not a PDImageXObject?
         try {
             processImage(pdImage, imageNumber);
         } catch (TikaException | SAXException e) {
             throw new IOException(e);
         } catch (IOException e) {
             handleCatchableIOE(e);
         }
     }

     // The path construction callbacks below are no-ops in this engine.

     @Override
     public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException {

     }

     @Override
     public void clip(int windingRule) throws IOException {

     }

     @Override
     public void moveTo(float x, float y) throws IOException {

     }

     @Override
     public void lineTo(float x, float y) throws IOException {

     }

     @Override
     public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
             throws IOException {

     }

     /**
      * Always reports the origin, since no path is tracked.
      */
     @Override
     public Point2D getCurrentPoint() throws IOException {
         return new Point2D.Float(0, 0);
     }

     @Override
     public void closePath() throws IOException {

     }

     @Override
     public void endPath() throws IOException {

     }

     /**
      * Checks the fill and/or stroke color for tiling patterns, depending on the
      * current text rendering mode.
      */
     @Override
     protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode,
                              Vector displacement) throws IOException {

         RenderingMode renderingMode = getGraphicsState().getTextState().getRenderingMode();
         if (renderingMode.isFill()) {
             processColor(getGraphicsState().getNonStrokingColor());
         }

         if (renderingMode.isStroke()) {
             processColor(getGraphicsState().getStrokingColor());
         }
     }

     @Override
     public void strokePath() throws IOException {
         processColor(getGraphicsState().getStrokingColor());
     }

     @Override
     public void fillPath(int windingRule) throws IOException {
         processColor(getGraphicsState().getNonStrokingColor());
     }

     @Override
     public void fillAndStrokePath(int windingRule) throws IOException {
         processColor(getGraphicsState().getNonStrokingColor());
     }

     @Override
     public void shadingFill(COSName shadingName) throws IOException {

     }

     // find out if it is a tiling pattern, then process that one
     private void processColor(PDColor color) throws IOException {
         if (color.getColorSpace() instanceof PDPattern) {
             PDPattern pattern = (PDPattern) color.getColorSpace();
             PDAbstractPattern abstractPattern = pattern.getPattern(color);

             if (abstractPattern instanceof PDTilingPattern) {
                 processTilingPattern((PDTilingPattern) abstractPattern, null, null);
             }
         }
     }

     /**
      * Writes an {@code <img>} element for the image and passes the image to the embedded
      * document extractor.
      *
      * @param pdImage     image to process
      * @param imageNumber number used to build the image's file name
      * @throws IOException   if the embedded document extractor or metadata extraction fails
      * @throws TikaException if writing the image to the buffer exceeds the size limit
      * @throws SAXException  if writing to the XHTML handler fails
      */
     private void processImage(PDImage pdImage, int imageNumber)
             throws IOException, TikaException, SAXException {
         //this is the metadata for this particular image
         Metadata metadata = new Metadata();
         String suffix = getSuffix(pdImage, metadata);
         String fileName = "image" + imageNumber + "." + suffix;


         AttributesImpl attr = new AttributesImpl();
         attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
         attr.addAttribute("", "alt", "alt", "CDATA", fileName);
         xhtml.startElement("img", attr);
         xhtml.endElement("img");


         metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
         metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());

         if (extractInlineImageMetadataOnly) {
             extractInlineImageMetadataOnly(pdImage, metadata);
             return;
         }

         if (embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
             ByteArrayOutputStream buffer = new ByteArrayOutputStream();
             //extract the metadata contained outside of the image
             if (pdImage instanceof PDImageXObject) {
                 PDMetadataExtractor
                         .extract(((PDImageXObject) pdImage).getMetadata(), metadata, parseContext);
             }
             try {
                 writeToBuffer(pdImage, suffix, useDirectJPEG, buffer);
             } catch (MissingImageReaderException e) {
                 EmbeddedDocumentUtil.recordException(e, parentMetadata);
                 return;
             } catch (IOException e) {
                 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
                 return;
             }
             try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
                 embeddedDocumentExtractor
                         .parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), metadata,
                                 false);
             }
         }

     }

     /**
      * Passes only the image's metadata (XMP metadata for XObjects, plus width and height)
      * to the embedded document extractor, together with an empty stream.
      * <p>
      * While the extractor runs, the parse context is told to ignore zero-byte-file
      * exceptions; the previous setting is restored afterwards.
      */
     private void extractInlineImageMetadataOnly(PDImage pdImage, Metadata metadata)
             throws IOException, SAXException {
         if (pdImage instanceof PDImageXObject) {
             PDMetadataExtractor
                     .extract(((PDImageXObject) pdImage).getMetadata(), metadata, parseContext);
         }
         metadata.set(Metadata.IMAGE_WIDTH, pdImage.getWidth());
         metadata.set(Metadata.IMAGE_LENGTH, pdImage.getHeight());
         //TODO: what else can we extract from the PDImage without rendering?
         ZeroByteFileException.IgnoreZeroByteFileException before =
                 parseContext.get(ZeroByteFileException.IgnoreZeroByteFileException.class);
         try {
             parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
                     ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION);
             //close the stream once the extractor is done, as processImage does
             try (InputStream emptyStream = TikaInputStream.get(new byte[0])) {
                 embeddedDocumentExtractor.parseEmbedded(emptyStream,
                         new EmbeddedContentHandler(xhtml), metadata, false);
             }
         } finally {
             //replace whatever was there before
             parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, before);
         }
     }

     /**
      * Determines the file suffix for the image and sets the matching content type on
      * {@code metadata} for the suffixes it recognizes.
      *
      * @return the suffix to use for the image's file name; "png" whenever the image has masks
      */
     private String getSuffix(PDImage pdImage, Metadata metadata) throws IOException {
         String suffix = pdImage.getSuffix();

         if (suffix == null || suffix.equals("png")) {
             metadata.set(Metadata.CONTENT_TYPE, "image/png");
             suffix = "png";
         } else if (suffix.equals("jpg")) {
             metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
         } else if (suffix.equals("tiff")) {
             metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
             suffix = "tif";
         } else if (suffix.equals("jpx")) {
             metadata.set(Metadata.CONTENT_TYPE, "image/jp2");
             // use jp2 suffix for file because jpx not known by windows
             suffix = "jp2";
         } else if (suffix.equals("jb2")) {
             //PDFBox resets suffix to png when image's suffix == jb2
             metadata.set(Metadata.CONTENT_TYPE, "image/x-jbig2");
         } else {
             //TODO: determine if we need to add more image types
             //any other suffix is passed through unchanged and no content type is set
         }
         if (hasMasks(pdImage)) {
             // TIKA-3040, PDFBOX-4771: can't save ARGB as JPEG
             suffix = "png";
         }
         return suffix;
     }

     /**
      * Either records or rethrows the exception, depending on configuration.
      * <p>
      * When intermediate IOExceptions are caught, the message is added to the parent metadata
      * as a warning and the exception is stored in {@link #exceptions}. An exception caused by
      * a SAXException reporting that the write limit was reached is always rethrown.
      *
      * @param e exception to handle
      * @throws IOException {@code e} itself, when it must not be swallowed
      */
     void handleCatchableIOE(IOException e) throws IOException {
         if (pdfParserConfig.isCatchIntermediateIOExceptions()) {
             Throwable cause = e.getCause();
             if (cause instanceof SAXException && cause.getMessage() != null &&
                     cause.getMessage().contains("Your document contained more than")) {
                 //TODO -- is there a cleaner way of checking for:
                 // WriteOutContentHandler.WriteLimitReachedException?
                 throw e;
             }

             String msg = e.getMessage();
             if (msg == null) {
                 msg = "IOException, no message";
             }
             parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, msg);
             exceptions.add(e);
         } else {
             throw e;
         }
     }

     /**
      * Returns the IOExceptions that were recorded instead of being thrown.
      */
     List<IOException> getExceptions() {
         return exceptions;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser.pdf;

	import java.awt.geom.Point2D;
	import java.awt.image.BufferedImage;
	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.OutputStream;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.Collections;
	import java.util.List;
	import java.util.Map;
	import java.util.concurrent.atomic.AtomicInteger;

	import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
	import org.apache.pdfbox.cos.COSName;
	import org.apache.pdfbox.cos.COSStream;
	import org.apache.pdfbox.filter.MissingImageReaderException;
	import org.apache.pdfbox.io.IOUtils;
	import org.apache.pdfbox.pdmodel.PDPage;
	import org.apache.pdfbox.pdmodel.PDResources;
	import org.apache.pdfbox.pdmodel.font.PDFont;
	import org.apache.pdfbox.pdmodel.graphics.color.PDColor;
	import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceGray;
	import org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB;
	import org.apache.pdfbox.pdmodel.graphics.color.PDPattern;
	import org.apache.pdfbox.pdmodel.graphics.form.PDTransparencyGroup;
	import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
	import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
	import org.apache.pdfbox.pdmodel.graphics.pattern.PDAbstractPattern;
	import org.apache.pdfbox.pdmodel.graphics.pattern.PDTilingPattern;
	import org.apache.pdfbox.pdmodel.graphics.state.PDExtendedGraphicsState;
	import org.apache.pdfbox.pdmodel.graphics.state.PDSoftMask;
	import org.apache.pdfbox.pdmodel.graphics.state.RenderingMode;
	import org.apache.pdfbox.tools.imageio.ImageIOUtil;
	import org.apache.pdfbox.util.Matrix;
	import org.apache.pdfbox.util.Vector;
	import org.xml.sax.SAXException;
	import org.xml.sax.helpers.AttributesImpl;

	import org.apache.tika.exception.TikaException;
	import org.apache.tika.exception.TikaMemoryLimitException;
	import org.apache.tika.exception.ZeroByteFileException;
	import org.apache.tika.extractor.EmbeddedDocumentExtractor;
	import org.apache.tika.extractor.EmbeddedDocumentUtil;
	import org.apache.tika.io.BoundedInputStream;
	import org.apache.tika.io.TikaInputStream;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.metadata.TikaCoreProperties;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.sax.EmbeddedContentHandler;
	import org.apache.tika.sax.XHTMLContentHandler;

	/**
	* Copied nearly verbatim from PDFBox
	*/
	class ImageGraphicsEngine extends PDFGraphicsStreamEngine {

	//We're currently copying images to byte[]. We should
	//limit the length to avoid OOM on crafted files.
	private static final long MAX_IMAGE_LENGTH_BYTES = 100 * 1024 * 1024;

	private static final List<String> JPEG =
	Arrays.asList(COSName.DCT_DECODE.getName(), COSName.DCT_DECODE_ABBREVIATION.getName());


	private static final List<String> JP2 = Collections.singletonList(COSName.JPX_DECODE.getName());

	private static final List<String> JB2 = Collections.singletonList(COSName.JBIG2_DECODE.getName());
	final List<IOException> exceptions = new ArrayList<>();
	private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
	private final PDFParserConfig pdfParserConfig;
	private final Map<COSStream, Integer> processedInlineImages;
	private final AtomicInteger imageCounter;
	private final Metadata parentMetadata;
	private final XHTMLContentHandler xhtml;
	private final ParseContext parseContext;
	private final boolean extractInlineImageMetadataOnly;
	//TODO: parameterize this ?
	private boolean useDirectJPEG = false;

	/**
	 * Builds an engine for one page and stores the collaborators it needs.
	 * Whether only inline image metadata is extracted is read from the config once, here.
	 */
	//TODO: this is an embarrassment of an initializer...fix
	protected ImageGraphicsEngine(PDPage page, EmbeddedDocumentExtractor embeddedDocumentExtractor,
	        PDFParserConfig pdfParserConfig,
	        Map<COSStream, Integer> processedInlineImages,
	        AtomicInteger imageCounter, XHTMLContentHandler xhtml,
	        Metadata parentMetadata, ParseContext parseContext) {
	    super(page);
	    // configuration and the flag derived from it
	    this.pdfParserConfig = pdfParserConfig;
	    this.extractInlineImageMetadataOnly = pdfParserConfig.isExtractInlineImageMetadataOnly();
	    // image bookkeeping
	    this.imageCounter = imageCounter;
	    this.processedInlineImages = processedInlineImages;
	    // output and context
	    this.embeddedDocumentExtractor = embeddedDocumentExtractor;
	    this.xhtml = xhtml;
	    this.parentMetadata = parentMetadata;
	    this.parseContext = parseContext;
	}

	//nearly directly copied from PDFBox ExtractImages
	private static void writeToBuffer(PDImage pdImage, String suffix, boolean directJPEG,
	OutputStream out) throws IOException, TikaException {

	if ("jpg".equals(suffix)) {

	String colorSpaceName = pdImage.getColorSpace().getName();
	if (directJPEG \|\| (PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) \|\|
	PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName))) {
	// RGB or Gray colorspace: get and write the unmodified JPEG stream
	InputStream data = pdImage.createInputStream(JPEG);
	try {
	copyUpToMaxLength(data, out);
	} finally {
	IOUtils.closeQuietly(data);
	}
	} else {
	BufferedImage image = pdImage.getImage();
	if (image != null) {
	// for CMYK and other "unusual" colorspaces, the JPEG will be converted
	ImageIOUtil.writeImage(image, suffix, out);
	}
	}
	} else if ("jp2".equals(suffix)) {
	String colorSpaceName = pdImage.getColorSpace().getName();
	if (directJPEG \|\| !hasMasks(pdImage) &&
	(PDDeviceGray.INSTANCE.getName().equals(colorSpaceName) \|\|
	PDDeviceRGB.INSTANCE.getName().equals(colorSpaceName))) {
	// RGB or Gray colorspace: get and write the unmodified JPEG2000 stream
	InputStream data = pdImage.createInputStream(JP2);
	try {
	copyUpToMaxLength(data, out);
	} finally {
	IOUtils.closeQuietly(data);
	}
	} else {
	// for CMYK and other "unusual" colorspaces, the image will be converted
	BufferedImage image = pdImage.getImage();
	if (image != null) {
	// for CMYK and other "unusual" colorspaces, the JPEG will be converted
	ImageIOUtil.writeImage(image, "jpeg2000", out);
	}
	}
	} else if ("tif".equals(suffix) && pdImage.getColorSpace().equals(PDDeviceGray.INSTANCE)) {
	BufferedImage image = pdImage.getImage();
	if (image == null) {
	return;
	}
	// CCITT compressed images can have a different colorspace, but this one is B/W
	// This is a bitonal image, so copy to TYPE_BYTE_BINARY
	// so that a G4 compressed TIFF image is created by ImageIOUtil.writeImage()
	int w = image.getWidth();
	int h = image.getHeight();
	BufferedImage bitonalImage = new BufferedImage(w, h, BufferedImage.TYPE_BYTE_BINARY);
	// copy image the old fashioned way - ColorConvertOp is slower!
	for (int y = 0; y < h; y++) {
	for (int x = 0; x < w; x++) {
	bitonalImage.setRGB(x, y, image.getRGB(x, y));
	}
	}
	ImageIOUtil.writeImage(bitonalImage, suffix, out);
	} else if ("jb2".equals(suffix)) {
	InputStream data = pdImage.createInputStream(JB2);
	try {
	copyUpToMaxLength(data, out);
	} finally {
	IOUtils.closeQuietly(data);
	}
	} else {
	BufferedImage image = pdImage.getImage();
	if (image == null) {
	return;
	}
	ImageIOUtil.writeImage(image, suffix, out);
	}

	out.flush();
	}

	/**
	 * Copies {@code is} to {@code os} through a stream bounded at MAX_IMAGE_LENGTH_BYTES.
	 *
	 * @throws TikaMemoryLimitException if the bound was hit during the copy
	 */
	private static void copyUpToMaxLength(InputStream is, OutputStream os)
	        throws IOException, TikaException {
	    BoundedInputStream bounded = new BoundedInputStream(MAX_IMAGE_LENGTH_BYTES, is);
	    IOUtils.copy(bounded, os);
	    if (!bounded.hasHitBound()) {
	        // everything fit within the limit
	        return;
	    }
	    throw new TikaMemoryLimitException(
	            "Image size is larger than allowed (" + MAX_IMAGE_LENGTH_BYTES + ")");
	}

	private static boolean hasMasks(PDImage pdImage) throws IOException {
	if (pdImage instanceof PDImageXObject) {
	PDImageXObject ximg = (PDImageXObject) pdImage;
	return ximg.getMask() != null \|\| ximg.getSoftMask() != null;
	}
	return false;
	}

	/**
	 * Processes the page, then walks the page's extended graphics states and processes the
	 * transparency group of each soft mask found there. IOExceptions raised while handling
	 * a soft mask go through handleCatchableIOE.
	 */
	void run() throws IOException {
	    PDPage currentPage = getPage();

	    //TODO: is there a better way to do this rather than reprocessing the page
	    //can we process the text and images in one go?
	    processPage(currentPage);
	    PDResources resources = currentPage.getResources();
	    if (resources == null) {
	        return;
	    }

	    for (COSName gsName : resources.getExtGStateNames()) {
	        PDExtendedGraphicsState gState = resources.getExtGState(gsName);
	        if (gState == null) {
	            continue;
	        }
	        PDSoftMask mask = gState.getSoftMask();
	        if (mask == null) {
	            continue;
	        }
	        try {
	            PDTransparencyGroup maskGroup = mask.getGroup();
	            if (maskGroup == null) {
	                continue;
	            }
	            // PDFBOX-4327: without this line NPEs will occur
	            resources.getExtGState(gsName).copyIntoGraphicsState(getGraphicsState());

	            processSoftMask(maskGroup);
	        } catch (IOException e) {
	            handleCatchableIOE(e);
	        }
	    }
	}

	@Override
	public void drawImage(PDImage pdImage) throws IOException {
	int imageNumber = 0;
	if (pdImage instanceof PDImageXObject) {
	if (pdImage.isStencil()) {
	processColor(getGraphicsState().getNonStrokingColor());
	}

	PDImageXObject xobject = (PDImageXObject) pdImage;
	Integer cachedNumber = processedInlineImages.get(xobject.getCOSObject());
	if (cachedNumber != null && pdfParserConfig.isExtractUniqueInlineImagesOnly()) {
	// skip duplicate image
	return;
	}
	if (cachedNumber == null) {
	imageNumber = imageCounter.getAndIncrement();
	processedInlineImages.put(xobject.getCOSObject(), imageNumber);
	}
	} else {
	imageNumber = imageCounter.getAndIncrement();
	}
	//TODO: should we use the hash of the PDImage to check for seen
	//For now, we're relying on the cosobject, but this could lead to
	//duplicates if the pdImage is not a PDImageXObject?
	try {
	processImage(pdImage, imageNumber);
	} catch (TikaException \| SAXException e) {
	throw new IOException(e);
	} catch (IOException e) {
	handleCatchableIOE(e);
	}
	}

	@Override
	public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException {
	// no-op: the body is empty, rectangles are not recorded by this engine
	}

	@Override
	public void clip(int windingRule) throws IOException {
	// no-op: the body is empty, clipping is not tracked by this engine
	}

	@Override
	public void moveTo(float x, float y) throws IOException {
	// no-op: the body is empty, no path is built by this engine
	}

	@Override
	public void lineTo(float x, float y) throws IOException {
	// no-op: the body is empty, no path is built by this engine
	}

	@Override
	public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3)
	throws IOException {
	// no-op: the body is empty, no path is built by this engine
	}

	/**
	 * Always returns a new point at (0, 0).
	 */
	@Override
	public Point2D getCurrentPoint() throws IOException {
	return new Point2D.Float(0, 0);
	}

	@Override
	public void closePath() throws IOException {
	// no-op: the body is empty, no path is built by this engine
	}

	@Override
	public void endPath() throws IOException {
	// no-op: the body is empty, no path is built by this engine
	}

	/**
	 * Inspects the colors a glyph is painted with: the non-stroking color when the text
	 * rendering mode fills, the stroking color when it strokes.
	 */
	@Override
	protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, String unicode,
	        Vector displacement) throws IOException {
	    RenderingMode mode = getGraphicsState().getTextState().getRenderingMode();

	    if (mode.isFill()) {
	        PDColor fillColor = getGraphicsState().getNonStrokingColor();
	        processColor(fillColor);
	    }
	    if (mode.isStroke()) {
	        PDColor strokeColor = getGraphicsState().getStrokingColor();
	        processColor(strokeColor);
	    }
	}

	/**
	 * Inspects the current stroking color for tiling patterns.
	 */
	@Override
	public void strokePath() throws IOException {
	    PDColor strokeColor = getGraphicsState().getStrokingColor();
	    processColor(strokeColor);
	}

	/**
	 * Inspects the current non-stroking color for tiling patterns; the winding rule is unused.
	 */
	@Override
	public void fillPath(int windingRule) throws IOException {
	    PDColor fillColor = getGraphicsState().getNonStrokingColor();
	    processColor(fillColor);
	}

	/**
	 * Passes both paint colors of a fill-and-stroke operation to {@code processColor}:
	 * first the non-stroking (fill) color, then the stroking color. The winding rule
	 * is not used.
	 *
	 * <p>Both colors are inspected because a fill-and-stroke paints with both, the same
	 * way {@code showGlyph} handles a rendering mode that fills and strokes. Looking at
	 * the fill color alone would miss a tiling pattern used only for the stroke.
	 */
	@Override
	public void fillAndStrokePath(int windingRule) throws IOException {
	    processColor(getGraphicsState().getNonStrokingColor());
	    processColor(getGraphicsState().getStrokingColor());
	}

	/**
	 * No-op in this engine: the named shading is ignored.
	 */
	@Override
	public void shadingFill(COSName shadingName) throws IOException {
	    // nothing to do: the body is deliberately left empty
	}

	/**
	 * Checks whether the color refers to a tiling pattern and, if so, processes
	 * that pattern. Colors in any other color space, and patterns that are not
	 * tiling patterns, are ignored.
	 *
	 * @param color the color to inspect
	 */
	private void processColor(PDColor color) throws IOException {
	    if (!(color.getColorSpace() instanceof PDPattern)) {
	        // not a pattern color space
	        return;
	    }
	    PDAbstractPattern resolved = ((PDPattern) color.getColorSpace()).getPattern(color);
	    if (!(resolved instanceof PDTilingPattern)) {
	        return;
	    }
	    processTilingPattern((PDTilingPattern) resolved, null, null);
	}

	/**
	 * Writes an {@code <img>} element for the image into the XHTML output and then
	 * either records the image's metadata only (when {@code extractInlineImageMetadataOnly}
	 * is set) or renders the image into a buffer and hands it to the embedded
	 * document extractor.
	 *
	 * @param pdImage     the image to process
	 * @param imageNumber number used to build the {@code image<N>.<suffix>} resource name
	 * @throws IOException   if reading the image's metadata or parsing the embedded
	 *                       stream fails; failures while rendering the image into the
	 *                       buffer are recorded on the parent metadata instead
	 * @throws TikaException propagated from metadata extraction
	 * @throws SAXException  if writing to the XHTML handler or embedded parsing fails
	 */
	private void processImage(PDImage pdImage, int imageNumber)
	        throws IOException, TikaException, SAXException {
	    //this is the metadata for this particular image
	    Metadata metadata = new Metadata();
	    String suffix = getSuffix(pdImage, metadata);
	    String fileName = "image" + imageNumber + "." + suffix;

	    // placeholder element that points at the embedded resource
	    AttributesImpl attr = new AttributesImpl();
	    attr.addAttribute("", "src", "src", "CDATA", "embedded:" + fileName);
	    attr.addAttribute("", "alt", "alt", "CDATA", fileName);
	    xhtml.startElement("img", attr);
	    xhtml.endElement("img");

	    metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
	    metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
	            TikaCoreProperties.EmbeddedResourceType.INLINE.toString());

	    if (extractInlineImageMetadataOnly) {
	        extractInlineImageMetadataOnly(pdImage, metadata);
	        return;
	    }

	    if (!embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
	        return;
	    }

	    //extract the metadata contained outside of the image
	    if (pdImage instanceof PDImageXObject) {
	        PDMetadataExtractor
	                .extract(((PDImageXObject) pdImage).getMetadata(), metadata, parseContext);
	    }

	    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
	    try {
	        writeToBuffer(pdImage, suffix, useDirectJPEG, buffer);
	    } catch (MissingImageReaderException e) {
	        EmbeddedDocumentUtil.recordException(e, parentMetadata);
	        return;
	    } catch (IOException e) {
	        // Record on the parent: the per-image metadata object is dropped as soon
	        // as we return, so a warning stored there would never be seen.
	        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
	        return;
	    }
	    try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
	        embeddedDocumentExtractor
	                .parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), metadata,
	                        false);
	    }
	}

	/**
	 * Reports an image through the embedded document extractor without rendering it.
	 * The image's own metadata (when it is a {@code PDImageXObject}) plus its width and
	 * height are stored in {@code metadata}, and a zero-length stream is passed to the
	 * extractor in place of the image bytes.
	 *
	 * @param pdImage  the image whose metadata should be reported
	 * @param metadata the per-image metadata to fill in and hand to the extractor
	 * @throws IOException  if metadata extraction, embedded parsing or closing the
	 *                      stream fails
	 * @throws SAXException if embedded parsing fails while writing to the handler
	 */
	private void extractInlineImageMetadataOnly(PDImage pdImage, Metadata metadata)
	        throws IOException, SAXException {
	    if (pdImage instanceof PDImageXObject) {
	        PDMetadataExtractor
	                .extract(((PDImageXObject) pdImage).getMetadata(), metadata, parseContext);
	    }
	    metadata.set(Metadata.IMAGE_WIDTH, pdImage.getWidth());
	    metadata.set(Metadata.IMAGE_LENGTH, pdImage.getHeight());
	    //TODO: what else can we extract from the PDImage without rendering?
	    ZeroByteFileException.IgnoreZeroByteFileException before =
	            parseContext.get(ZeroByteFileException.IgnoreZeroByteFileException.class);
	    // try-with-resources so the placeholder stream is always closed
	    try (InputStream emptyIs = TikaInputStream.get(new byte[0])) {
	        // the stream is empty on purpose, so set the ignore marker while parsing it
	        parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
	                ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION);
	        embeddedDocumentExtractor.parseEmbedded(emptyIs,
	                new EmbeddedContentHandler(xhtml), metadata, false);
	    } finally {
	        //replace whatever was there before
	        parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, before);
	    }
	}

	private String getSuffix(PDImage pdImage, Metadata metadata) throws IOException {
	String suffix = pdImage.getSuffix();

	if (suffix == null \|\| suffix.equals("png")) {
	metadata.set(Metadata.CONTENT_TYPE, "image/png");
	suffix = "png";
	} else if (suffix.equals("jpg")) {
	metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
	} else if (suffix.equals("tiff")) {
	metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
	suffix = "tif";
	} else if (suffix.equals("jpx")) {
	metadata.set(Metadata.CONTENT_TYPE, "image/jp2");
	// use jp2 suffix for file because jpx not known by windows
	suffix = "jp2";
	} else if (suffix.equals("jb2")) {
	//PDFBox resets suffix to png when image's suffix == jb2
	metadata.set(Metadata.CONTENT_TYPE, "image/x-jbig2");
	} else {
	//TODO: determine if we need to add more image types
	// throw new RuntimeException("EXTEN:" + extension);
	}
	if (hasMasks(pdImage)) {
	// TIKA-3040, PDFBOX-4771: can't save ARGB as JPEG
	suffix = "png";
	}
	return suffix;
	}

	/**
	 * Decides whether an IOException is swallowed or rethrown.
	 *
	 * <p>The exception is rethrown when the parser config does not ask for intermediate
	 * IOExceptions to be caught, or when its cause is a SAXException whose message
	 * contains "Your document contained more than". Otherwise its message is added to
	 * the parent metadata as a warning and the exception is appended to the list
	 * returned by {@code getExceptions()}.
	 *
	 * @param e the exception to handle
	 * @throws IOException {@code e} itself, when it must not be swallowed
	 */
	void handleCatchableIOE(IOException e) throws IOException {
	    if (!pdfParserConfig.isCatchIntermediateIOExceptions()) {
	        throw e;
	    }

	    //TODO -- is there a cleaner way of checking for:
	    // WriteOutContentHandler.WriteLimitReachedException?
	    Throwable cause = e.getCause();
	    if (cause instanceof SAXException) {
	        String causeMessage = cause.getMessage();
	        if (causeMessage != null
	                && causeMessage.contains("Your document contained more than")) {
	            throw e;
	        }
	    }

	    String warning = e.getMessage();
	    parentMetadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
	            warning == null ? "IOException, no message" : warning);
	    exceptions.add(e);
	}

	/**
	 * Returns the list of IOExceptions held by this engine, i.e. the list that
	 * {@code handleCatchableIOE} appends to. The list is returned directly, not copied.
	 *
	 * @return the engine's exception list
	 */
	List<IOException> getExceptions() {
	return exceptions;
	}
	}