| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.pdf; |
| |
| import java.io.IOException; |
| import java.io.Writer; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.concurrent.atomic.AtomicInteger; |
| |
| import org.apache.pdfbox.cos.COSArray; |
| import org.apache.pdfbox.cos.COSName; |
| import org.apache.pdfbox.cos.COSStream; |
| import org.apache.pdfbox.pdmodel.PDDocument; |
| import org.apache.pdfbox.pdmodel.PDPage; |
| import org.apache.pdfbox.pdmodel.PDPageContentStream; |
| import org.apache.pdfbox.text.PDFTextStripper; |
| import org.apache.pdfbox.text.TextPosition; |
| import org.apache.pdfbox.util.Matrix; |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.parser.ParseContext; |
| |
| /** |
| * Utility class that overrides the {@link PDFTextStripper} functionality |
| * to produce a semi-structured XHTML SAX events instead of a plain text |
| * stream. |
| */ |
| class PDF2XHTML extends AbstractPDF2XHTML { |
| |
| |
| /** |
| * This keeps track of the pdf object ids for inline |
| * images that have been processed. |
| * If {@link PDFParserConfig#isExtractUniqueInlineImagesOnly() |
| * is true, this will be checked before extracting an embedded image. |
| * The integer keeps track of the inlineImageCounter for that image. |
| * This integer is used to identify images in the markup. |
| * <p> |
| * This is used across the document. To avoid infinite recursion |
| * TIKA-1742, we're limiting the export to one image per page. |
| */ |
| private Map<COSStream, Integer> processedInlineImages = new HashMap<>(); |
| private AtomicInteger inlineImageCounter = new AtomicInteger(0); |
| |
| PDF2XHTML(PDDocument document, ContentHandler handler, ParseContext context, Metadata metadata, |
| PDFParserConfig config) throws IOException { |
| super(document, handler, context, metadata, config); |
| } |
| |
| /** |
| * Converts the given PDF document (and related metadata) to a stream |
| * of XHTML SAX events sent to the given content handler. |
| * |
| * @param document PDF document |
| * @param handler SAX content handler |
| * @param metadata PDF metadata |
| * @throws SAXException if the content handler fails to process SAX events |
| * @throws TikaException if there was an exception outside of per page processing |
| */ |
| public static void process(PDDocument document, ContentHandler handler, ParseContext context, |
| Metadata metadata, PDFParserConfig config) |
| throws SAXException, TikaException { |
| PDF2XHTML pdf2XHTML = null; |
| try { |
| // Extract text using a dummy Writer as we override the |
| // key methods to output to the given content |
| // handler. |
| if (config.isDetectAngles()) { |
| pdf2XHTML = |
| new AngleDetectingPDF2XHTML(document, handler, context, metadata, config); |
| } else { |
| pdf2XHTML = new PDF2XHTML(document, handler, context, metadata, config); |
| } |
| config.configure(pdf2XHTML); |
| |
| pdf2XHTML.writeText(document, new Writer() { |
| @Override |
| public void write(char[] cbuf, int off, int len) { |
| } |
| |
| @Override |
| public void flush() { |
| } |
| |
| @Override |
| public void close() { |
| } |
| }); |
| } catch (IOException e) { |
| if (e.getCause() instanceof SAXException) { |
| throw (SAXException) e.getCause(); |
| } else { |
| throw new TikaException("Unable to extract PDF content", e); |
| } |
| } |
| if (pdf2XHTML.exceptions.size() > 0) { |
| //throw the first |
| throw new TikaException("Unable to extract PDF content", pdf2XHTML.exceptions.get(0)); |
| } |
| } |
| |
| @Override |
| public void processPage(PDPage page) throws IOException { |
| try { |
| super.processPage(page); |
| } catch (IOException e) { |
| handleCatchableIOE(e); |
| endPage(page); |
| } |
| } |
| |
| @Override |
| protected void endPage(PDPage page) throws IOException { |
| try { |
| writeParagraphEnd(); |
| try { |
| extractImages(page); |
| } catch (IOException e) { |
| handleCatchableIOE(e); |
| } |
| super.endPage(page); |
| } catch (SAXException e) { |
| throw new IOException("Unable to end a page", e); |
| } catch (IOException e) { |
| handleCatchableIOE(e); |
| } |
| } |
| |
| void extractImages(PDPage page) throws SAXException, IOException { |
| if (config.isExtractInlineImages() == false && |
| config.isExtractInlineImageMetadataOnly() == false) { |
| return; |
| } |
| |
| ImageGraphicsEngine engine = |
| new ImageGraphicsEngine(page, embeddedDocumentExtractor, config, |
| processedInlineImages, inlineImageCounter, xhtml, metadata, context); |
| engine.run(); |
| List<IOException> engineExceptions = engine.getExceptions(); |
| if (engineExceptions.size() > 0) { |
| IOException first = engineExceptions.remove(0); |
| if (config.isCatchIntermediateIOExceptions()) { |
| exceptions.addAll(engineExceptions); |
| } |
| throw first; |
| } |
| } |
| |
| @Override |
| protected void writeParagraphStart() throws IOException { |
| super.writeParagraphStart(); |
| try { |
| xhtml.startElement("p"); |
| } catch (SAXException e) { |
| throw new IOException("Unable to start a paragraph", e); |
| } |
| } |
| |
| @Override |
| protected void writeParagraphEnd() throws IOException { |
| super.writeParagraphEnd(); |
| try { |
| xhtml.endElement("p"); |
| } catch (SAXException e) { |
| throw new IOException("Unable to end a paragraph", e); |
| } |
| } |
| |
| @Override |
| protected void writeString(String text) throws IOException { |
| try { |
| xhtml.characters(text); |
| } catch (SAXException e) { |
| throw new IOException("Unable to write a string: " + text, e); |
| } |
| } |
| |
| @Override |
| protected void writeCharacters(TextPosition text) throws IOException { |
| try { |
| xhtml.characters(text.getUnicode()); |
| } catch (SAXException e) { |
| throw new IOException("Unable to write a character: " + text.getUnicode(), e); |
| } |
| } |
| |
| @Override |
| protected void writeWordSeparator() throws IOException { |
| try { |
| xhtml.characters(getWordSeparator()); |
| } catch (SAXException e) { |
| throw new IOException("Unable to write a space character", e); |
| } |
| } |
| |
| @Override |
| protected void writeLineSeparator() throws IOException { |
| try { |
| xhtml.newline(); |
| } catch (SAXException e) { |
| throw new IOException("Unable to write a newline character", e); |
| } |
| } |
| |
| private static class AngleDetectingPDF2XHTML extends PDF2XHTML { |
| |
| private AngleDetectingPDF2XHTML(PDDocument document, ContentHandler handler, |
| ParseContext context, Metadata metadata, |
| PDFParserConfig config) throws IOException { |
| super(document, handler, context, metadata, config); |
| } |
| |
| @Override |
| protected void startPage(PDPage page) throws IOException { |
| //no-op |
| } |
| |
| @Override |
| protected void endPage(PDPage page) throws IOException { |
| //no-op |
| } |
| |
| @Override |
| public void processPage(PDPage page) throws IOException { |
| try { |
| super.startPage(page); |
| detectAnglesAndProcessPage(page); |
| } catch (IOException e) { |
| handleCatchableIOE(e); |
| } finally { |
| super.endPage(page); |
| } |
| } |
| |
| private void detectAnglesAndProcessPage(PDPage page) throws IOException { |
| //copied and pasted from https://issues.apache.org/jira/secure/attachment/12947452/ExtractAngledText.java |
| //PDFBOX-4371 |
| AngleCollector angleCollector = new AngleCollector(); // alternatively, reset angles |
| angleCollector.setStartPage(getCurrentPageNo()); |
| angleCollector.setEndPage(getCurrentPageNo()); |
| angleCollector.getText(document); |
| |
| int rotation = page.getRotation(); |
| page.setRotation(0); |
| |
| for (Integer angle : angleCollector.getAngles()) { |
| if (angle == 0) { |
| try { |
| super.processPage(page); |
| } catch (IOException e) { |
| handleCatchableIOE(e); |
| } |
| } else { |
| // prepend a transformation |
| try (PDPageContentStream cs = new PDPageContentStream(document, page, |
| PDPageContentStream.AppendMode.PREPEND, false)) { |
| cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0)); |
| } |
| |
| try { |
| super.processPage(page); |
| } catch (IOException e) { |
| handleCatchableIOE(e); |
| } |
| |
| // remove transformation |
| COSArray contents = (COSArray) page.getCOSObject().getItem(COSName.CONTENTS); |
| contents.remove(0); |
| } |
| } |
| page.setRotation(rotation); |
| } |
| |
| @Override |
| protected void processTextPosition(TextPosition text) { |
| Matrix m = text.getTextMatrix(); |
| m.concatenate(text.getFont().getFontMatrix()); |
| int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY()))); |
| if (angle == 0) { |
| super.processTextPosition(text); |
| } |
| } |
| } |
| |
| static class AngleCollector extends PDFTextStripper { |
| Set<Integer> angles = new HashSet<>(); |
| |
| /** |
| * Instantiate a new PDFTextStripper object. |
| * |
| * @throws IOException If there is an error loading the properties. |
| */ |
| AngleCollector() throws IOException { |
| } |
| |
| public Set<Integer> getAngles() { |
| return angles; |
| } |
| |
| @Override |
| protected void processTextPosition(TextPosition text) { |
| Matrix m = text.getTextMatrix(); |
| m.concatenate(text.getFont().getFontMatrix()); |
| int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY()))); |
| angle = (angle + 360) % 360; |
| angles.add(angle); |
| } |
| } |
| } |
| |