blob: 93d1b7e819f342315bffc94e052d53fcc21acc65 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.pdf;
import java.io.IOException;
import java.io.Writer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.cos.COSStream;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.util.Matrix;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
/**
* Utility class that overrides the {@link PDFTextStripper} functionality
* to produce semi-structured XHTML SAX events instead of a plain text
* stream.
*/
class PDF2XHTML extends AbstractPDF2XHTML {
/**
 * This keeps track of the pdf objects for inline
 * images that have been processed.
 * If {@link PDFParserConfig#isExtractUniqueInlineImagesOnly()}
 * is true, this will be checked before extracting an embedded image.
 * The integer keeps track of the inlineImageCounter for that image.
 * This integer is used to identify images in the markup.
 * <p>
 * This is used across the document. To avoid infinite recursion
 * TIKA-1742, we're limiting the export to one image per page.
 */
private final Map<COSStream, Integer> processedInlineImages = new HashMap<>();
/**
 * Counter for inline images; the same instance is handed to the image
 * engine created for every page.
 */
private final AtomicInteger inlineImageCounter = new AtomicInteger(0);
/**
 * Creates a new instance; every argument is forwarded unchanged to the
 * superclass constructor.
 *
 * @param document PDF document
 * @param handler  SAX content handler
 * @param context  parse context
 * @param metadata PDF metadata
 * @param config   PDF parser configuration
 * @throws IOException if the superclass constructor throws it
 */
PDF2XHTML(PDDocument document,
          ContentHandler handler,
          ParseContext context,
          Metadata metadata,
          PDFParserConfig config) throws IOException {
    super(document, handler, context, metadata, config);
}
/**
 * Converts the given PDF document (and related metadata) to a stream
 * of XHTML SAX events sent to the given content handler.
 * <p>
 * When {@link PDFParserConfig#isDetectAngles()} returns true the
 * angle-detecting variant of this class performs the extraction;
 * otherwise a plain {@link PDF2XHTML} is used.
 *
 * @param document PDF document
 * @param handler  SAX content handler
 * @param context  parse context handed to the extractor
 * @param metadata PDF metadata
 * @param config   PDF parser configuration; it selects the extractor
 *                 variant and is applied to it before extraction starts
 * @throws SAXException  if the content handler fails to process SAX events
 * @throws TikaException if there was an exception outside of per page
 *                       processing, or if exceptions were recorded during
 *                       extraction (the first recorded one is the cause)
 */
public static void process(PDDocument document, ContentHandler handler, ParseContext context,
                           Metadata metadata, PDFParserConfig config)
        throws SAXException, TikaException {
    // Extract text using a dummy Writer as we override the
    // key methods to output to the given content handler.
    Writer discardingWriter = new Writer() {
        @Override
        public void write(char[] cbuf, int off, int len) {
        }

        @Override
        public void flush() {
        }

        @Override
        public void close() {
        }
    };

    PDF2XHTML extractor;
    try {
        extractor = config.isDetectAngles()
                ? new AngleDetectingPDF2XHTML(document, handler, context, metadata, config)
                : new PDF2XHTML(document, handler, context, metadata, config);
        config.configure(extractor);
        extractor.writeText(document, discardingWriter);
    } catch (IOException ioFailure) {
        // SAX failures are wrapped in IOExceptions by the write* methods; unwrap them here
        Throwable cause = ioFailure.getCause();
        if (cause instanceof SAXException) {
            throw (SAXException) cause;
        }
        throw new TikaException("Unable to extract PDF content", ioFailure);
    }

    if (!extractor.exceptions.isEmpty()) {
        //throw the first
        throw new TikaException("Unable to extract PDF content", extractor.exceptions.get(0));
    }
}
/**
 * Processes a single page via the superclass. An {@link IOException}
 * raised there is passed to {@code handleCatchableIOE}; if that call
 * returns normally, the page is ended explicitly.
 *
 * @param page page to process
 * @throws IOException if {@code handleCatchableIOE} or {@code endPage} throws it
 */
@Override
public void processPage(PDPage page) throws IOException {
    try {
        super.processPage(page);
    } catch (IOException pageFailure) {
        handleCatchableIOE(pageFailure);
        endPage(page);
    }
}
/**
 * Finishes a page: closes the current paragraph, extracts the page's
 * images and then delegates to the superclass.
 * <p>
 * An {@link IOException} from image extraction is passed to
 * {@code handleCatchableIOE} on its own, so that the superclass
 * {@code endPage} still runs when that handler returns normally.
 *
 * @param page page that is ending
 * @throws IOException if a {@link SAXException} occurs (it becomes the
 *                     cause) or if {@code handleCatchableIOE} throws
 */
@Override
protected void endPage(PDPage page) throws IOException {
    try {
        writeParagraphEnd();
        try {
            extractImages(page);
        } catch (IOException imageFailure) {
            handleCatchableIOE(imageFailure);
        }
        super.endPage(page);
    } catch (IOException pageFailure) {
        handleCatchableIOE(pageFailure);
    } catch (SAXException saxFailure) {
        throw new IOException("Unable to end a page", saxFailure);
    }
}
/**
 * Runs an {@code ImageGraphicsEngine} over the page, unless the
 * configuration asks for neither inline images nor inline image
 * metadata.
 * <p>
 * If the engine reports exceptions, the first one is thrown. The
 * remaining ones are added to {@code exceptions} only when
 * {@link PDFParserConfig#isCatchIntermediateIOExceptions()} is true.
 *
 * @param page page whose images should be extracted
 * @throws SAXException declared for the engine run
 * @throws IOException  the first exception reported by the engine, or one
 *                      thrown while running it
 */
void extractImages(PDPage page) throws SAXException, IOException {
    boolean imagesRequested = config.isExtractInlineImages()
            || config.isExtractInlineImageMetadataOnly();
    if (!imagesRequested) {
        return;
    }

    ImageGraphicsEngine engine = new ImageGraphicsEngine(
            page, embeddedDocumentExtractor, config,
            processedInlineImages, inlineImageCounter,
            xhtml, metadata, context);
    engine.run();

    List<IOException> failures = engine.getExceptions();
    if (failures.isEmpty()) {
        return;
    }

    // The first failure is thrown; only the rest are candidates for recording.
    IOException firstFailure = failures.remove(0);
    if (config.isCatchIntermediateIOExceptions()) {
        exceptions.addAll(failures);
    }
    throw firstFailure;
}
/**
 * Runs the superclass paragraph-start logic and then opens a
 * {@code p} element on the XHTML handler.
 *
 * @throws IOException if the handler raises a {@link SAXException}
 *                     (attached as the cause) or the superclass fails
 */
@Override
protected void writeParagraphStart() throws IOException {
    super.writeParagraphStart();
    try {
        xhtml.startElement("p");
    } catch (SAXException saxFailure) {
        throw new IOException("Unable to start a paragraph", saxFailure);
    }
}
/**
 * Runs the superclass paragraph-end logic and then closes the
 * {@code p} element on the XHTML handler.
 *
 * @throws IOException if the handler raises a {@link SAXException}
 *                     (attached as the cause) or the superclass fails
 */
@Override
protected void writeParagraphEnd() throws IOException {
    super.writeParagraphEnd();
    try {
        xhtml.endElement("p");
    } catch (SAXException saxFailure) {
        throw new IOException("Unable to end a paragraph", saxFailure);
    }
}
/**
 * Sends the given text to the XHTML handler as character data.
 *
 * @param text text to emit
 * @throws IOException if the handler raises a {@link SAXException};
 *                     the message includes the text
 */
@Override
protected void writeString(String text) throws IOException {
    try {
        xhtml.characters(text);
    } catch (SAXException saxFailure) {
        String message = "Unable to write a string: " + text;
        throw new IOException(message, saxFailure);
    }
}
/**
 * Sends the Unicode string of a single text position to the XHTML
 * handler as character data.
 *
 * @param text text position whose Unicode string is emitted
 * @throws IOException if the handler raises a {@link SAXException};
 *                     the message includes the Unicode string
 */
@Override
protected void writeCharacters(TextPosition text) throws IOException {
    String unicode = text.getUnicode();
    try {
        xhtml.characters(unicode);
    } catch (SAXException saxFailure) {
        throw new IOException("Unable to write a character: " + unicode, saxFailure);
    }
}
/**
 * Sends the configured word separator to the XHTML handler as
 * character data.
 *
 * @throws IOException if the handler raises a {@link SAXException}
 */
@Override
protected void writeWordSeparator() throws IOException {
    String separator = getWordSeparator();
    try {
        xhtml.characters(separator);
    } catch (SAXException saxFailure) {
        throw new IOException("Unable to write a space character", saxFailure);
    }
}
/**
 * Asks the XHTML handler to emit a newline.
 *
 * @throws IOException if the handler raises a {@link SAXException}
 */
@Override
protected void writeLineSeparator() throws IOException {
    try {
        xhtml.newline();
    } catch (SAXException saxFailure) {
        throw new IOException("Unable to write a newline character", saxFailure);
    }
}
/**
 * Variant of {@link PDF2XHTML} that extracts the text of a page one
 * rotation angle at a time: it first collects the angles used on the
 * page and then processes the page once per angle, each time with a
 * rotation prepended to the page content so that only the text that
 * ends up at angle 0 is kept.
 */
private static class AngleDetectingPDF2XHTML extends PDF2XHTML {

    private AngleDetectingPDF2XHTML(PDDocument document, ContentHandler handler,
                                    ParseContext context, Metadata metadata,
                                    PDFParserConfig config) throws IOException {
        super(document, handler, context, metadata, config);
    }

    /**
     * Deliberately does nothing; {@link #processPage(PDPage)} starts the
     * page exactly once through {@code super.startPage}.
     */
    @Override
    protected void startPage(PDPage page) throws IOException {
        //no-op
    }

    /**
     * Deliberately does nothing; {@link #processPage(PDPage)} ends the
     * page exactly once through {@code super.endPage}.
     */
    @Override
    protected void endPage(PDPage page) throws IOException {
        //no-op
    }

    /**
     * Starts the page, processes it once per detected angle and always
     * ends the page, even when processing fails.
     *
     * @param page page to process
     * @throws IOException if {@code handleCatchableIOE} or ending the page throws
     */
    @Override
    public void processPage(PDPage page) throws IOException {
        try {
            super.startPage(page);
            detectAnglesAndProcessPage(page);
        } catch (IOException e) {
            handleCatchableIOE(e);
        } finally {
            super.endPage(page);
        }
    }

    /**
     * Collects the text angles of the current page and processes the page
     * once for each of them. The page rotation is set to 0 for the
     * duration of the call and restored afterwards.
     *
     * @param page page to process
     * @throws IOException if angle collection fails, if the rotation
     *                     stream cannot be written, or if
     *                     {@code handleCatchableIOE} rethrows
     */
    private void detectAnglesAndProcessPage(PDPage page) throws IOException {
        //copied and pasted from https://issues.apache.org/jira/secure/attachment/12947452/ExtractAngledText.java
        //PDFBOX-4371
        AngleCollector angleCollector = new AngleCollector(); // alternatively, reset angles
        angleCollector.setStartPage(getCurrentPageNo());
        angleCollector.setEndPage(getCurrentPageNo());
        angleCollector.getText(document);

        int rotation = page.getRotation();
        page.setRotation(0);
        try {
            for (Integer angle : angleCollector.getAngles()) {
                if (angle == 0) {
                    try {
                        super.processPage(page);
                    } catch (IOException e) {
                        handleCatchableIOE(e);
                    }
                } else {
                    // prepend a transformation
                    try (PDPageContentStream cs = new PDPageContentStream(document, page,
                            PDPageContentStream.AppendMode.PREPEND, false)) {
                        cs.transform(Matrix.getRotateInstance(-Math.toRadians(angle), 0, 0));
                    }

                    try {
                        super.processPage(page);
                    } catch (IOException e) {
                        handleCatchableIOE(e);
                    } finally {
                        // remove transformation; done in finally so the page content is
                        // not left modified when handleCatchableIOE rethrows or a
                        // runtime exception escapes
                        COSArray contents =
                                (COSArray) page.getCOSObject().getItem(COSName.CONTENTS);
                        contents.remove(0);
                    }
                }
            }
        } finally {
            // always put the original rotation back, also on failure
            page.setRotation(rotation);
        }
    }

    /**
     * Forwards only text whose angle, after the currently prepended
     * rotation, rounds to 0 degrees; all other text is dropped for this
     * pass.
     *
     * @param text text position to inspect
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        // Work on a copy: concatenate() changes the matrix it is called on, and the
        // text position is handed to the superclass afterwards.
        Matrix m = text.getTextMatrix().clone();
        m.concatenate(text.getFont().getFontMatrix());
        int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
        if (angle == 0) {
            super.processTextPosition(text);
        }
    }
}
/**
 * Text stripper that does not forward text to the superclass; it only
 * records the angle, in whole degrees normalized to the range 0-359, of
 * every text position it is given.
 */
static class AngleCollector extends PDFTextStripper {

    /** Distinct angles seen so far, in degrees. */
    Set<Integer> angles = new HashSet<>();

    /**
     * Instantiate a new PDFTextStripper object.
     *
     * @throws IOException If there is an error loading the properties.
     */
    AngleCollector() throws IOException {
    }

    /**
     * @return the set of angles collected so far (the live set, not a copy)
     */
    public Set<Integer> getAngles() {
        return angles;
    }

    /**
     * Computes the angle of the text position from its text matrix
     * combined with the font matrix, and records it.
     *
     * @param text text position to inspect
     */
    @Override
    protected void processTextPosition(TextPosition text) {
        Matrix combined = text.getTextMatrix();
        combined.concatenate(text.getFont().getFontMatrix());

        double radians = Math.atan2(combined.getShearY(), combined.getScaleY());
        int degrees = (int) Math.round(Math.toDegrees(radians));
        // shift negative angles into the 0-359 range
        int normalized = (degrees + 360) % 360;
        angles.add(normalized);
    }
}
}