/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.pdf;

import java.io.Serializable;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.util.HashSet;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;

import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.text.PDFTextStripper;

import org.apache.tika.exception.TikaException;
| |
| /** |
| * Config for PDFParser. |
| * <p/> |
| * This allows parameters to be set programmatically: |
| * <ol> |
| * <li>Calls to PDFParser, i.e. parser.getPDFParserConfig().setEnableAutoSpace() (as before)</li> |
| * <li>Passing to PDFParser through a ParseContext: context.set(PDFParserConfig.class, config);</li> |
| * </ol> |
| * <p/> |
| */ |
| public class PDFParserConfig implements Serializable { |
| |
| |
| private static final long serialVersionUID = 6492570218190936986L; |
| private final Set<String> userConfigured = new HashSet<>(); |
| // True if we let PDFBox "guess" where spaces should go: |
| private boolean enableAutoSpace = true; |
| |
| // True if we let PDFBox remove duplicate overlapping text: |
| private boolean suppressDuplicateOverlappingText = false; |
| |
| // True if we extract annotation text ourselves |
| // (workaround for PDFBOX-1143): |
| private boolean extractAnnotationText = true; |
| |
| // True if we should sort text tokens by position |
| // (necessary for some PDFs, but messes up other PDFs): |
| private boolean sortByPosition = false; |
| |
| //True if acroform content should be extracted |
| private boolean extractAcroFormContent = true; |
| |
| //True if bookmarks content should be extracted |
| private boolean extractBookmarksText = true; |
| |
| //True if inline PDXImage objects should be extracted |
| private boolean extractInlineImages = false; |
| |
| //True if inline images should only have their metadata |
| //extracted. |
| private boolean extractInlineImageMetadataOnly = false; |
| |
| //True if inline images (as identified by their object id within |
| //a pdf file) should only be extracted once. |
| private boolean extractUniqueInlineImagesOnly = true; |
| |
| //Should the PDFParser _try_ to extract marked content/structure tags (backoff to regular |
| //text extraction if the given PDF doesn't have marked content) |
| private boolean extractMarkedContent = false; |
| |
| //The character width-based tolerance value used to estimate where spaces in text should be |
| // added. Default taken from PDFBox. |
| private Float averageCharTolerance = 0.3f; |
| |
| //The space width-based tolerance value used to estimate where spaces in text should be added |
| //Default taken from PDFBox. |
| private Float spacingTolerance = 0.5f; |
| |
| // The multiplication factor for line height to decide when a new paragraph starts. |
| //Default taken from PDFBox. |
| private float dropThreshold = 2.5f; |
| |
| //If the PDF has an XFA element, process only that and skip extracting |
| //content from elsewhere in the document. |
| private boolean ifXFAExtractOnlyXFA = false; |
| |
| private OCR_STRATEGY ocrStrategy = OCR_STRATEGY.AUTO; |
| |
| private int ocrDPI = 300; |
| private ImageType ocrImageType = ImageType.GRAY; |
| private String ocrImageFormatName = "png"; |
| private float ocrImageQuality = 1.0f; |
| |
| private AccessChecker accessChecker = new AccessChecker(); |
| |
| //The PDFParser can throw IOExceptions if there is a problem |
| //with a streams. If this is set to true, Tika's |
| //parser catches these exceptions, reports them in the metadata |
| //and then throws the first stored exception after the parse has completed. |
| private boolean catchIntermediateIOExceptions = true; |
| |
| private boolean extractActions = false; |
| |
| private boolean extractFontNames = false; |
| |
| private long maxMainMemoryBytes = -1; |
| |
| private boolean setKCMS = false; |
| |
| private boolean detectAngles = false; |
| |
| /** |
| * @return whether or not to extract only inline image metadata and not render the images |
| */ |
| boolean isExtractInlineImageMetadataOnly() { |
| return extractInlineImageMetadataOnly; |
| } |
| |
| /** |
| * Use this when you want to know how many images of what formats are in a PDF |
| * but you don't need to render the images (e.g. for OCR). This is far |
| * faster than {@link #extractInlineImages} because it doesn't have to render the |
| * images, which can be very slow. This does not extract metadata from |
| * within each image, rather it extracts the XMP that may be stored |
| * external to an image in PDImageXObjects. |
| * |
| * @param extractInlineImageMetadataOnly |
| * @since 1.25 |
| */ |
| void setExtractInlineImageMetadataOnly(boolean extractInlineImageMetadataOnly) { |
| this.extractInlineImageMetadataOnly = extractInlineImageMetadataOnly; |
| userConfigured.add("extractInlineImageMetadataOnly"); |
| } |
| |
| public boolean isExtractMarkedContent() { |
| return extractMarkedContent; |
| } |
| |
| /** |
| * If the PDF contains marked content, try to extract text and its marked structure. |
| * If the PDF does not contain marked content, backoff to the regular PDF2XHTML for |
| * text extraction. As of 1.24, this is an "alpha" version. |
| * |
| * @param extractMarkedContent |
| * @since 1.24 |
| */ |
| public void setExtractMarkedContent(boolean extractMarkedContent) { |
| this.extractMarkedContent = extractMarkedContent; |
| userConfigured.add("extractMarkedContent"); |
| } |
| |
| /** |
| * Configures the given pdf2XHTML. |
| * |
| * @param pdf2XHTML |
| */ |
| public void configure(PDF2XHTML pdf2XHTML) { |
| pdf2XHTML.setSortByPosition(isSortByPosition()); |
| if (isEnableAutoSpace()) { |
| pdf2XHTML.setWordSeparator(" "); |
| } else { |
| pdf2XHTML.setWordSeparator(""); |
| } |
| if (getAverageCharTolerance() != null) { |
| pdf2XHTML.setAverageCharTolerance(getAverageCharTolerance()); |
| } |
| if (getSpacingTolerance() != null) { |
| pdf2XHTML.setSpacingTolerance(getSpacingTolerance()); |
| } |
| if (getDropThreshold() != null) { |
| pdf2XHTML.setDropThreshold(dropThreshold); |
| } |
| pdf2XHTML.setSuppressDuplicateOverlappingText(isSuppressDuplicateOverlappingText()); |
| } |
| |
| /** |
| * @see #setExtractAcroFormContent(boolean) |
| */ |
| public boolean isExtractAcroFormContent() { |
| return extractAcroFormContent; |
| } |
| |
| /** |
| * If true (the default), extract content from AcroForms |
| * at the end of the document. If an XFA is found, |
| * try to process that, otherwise, process the AcroForm. |
| * |
| * @param extractAcroFormContent |
| */ |
| public void setExtractAcroFormContent(boolean extractAcroFormContent) { |
| this.extractAcroFormContent = extractAcroFormContent; |
| userConfigured.add("extractAcroFormContent"); |
| } |
| |
| /** |
| * @return how to handle XFA data if it exists |
| * @see #setIfXFAExtractOnlyXFA(boolean) |
| */ |
| public boolean isIfXFAExtractOnlyXFA() { |
| return ifXFAExtractOnlyXFA; |
| } |
| |
| /** |
| * If false (the default), extract content from the full PDF |
| * as well as the XFA form. This will likely lead to some duplicative |
| * content. |
| * |
| * @param ifXFAExtractOnlyXFA |
| */ |
| public void setIfXFAExtractOnlyXFA(boolean ifXFAExtractOnlyXFA) { |
| this.ifXFAExtractOnlyXFA = ifXFAExtractOnlyXFA; |
| userConfigured.add("ifXFAExtractOnlyXFA"); |
| } |
| |
| /** |
| * @see #setExtractBookmarksText(boolean) |
| */ |
| public boolean isExtractBookmarksText() { |
| return extractBookmarksText; |
| } |
| |
| /** |
| * If true, extract bookmarks (document outline) text. |
| * <p/> |
| * Te default is <code>true</code> |
| * |
| * @param extractBookmarksText |
| */ |
| public void setExtractBookmarksText(boolean extractBookmarksText) { |
| this.extractBookmarksText = extractBookmarksText; |
| userConfigured.add("extractBookmarksText"); |
| } |
| |
| public boolean isExtractFontNames() { |
| return extractFontNames; |
| } |
| |
| /** |
| * Extract font names into a metadata field |
| * |
| * @param extractFontNames |
| */ |
| public void setExtractFontNames(boolean extractFontNames) { |
| this.extractFontNames = extractFontNames; |
| userConfigured.add("extractFontNames"); |
| } |
| |
| /** |
| * @see #setExtractInlineImages(boolean) |
| */ |
| public boolean isExtractInlineImages() { |
| return extractInlineImages; |
| } |
| |
| /** |
| * If true, extract inline embedded OBXImages. |
| * <b>Beware:</b> some PDF documents of modest size (~4MB) can contain |
| * thousands of embedded images totaling > 2.5 GB. Also, at least as of PDFBox 1.8.5, |
| * there can be surprisingly large memory consumption and/or out of memory errors. |
| * Set to <code>true</code> with caution. |
| * <p/> |
| * The default is <code>false</code>. |
| * <p/> |
| * |
| * @param extractInlineImages |
| * @see #setExtractUniqueInlineImagesOnly(boolean) |
| */ |
| public void setExtractInlineImages(boolean extractInlineImages) { |
| this.extractInlineImages = extractInlineImages; |
| userConfigured.add("extractInlineImages"); |
| } |
| |
| /** |
| * @see #setExtractUniqueInlineImagesOnly(boolean) |
| */ |
| public boolean isExtractUniqueInlineImagesOnly() { |
| return extractUniqueInlineImagesOnly; |
| } |
| |
| /** |
| * Multiple pages within a PDF file might refer to the same underlying image. |
| * If {@link #extractUniqueInlineImagesOnly} is set to <code>false</code>, the |
| * parser will call the EmbeddedExtractor each time the image appears on a page. |
| * This might be desired for some use cases. However, to avoid duplication of |
| * extracted images, set this to <code>true</code>. The default is <code>true</code>. |
| * <p/> |
| * Note that uniqueness is determined only by the underlying PDF COSObject id, not by |
| * file hash or similar equality metric. |
| * If the PDF actually contains multiple copies of the same image |
| * -- all with different object ids -- then all images will be extracted. |
| * <p/> |
| * For this parameter to have any effect, {@link #extractInlineImages} must be |
| * set to <code>true</code>. |
| * <p> |
| * Because of TIKA-1742 -- to avoid infinite recursion -- no matter the setting |
| * of this parameter, the extractor will only pull out one copy of each image per |
| * page. This parameter tries to capture uniqueness across the entire document. |
| * |
| * @param extractUniqueInlineImagesOnly |
| */ |
| public void setExtractUniqueInlineImagesOnly(boolean extractUniqueInlineImagesOnly) { |
| this.extractUniqueInlineImagesOnly = extractUniqueInlineImagesOnly; |
| userConfigured.add("extractUniqueInlineImagesOnly"); |
| } |
| |
| /** |
| * @see #setEnableAutoSpace(boolean) |
| */ |
| public boolean isEnableAutoSpace() { |
| return enableAutoSpace; |
| } |
| |
| /** |
| * If true (the default), the parser should estimate |
| * where spaces should be inserted between words. For |
| * many PDFs this is necessary as they do not include |
| * explicit whitespace characters. |
| */ |
| public void setEnableAutoSpace(boolean enableAutoSpace) { |
| this.enableAutoSpace = enableAutoSpace; |
| userConfigured.add("enableAutoSpace"); |
| } |
| |
| /** |
| * @see #setSuppressDuplicateOverlappingText(boolean) |
| */ |
| public boolean isSuppressDuplicateOverlappingText() { |
| return suppressDuplicateOverlappingText; |
| } |
| |
| /** |
| * If true, the parser should try to remove duplicated |
| * text over the same region. This is needed for some |
| * PDFs that achieve bolding by re-writing the same |
| * text in the same area. Note that this can |
| * slow down extraction substantially (PDFBOX-956) and |
| * sometimes remove characters that were not in fact |
| * duplicated (PDFBOX-1155). By default this is disabled. |
| */ |
| public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText) { |
| this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText; |
| userConfigured.add("suppressDuplicateOverlappingText"); |
| } |
| |
| /** |
| * @see #setExtractAnnotationText(boolean) |
| */ |
| public boolean isExtractAnnotationText() { |
| return extractAnnotationText; |
| } |
| |
| /** |
| * If true (the default), text in annotations will be |
| * extracted. |
| */ |
| public void setExtractAnnotationText(boolean extractAnnotationText) { |
| this.extractAnnotationText = extractAnnotationText; |
| userConfigured.add("extractAnnotationText"); |
| } |
| |
| /** |
| * @see #setSortByPosition(boolean) |
| */ |
| public boolean isSortByPosition() { |
| return sortByPosition; |
| } |
| |
| /** |
| * If true, sort text tokens by their x/y position |
| * before extracting text. This may be necessary for |
| * some PDFs (if the text tokens are not rendered "in |
| * order"), while for other PDFs it can produce the |
| * wrong result (for example if there are 2 columns, |
| * the text will be interleaved). Default is false. |
| */ |
| public void setSortByPosition(boolean sortByPosition) { |
| this.sortByPosition = sortByPosition; |
| userConfigured.add("sortByPosition"); |
| } |
| |
| /** |
| * @see #setAverageCharTolerance(Float) |
| */ |
| public Float getAverageCharTolerance() { |
| return averageCharTolerance; |
| } |
| |
| /** |
| * See {@link PDFTextStripper#setAverageCharTolerance(float)} |
| */ |
| public void setAverageCharTolerance(Float averageCharTolerance) { |
| this.averageCharTolerance = averageCharTolerance; |
| userConfigured.add("averageCharTolerance"); |
| } |
| |
| /** |
| * @see #setSpacingTolerance(Float) |
| */ |
| public Float getSpacingTolerance() { |
| return spacingTolerance; |
| } |
| |
| /** |
| * See {@link PDFTextStripper#setSpacingTolerance(float)} |
| */ |
| public void setSpacingTolerance(Float spacingTolerance) { |
| this.spacingTolerance = spacingTolerance; |
| userConfigured.add("spacingTolerance"); |
| } |
| |
| /** |
| * @see #setDropThreshold(Float) |
| */ |
| public Float getDropThreshold() { |
| return dropThreshold; |
| } |
| |
| /** |
| * See {@link PDFTextStripper#setDropThreshold(float)} |
| */ |
| public void setDropThreshold(Float dropThreshold) { |
| this.dropThreshold = dropThreshold; |
| userConfigured.add("dropThreshold"); |
| } |
| |
| public AccessChecker getAccessChecker() { |
| return accessChecker; |
| } |
| |
| public void setAccessChecker(AccessChecker accessChecker) { |
| this.accessChecker = accessChecker; |
| userConfigured.add("accessChecker"); |
| } |
| |
| /** |
| * See {@link #setCatchIntermediateIOExceptions(boolean)} |
| * |
| * @return whether or not to catch IOExceptions |
| */ |
| public boolean isCatchIntermediateIOExceptions() { |
| return catchIntermediateIOExceptions; |
| } |
| |
| /** |
| * The PDFBox parser will throw an IOException if there is |
| * a problem with a stream. If this is set to <code>true</code>, |
| * Tika's PDFParser will catch these exceptions and try to parse |
| * the rest of the document. After the parse is completed, |
| * Tika's PDFParser will throw the first caught exception. |
| * |
| * @param catchIntermediateIOExceptions |
| */ |
| public void setCatchIntermediateIOExceptions(boolean catchIntermediateIOExceptions) { |
| this.catchIntermediateIOExceptions = catchIntermediateIOExceptions; |
| userConfigured.add("catchIntermediateIOExceptions"); |
| } |
| |
| /** |
| * @return strategy to use for OCR |
| */ |
| public OCR_STRATEGY getOcrStrategy() { |
| return ocrStrategy; |
| } |
| |
| /** |
| * Which strategy to use for OCR |
| * |
| * @param ocrStrategy |
| */ |
| public void setOcrStrategy(OCR_STRATEGY ocrStrategy) { |
| this.ocrStrategy = ocrStrategy; |
| userConfigured.add("ocrStrategy"); |
| } |
| |
| /** |
| * Which strategy to use for OCR |
| * |
| * @param ocrStrategyString |
| */ |
| public void setOcrStrategy(String ocrStrategyString) { |
| setOcrStrategy(OCR_STRATEGY.parse(ocrStrategyString)); |
| } |
| |
| /** |
| * String representation of the image format used to render |
| * the page image for OCR (examples: png, tiff, jpeg) |
| * |
| * @return |
| */ |
| public String getOcrImageFormatName() { |
| return ocrImageFormatName; |
| } |
| |
| /** |
| * @param ocrImageFormatName name of image format used to render |
| * page image |
| * @see #getOcrImageFormatName() |
| */ |
| public void setOcrImageFormatName(String ocrImageFormatName) { |
| if (!ocrImageFormatName.equals("png") && !ocrImageFormatName.equals("tiff") && |
| !ocrImageFormatName.equals("jpeg")) { |
| throw new IllegalArgumentException( |
| "Available options: png, tiff, jpeg. " + "I'm sorry, but I don't recognize: " + |
| ocrImageFormatName); |
| } |
| this.ocrImageFormatName = ocrImageFormatName; |
| userConfigured.add("ocrImageFormatName"); |
| } |
| |
| /** |
| * Image type used to render the page image for OCR. |
| * |
| * @return image type |
| * @see #setOcrImageType(ImageType) |
| */ |
| public ImageType getOcrImageType() { |
| return ocrImageType; |
| } |
| |
| /** |
| * Image type used to render the page image for OCR. |
| * |
| * @param ocrImageType |
| */ |
| public void setOcrImageType(ImageType ocrImageType) { |
| this.ocrImageType = ocrImageType; |
| userConfigured.add("ocrImageType"); |
| } |
| |
| /** |
| * Image type used to render the page image for OCR. |
| * |
| * @see #setOcrImageType(ImageType) |
| */ |
| public void setOcrImageType(String ocrImageTypeString) { |
| this.ocrImageType = parseImageType(ocrImageTypeString); |
| } |
| |
| /** |
| * Dots per inch used to render the page image for OCR |
| * |
| * @return dots per inch |
| */ |
| public int getOcrDPI() { |
| return ocrDPI; |
| } |
| |
| /** |
| * Dots per inch used to render the page image for OCR. |
| * This does not apply to all image formats. |
| * |
| * @param ocrDPI |
| */ |
| public void setOcrDPI(int ocrDPI) { |
| this.ocrDPI = ocrDPI; |
| userConfigured.add("ocrDPI"); |
| } |
| |
| /** |
| * Image quality used to render the page image for OCR. |
| * This does not apply to all image formats |
| * |
| * @return |
| */ |
| public float getOcrImageQuality() { |
| return ocrImageQuality; |
| } |
| |
| /** |
| * Image quality used to render the page image for OCR. |
| * This does not apply to all image formats |
| */ |
| public void setOcrImageQuality(float ocrImageQuality) { |
| this.ocrImageQuality = ocrImageQuality; |
| userConfigured.add("ocrImageQuality"); |
| } |
| |
| /** |
| * @return whether or not to extract PDActions |
| * @see #setExtractActions(boolean) |
| */ |
| public boolean isExtractActions() { |
| return extractActions; |
| } |
| |
| /** |
| * Whether or not to extract PDActions from the file. |
| * Most Action types are handled inline; javascript macros |
| * are processed as embedded documents. |
| * |
| * @param v |
| */ |
| public void setExtractActions(boolean v) { |
| extractActions = v; |
| userConfigured.add("extractActions"); |
| } |
| |
| /** |
| * The maximum amount of memory to use when loading a pdf into a PDDocument. Additional |
| * buffering is done using a temp file. |
| * |
| * @return |
| */ |
| public long getMaxMainMemoryBytes() { |
| return maxMainMemoryBytes; |
| } |
| |
| public void setMaxMainMemoryBytes(long maxMainMemoryBytes) { |
| this.maxMainMemoryBytes = maxMainMemoryBytes; |
| userConfigured.add("maxMainMemoryBytes"); |
| } |
| |
| public boolean isSetKCMS() { |
| return setKCMS; |
| } |
| |
| /** |
| * <p> |
| * Whether to call <code>System.setProperty("sun.java2d.cmm", |
| * "sun.java2d.cmm.kcms.KcmsServiceProvider")</code>. |
| * KCMS is the unmaintained, legacy provider and is far faster than the newer replacement. |
| * However, there are stability and security risks with using the unmaintained legacy provider. |
| * </p> |
| * <p> |
| * Note, of course, that this is <b>not</b> thread safe. If the value is <code>false</code> |
| * in your first thread, and the second thread changes this to <code>true</code>, |
| * the system property in the first thread will now be <code>true</code>. |
| * </p> |
| * <p> |
| * Default is <code>false</code>. |
| * </p> |
| * |
| * @param setKCMS whether or not to set KCMS |
| */ |
| public void setSetKCMS(boolean setKCMS) { |
| this.setKCMS = setKCMS; |
| userConfigured.add("setKCMS"); |
| } |
| |
| private ImageType parseImageType(String ocrImageType) { |
| for (ImageType t : ImageType.values()) { |
| if (ocrImageType.equalsIgnoreCase(t.toString())) { |
| return t; |
| } |
| } |
| StringBuilder sb = new StringBuilder(); |
| sb.append("I regret that I could not parse '"); |
| sb.append(ocrImageType); |
| sb.append("'. I'm only familiar with: "); |
| int i = 0; |
| for (ImageType t : ImageType.values()) { |
| if (i++ == 0) { |
| sb.append(", "); |
| } |
| sb.append(t.toString()); |
| } |
| throw new IllegalArgumentException(sb.toString()); |
| } |
| |
| public boolean isDetectAngles() { |
| return detectAngles; |
| } |
| |
| public void setDetectAngles(boolean detectAngles) { |
| this.detectAngles = detectAngles; |
| userConfigured.add("detectAngles"); |
| } |
| |
| public PDFParserConfig cloneAndUpdate(PDFParserConfig updates) throws TikaException { |
| PDFParserConfig updated = new PDFParserConfig(); |
| for (Field field : this.getClass().getDeclaredFields()) { |
| if (Modifier.isFinal(field.getModifiers())) { |
| continue; |
| } else if (Modifier.isStatic(field.getModifiers())) { |
| continue; |
| } |
| if ("userConfigured".equals(field.getName())) { |
| continue; |
| } |
| if (updates.userConfigured.contains(field.getName())) { |
| try { |
| field.set(updated, field.get(updates)); |
| } catch (IllegalAccessException e) { |
| throw new TikaException("can't update " + field.getName(), e); |
| } |
| } else { |
| try { |
| field.set(updated, field.get(this)); |
| } catch (IllegalAccessException e) { |
| throw new TikaException("can't update " + field.getName(), e); |
| } |
| } |
| } |
| return updated; |
| } |
| |
| @Override |
| public boolean equals(Object o) { |
| if (this == o) { |
| return true; |
| } |
| if (!(o instanceof PDFParserConfig)) { |
| return false; |
| } |
| |
| PDFParserConfig config = (PDFParserConfig) o; |
| |
| if (isEnableAutoSpace() != config.isEnableAutoSpace()) { |
| return false; |
| } |
| if (isSuppressDuplicateOverlappingText() != config.isSuppressDuplicateOverlappingText()) { |
| return false; |
| } |
| if (isExtractAnnotationText() != config.isExtractAnnotationText()) { |
| return false; |
| } |
| if (isSortByPosition() != config.isSortByPosition()) { |
| return false; |
| } |
| if (isExtractAcroFormContent() != config.isExtractAcroFormContent()) { |
| return false; |
| } |
| if (isExtractBookmarksText() != config.isExtractBookmarksText()) { |
| return false; |
| } |
| if (isExtractInlineImages() != config.isExtractInlineImages()) { |
| return false; |
| } |
| if (isExtractUniqueInlineImagesOnly() != config.isExtractUniqueInlineImagesOnly()) { |
| return false; |
| } |
| if (isIfXFAExtractOnlyXFA() != config.isIfXFAExtractOnlyXFA()) { |
| return false; |
| } |
| if (getOcrDPI() != config.getOcrDPI()) { |
| return false; |
| } |
| if (isCatchIntermediateIOExceptions() != config.isCatchIntermediateIOExceptions()) { |
| return false; |
| } |
| if (!getAverageCharTolerance().equals(config.getAverageCharTolerance())) { |
| return false; |
| } |
| if (!getSpacingTolerance().equals(config.getSpacingTolerance())) { |
| return false; |
| } |
| if (!getDropThreshold().equals(config.getDropThreshold())) { |
| return false; |
| } |
| if (!getOcrStrategy().equals(config.getOcrStrategy())) { |
| return false; |
| } |
| if (getOcrImageType() != config.getOcrImageType()) { |
| return false; |
| } |
| if (!getOcrImageFormatName().equals(config.getOcrImageFormatName())) { |
| return false; |
| } |
| if (isExtractActions() != config.isExtractActions()) { |
| return false; |
| } |
| if (!getAccessChecker().equals(config.getAccessChecker())) { |
| return false; |
| } |
| return getMaxMainMemoryBytes() == config.getMaxMainMemoryBytes(); |
| } |
| |
| @Override |
| public int hashCode() { |
| int result = (isEnableAutoSpace() ? 1 : 0); |
| result = 31 * result + (isSuppressDuplicateOverlappingText() ? 1 : 0); |
| result = 31 * result + (isExtractAnnotationText() ? 1 : 0); |
| result = 31 * result + (isSortByPosition() ? 1 : 0); |
| result = 31 * result + (isExtractAcroFormContent() ? 1 : 0); |
| result = 31 * result + (isExtractBookmarksText() ? 1 : 0); |
| result = 31 * result + (isExtractInlineImages() ? 1 : 0); |
| result = 31 * result + (isExtractUniqueInlineImagesOnly() ? 1 : 0); |
| result = 31 * result + getAverageCharTolerance().hashCode(); |
| result = 31 * result + getSpacingTolerance().hashCode(); |
| result = 31 * result + getDropThreshold().hashCode(); |
| result = 31 * result + (isIfXFAExtractOnlyXFA() ? 1 : 0); |
| result = 31 * result + ocrStrategy.hashCode(); |
| result = 31 * result + getOcrDPI(); |
| result = 31 * result + getOcrImageType().hashCode(); |
| result = 31 * result + getOcrImageFormatName().hashCode(); |
| result = 31 * result + getAccessChecker().hashCode(); |
| result = 31 * result + (isCatchIntermediateIOExceptions() ? 1 : 0); |
| result = 31 * result + (isExtractActions() ? 1 : 0); |
| result = 31 * result + Long.valueOf(getMaxMainMemoryBytes()).hashCode(); |
| return result; |
| } |
| |
| @Override |
| public String toString() { |
| return "PDFParserConfig{" + "enableAutoSpace=" + enableAutoSpace + |
| ", suppressDuplicateOverlappingText=" + suppressDuplicateOverlappingText + |
| ", extractAnnotationText=" + extractAnnotationText + ", sortByPosition=" + |
| sortByPosition + ", extractAcroFormContent=" + extractAcroFormContent + |
| ", extractBookmarksText=" + extractBookmarksText + ", extractInlineImages=" + |
| extractInlineImages + ", extractUniqueInlineImagesOnly=" + |
| extractUniqueInlineImagesOnly + ", averageCharTolerance=" + averageCharTolerance + |
| ", spacingTolerance=" + spacingTolerance + ", dropThreshold=" + dropThreshold + |
| ", ifXFAExtractOnlyXFA=" + ifXFAExtractOnlyXFA + ", ocrStrategy=" + ocrStrategy + |
| ", ocrDPI=" + ocrDPI + ", ocrImageType=" + ocrImageType + ", ocrImageFormatName='" + |
| ocrImageFormatName + '\'' + ", accessChecker=" + accessChecker + |
| ", extractActions=" + extractActions + ", catchIntermediateIOExceptions=" + |
| catchIntermediateIOExceptions + ", maxMainMemoryBytes=" + maxMainMemoryBytes + '}'; |
| } |
| |
| public enum OCR_STRATEGY { |
| AUTO, NO_OCR, OCR_ONLY, OCR_AND_TEXT_EXTRACTION; |
| |
| private static OCR_STRATEGY parse(String s) { |
| if (s == null) { |
| return NO_OCR; |
| } else if ("no_ocr".equals(s.toLowerCase(Locale.ROOT))) { |
| return NO_OCR; |
| } else if ("ocr_only".equals(s.toLowerCase(Locale.ROOT))) { |
| return OCR_ONLY; |
| } else if (s.toLowerCase(Locale.ROOT).contains("ocr_and_text")) { |
| return OCR_AND_TEXT_EXTRACTION; |
| } else if ("auto".equals(s.toLowerCase(Locale.ROOT))) { |
| return AUTO; |
| } |
| StringBuilder sb = new StringBuilder(); |
| sb.append("I regret that I don't recognize '").append(s); |
| sb.append("' as an OCR_STRATEGY. I only recognize:"); |
| int i = 0; |
| for (OCR_STRATEGY strategy : OCR_STRATEGY.values()) { |
| if (i++ > 0) { |
| sb.append(", "); |
| } |
| sb.append(strategy.toString()); |
| |
| } |
| throw new IllegalArgumentException(sb.toString()); |
| } |
| } |
| } |