blob: 165569e1a42d9a7be0dde2e5802fe8bba9624345 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ocr;
import java.io.Serializable;
import java.lang.reflect.Field;
import java.lang.reflect.Modifier;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.tika.exception.TikaException;
import org.apache.tika.utils.StringUtils;
/**
* Configuration for TesseractOCRParser.
* This class is not thread safe and must be synchronized externally.
* <p>
* This class will remember all set* field forever,
* and on {@link #cloneAndUpdate(TesseractOCRConfig)},
* it will update all the fields that have been set on the "update" config.
* So, for example, if you want to change language to "fra"
* from "eng" and then on another parse,
* you want to change depth to 5 on the same update object,
* but you expect the language to revert to "eng", you'll be wrong.
* Create a new update config for each parse unless you're only changing the
* same field(s) with every parse.
*/
public class TesseractOCRConfig implements Serializable {
private static final long serialVersionUID = -4861942486845757891L;
private static final Logger LOG = LoggerFactory.getLogger(TesseractOCRConfig.class);
private static Pattern ALLOWABLE_PAGE_SEPARATORS_PATTERN =
Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$");
private static Pattern ALLOWABLE_OTHER_PARAMS_PATTERN =
Pattern.compile("(?i)^[-_/\\.A-Z0-9]+$");
// Language dictionary to be used.
private String language = "eng";
// Tesseract page segmentation mode.
private String pageSegMode = "1";
// Minimum file size to submit file to ocr.
private long minFileSizeToOcr = 0;
// Maximum file size to submit file to ocr.
private long maxFileSizeToOcr = Integer.MAX_VALUE;
// Maximum time (seconds) to wait for the ocring process termination
private int timeoutSeconds = 120;
// The format of the ocr'ed output to be returned, txt or hocr.
private OUTPUT_TYPE outputType = OUTPUT_TYPE.TXT;
// enable image preprocessing with ImageMagick (optional)
private boolean enableImagePreprocessing = false;
// resolution of processed image (in dpi).
private int density = 300;
// number of bits in a color sample within a pixel.
private int depth = 4;
// colorspace of processed image.
private String colorspace = "gray";
// filter to be applied to the processed image.
private String filter = "triangle";
// factor by which image is to be scaled.
// TODO: we should make this dynamic depending on the size of the image
// The current testRotation.png takes minutes to expand 900%
private int resize = 200;
// See setPageSeparator.
private String pageSeparator = "";
// whether or not to preserve interword spacing
private boolean preserveInterwordSpacing = false;
// whether or not to apply rotation calculated by the rotation.py script
private boolean applyRotation = false;
// runtime switch to turn off OCR
private boolean skipOcr = false;
// See addOtherTesseractConfig.
private Map<String, String> otherTesseractConfig = new HashMap<>();
private Set<String> userConfigured = new HashSet<>();
/**
* This takes a language string, parses it and then bins individual langs into
* valid or invalid based on regexes against the language codes
*
* @param language
* @param validLangs
* @param invalidLangs
*/
public static void getLangs(String language, Set<String> validLangs, Set<String> invalidLangs) {
if (StringUtils.isBlank(language)) {
return;
}
// Get rid of embedded spaces
language = language.replaceAll("\\s", "");
// Test for leading or trailing +
if (language.matches("\\+.*|.*\\+")) {
throw new IllegalArgumentException(
"Invalid syntax - Can't start or end with +" + language);
}
// Split on the + sign
final String[] langs = language.split("\\+");
for (String lang : langs) {
// First, make sure it conforms to the correct syntax
if (!lang.matches(
"([a-zA-Z]{3}(_[a-zA-Z]{3,4}){0,2})|script(/|\\\\)[A-Z][a-zA-Z_]+")) {
invalidLangs.add(lang + " (invalid syntax)");
} else {
validLangs.add(lang);
}
}
}
/**
* @see #setLanguage(String language)
*/
public String getLanguage() {
return language;
}
/**
* Set tesseract language dictionary to be used. Default is "eng".
* languages are either:
* <ol>
* <li>Nominally an ISO-639-2 code but compound codes are allowed separated by underscore:
* e.g., chi_tra_vert, aze_cyrl</li>
* <li>A file path in the script directory. The name starts with upper-case letter.
* Some of them have underscores and other upper-case letters: e.g., script/Arabic,
* script/HanS_vert, script/Japanese_vert, script/Canadian_Aboriginal</li>
* </ol>
* Multiple languages may be specified, separated by plus characters.
* e.g. "chi_tra+chi_sim+script/Arabic"
*/
public void setLanguage(String languageString) {
Set<String> invalidCodes = new HashSet<>();
Set<String> validCodes = new HashSet<>();
getLangs(languageString, validCodes, invalidCodes);
if (!invalidCodes.isEmpty()) {
throw new IllegalArgumentException("Invalid language code(s): " + invalidCodes);
}
this.language = languageString;
userConfigured.add("language");
}
/**
* @see #setPageSegMode(String pageSegMode)
*/
public String getPageSegMode() {
return pageSegMode;
}
/**
* Set tesseract page segmentation mode.
* Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
*/
public void setPageSegMode(String pageSegMode) {
if (!pageSegMode.matches("[0-9]|10|11|12|13")) {
throw new IllegalArgumentException("Invalid page segmentation mode");
}
this.pageSegMode = pageSegMode;
userConfigured.add("pageSegMode");
}
/**
* @see #setPageSeparator(String pageSeparator)
*/
public String getPageSeparator() {
return pageSeparator;
}
/**
* The page separator to use in plain text output. This corresponds to Tesseract's
* page_separator config option.
* The default here is the empty string (i.e. no page separators). Note that this is also
* the default in
* Tesseract 3.x, but in Tesseract 4.0 the default is to use the form feed control character.
* We are overriding
* Tesseract 4.0's default here.
*
* @param pageSeparator
*/
public void setPageSeparator(String pageSeparator) {
Matcher m = ALLOWABLE_PAGE_SEPARATORS_PATTERN.matcher(pageSeparator);
if (!m.find()) {
throw new IllegalArgumentException(pageSeparator + " contains illegal characters.\n" +
"If you trust this value, set it with setTrustedPageSeparator");
}
setTrustedPageSeparator(pageSeparator);
userConfigured.add("pageSeparator");
}
/**
* Same as {@link #setPageSeparator(String)} but does not perform
* any checks on the string.
*
* @param pageSeparator
*/
public void setTrustedPageSeparator(String pageSeparator) {
this.pageSeparator = pageSeparator;
}
/**
* @return whether or not to maintain interword spacing.
*/
public boolean isPreserveInterwordSpacing() {
return preserveInterwordSpacing;
}
/**
* Whether or not to maintain interword spacing. Default is <code>false</code>.
*
* @param preserveInterwordSpacing
*/
public void setPreserveInterwordSpacing(boolean preserveInterwordSpacing) {
this.preserveInterwordSpacing = preserveInterwordSpacing;
userConfigured.add("preserveInterwordSpacing");
}
/**
* @see #setMinFileSizeToOcr(long minFileSizeToOcr)
*/
public long getMinFileSizeToOcr() {
return minFileSizeToOcr;
}
/**
* Set minimum file size to submit file to ocr.
* Default is 0.
*/
public void setMinFileSizeToOcr(long minFileSizeToOcr) {
this.minFileSizeToOcr = minFileSizeToOcr;
userConfigured.add("minFileSizeToOcr");
}
/**
* @see #setMaxFileSizeToOcr(long maxFileSizeToOcr)
*/
public long getMaxFileSizeToOcr() {
return maxFileSizeToOcr;
}
/**
* Set maximum file size to submit file to ocr.
* Default is Integer.MAX_VALUE.
*/
public void setMaxFileSizeToOcr(long maxFileSizeToOcr) {
this.maxFileSizeToOcr = maxFileSizeToOcr;
userConfigured.add("maxFileSizeToOcr");
}
/**
* @return timeout value for Tesseract
* @see #setTimeoutSeconds(int timeout)
*/
public int getTimeoutSeconds() {
return timeoutSeconds;
}
/**
* Set maximum time (seconds) to wait for the ocring process to terminate.
* Default value is 120s.
*/
public void setTimeoutSeconds(int timeoutSeconds) {
this.timeoutSeconds = timeoutSeconds;
userConfigured.add("timeoutSeconds");
}
/**
* @see #setOutputType(OUTPUT_TYPE outputType)
*/
public OUTPUT_TYPE getOutputType() {
return outputType;
}
/**
* Set output type from ocr process. Default is "txt", but can be "hocr".
* Default value is {@link OUTPUT_TYPE#TXT}.
*/
public void setOutputType(OUTPUT_TYPE outputType) {
this.outputType = outputType;
userConfigured.add("outputType");
}
public void setOutputType(String outputType) {
if (outputType == null) {
throw new IllegalArgumentException("outputType must not be null");
}
String lc = outputType.toLowerCase(Locale.US);
if ("txt".equals(lc)) {
setOutputType(OUTPUT_TYPE.TXT);
} else if ("hocr".equals(lc)) {
setOutputType(OUTPUT_TYPE.HOCR);
} else {
throw new IllegalArgumentException("outputType must be either 'txt' or 'hocr'");
}
}
/**
* @return image processing is enabled or not
* @see #setEnableImagePreprocessing(boolean)
*/
public boolean isEnableImagePreprocessing() {
return enableImagePreprocessing;
}
/**
* Set the value to true if processing is to be enabled.
* Default value is false.
*/
public void setEnableImagePreprocessing(boolean enableImagePreprocessing) {
this.enableImagePreprocessing = enableImagePreprocessing;
userConfigured.add("enableImagePreprocessing");
}
/**
* @return the density
*/
public int getDensity() {
return density;
}
/**
* @param density the density to set. Valid range of values is 150-1200.
* Default value is 300.
*/
public void setDensity(int density) {
if (density < 150 || density > 1200) {
throw new IllegalArgumentException(
"Invalid density value. Valid range of values is 150-1200.");
}
this.density = density;
userConfigured.add("density");
}
/**
* @return the depth
*/
public int getDepth() {
return depth;
}
/**
* @param depth the depth to set. Valid values are 2, 4, 8, 16, 32, 64, 256, 4096.
* Default value is 4.
*/
public void setDepth(int depth) {
int[] allowedValues = {2, 4, 8, 16, 32, 64, 256, 4096};
for (int allowedValue : allowedValues) {
if (depth == allowedValue) {
this.depth = depth;
userConfigured.add("depth");
return;
}
}
throw new IllegalArgumentException(
"Invalid depth value. Valid values are 2, 4, 8, 16, 32, 64, 256, 4096.");
}
/**
* @return the colorspace
*/
public String getColorspace() {
return colorspace;
}
/**
* @param colorspace the colorspace to set
* Deafult value is gray.
*/
public void setColorspace(String colorspace) {
if (colorspace == null) {
throw new IllegalArgumentException("Colorspace value cannot be null.");
}
if (!colorspace.matches("(?i)^[-_A-Z0-9]+$")) {
throw new IllegalArgumentException(
"colorspace must match this pattern: (?i)^[-_A-Z0-9]+$");
}
this.colorspace = colorspace;
userConfigured.add("colorspace");
}
/**
* @return the filter
*/
public String getFilter() {
return filter;
}
/**
* @param filter the filter to set. Valid values are point, hermite, cubic, box, gaussian,
* catrom, triangle, quadratic and mitchell.
* Default value is triangle.
*/
public void setFilter(String filter) {
if (filter.equals(null)) {
throw new IllegalArgumentException(
"Filter value cannot be null. Valid values are point, hermite, " +
"cubic, box, gaussian, catrom, triangle, quadratic and mitchell.");
}
String[] allowedFilters =
{"Point", "Hermite", "Cubic", "Box", "Gaussian", "Catrom", "Triangle", "Quadratic",
"Mitchell"};
for (String allowedFilter : allowedFilters) {
if (filter.equalsIgnoreCase(allowedFilter)) {
this.filter = filter;
userConfigured.add("filter");
return;
}
}
throw new IllegalArgumentException(
"Invalid filter value. Valid values are point, hermite, " +
"cubic, box, gaussian, catrom, triangle, quadratic and mitchell.");
}
public boolean isSkipOcr() {
return skipOcr;
}
/**
* If you want to turn off OCR at run time for a specific file,
* set this to <code>true</code>
*
* @param skipOcr
*/
public void setSkipOcr(boolean skipOcr) {
this.skipOcr = skipOcr;
userConfigured.add("skipOcr");
}
/**
* @return the resize
*/
public int getResize() {
return resize;
}
/**
* @param resize the resize to set. Valid range of values is 100-900.
* Default value is 900.
*/
public void setResize(int resize) {
for (int i = 1; i < 10; i++) {
if (resize == i * 100) {
this.resize = resize;
userConfigured.add("resize");
return;
}
}
throw new IllegalArgumentException(
"Invalid resize value. Valid range of values is 100-900.");
}
/**
* @return Whether or not a rotation value should be calculated and passed to ImageMagick
* before performing OCR.
*/
public boolean isApplyRotation() {
return this.applyRotation;
}
/**
* Sets whether or not a rotation value should be calculated and passed to ImageMagick.
*
* @param applyRotation to calculate and apply rotation, false to skip. Default is false
*/
public void setApplyRotation(boolean applyRotation) {
this.applyRotation = applyRotation;
userConfigured.add("applyRotation");
}
/**
* @see #addOtherTesseractConfig(String, String)
*/
public Map<String, String> getOtherTesseractConfig() {
return otherTesseractConfig;
}
/**
* Add a key-value pair to pass to Tesseract using its -c command line option.
* To see the possible options, run tesseract --print-parameters.
* <p>
* You may also add these parameters in TesseractOCRConfig.properties; any
* key-value pair in the properties file where the key contains an underscore
* is passed directly to Tesseract.
*
* @param key
* @param value
*/
public void addOtherTesseractConfig(String key, String value) {
if (key == null) {
throw new IllegalArgumentException("key must not be null");
}
if (value == null) {
throw new IllegalArgumentException("value must not be null");
}
Matcher m = ALLOWABLE_OTHER_PARAMS_PATTERN.matcher(key);
if (!m.find()) {
throw new IllegalArgumentException("Key contains illegal characters: " + key);
}
m.reset(value);
if (!m.find()) {
throw new IllegalArgumentException("Value contains illegal characters: " + value);
}
otherTesseractConfig.put(key.trim(), value.trim());
userConfigured.add("otherTesseractConfig");
}
public TesseractOCRConfig cloneAndUpdate(TesseractOCRConfig updates) throws TikaException {
TesseractOCRConfig updated = new TesseractOCRConfig();
for (Field field : this.getClass().getDeclaredFields()) {
if (Modifier.isFinal(field.getModifiers())) {
continue;
} else if (Modifier.isStatic(field.getModifiers())) {
continue;
}
if ("userConfigured".equals(field.getName())) {
continue;
}
if ("otherTesseractConfig".equals(field.getName()) &&
updates.userConfigured.contains(field.getName())) {
//deep copy
for (Map.Entry<String, String> e : updates.getOtherTesseractConfig().entrySet()) {
updated.addOtherTesseractConfig(e.getKey(), e.getValue());
}
continue;
}
if (updates.userConfigured.contains(field.getName())) {
try {
field.set(updated, field.get(updates));
} catch (IllegalAccessException e) {
throw new TikaException("can't update " + field.getName(), e);
}
} else {
try {
field.set(updated, field.get(this));
} catch (IllegalAccessException e) {
throw new TikaException("can't update " + field.getName(), e);
}
}
}
return updated;
}
public enum OUTPUT_TYPE {
TXT, HOCR
}
}