blob: a67fc17743fef4efc6a945036abd22268461cda7 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ocr;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import static org.junit.Assume.assumeTrue;
import java.io.File;
import java.io.InputStream;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.junit.Assert;
import org.junit.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.image.BPGParser;
import org.apache.tika.parser.image.HeifParser;
import org.apache.tika.parser.image.ICNSParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.image.JpegParser;
import org.apache.tika.parser.image.PSDParser;
import org.apache.tika.parser.image.TiffParser;
import org.apache.tika.parser.image.WebPParser;
public class TesseractOCRParserTest extends TikaTest {
public static boolean canRun() throws TikaConfigException {
TesseractOCRParser p = new TesseractOCRParser();
p.initialize(Collections.EMPTY_MAP);
return p.hasTesseract();
}
@Test
public void testInterwordSpacing() throws Exception {
assumeTrue("can run OCR", canRun());
//default
String xml = getXML("testOCR_spacing.png", getMetadata(MediaType.image("png"))).xml;
assertContains("The quick", xml);
TesseractOCRConfig tesseractOCRConfigconfig = new TesseractOCRConfig();
tesseractOCRConfigconfig.setPreserveInterwordSpacing(true);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, tesseractOCRConfigconfig);
//with preserve interwordspacing "on"
//allow some flexibility in case Tesseract is computing spaces
//somewhat differently in different versions/OS's, etc.
xml = getXML("testOCR_spacing.png", getMetadata(MediaType.image("png")), parseContext).xml;
Matcher m = Pattern.compile("The\\s{5,20}quick").matcher(xml);
assertTrue(m.find());
}
private Metadata getMetadata(MediaType mediaType) {
Metadata metadata = new Metadata();
MediaType ocrMediaType =
new MediaType(mediaType.getType(), "OCR-" + mediaType.getSubtype());
metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, ocrMediaType.toString());
return metadata;
}
private MediaType deOCR(MediaType mediaType) {
String subtype = mediaType.getSubtype();
if (subtype.startsWith("ocr-")) {
subtype = subtype.substring(4);
}
return new MediaType(mediaType.getType(), subtype);
}
@Test
public void confirmMultiPageTiffHandling() throws Exception {
assumeTrue("can run OCR", canRun());
//tesseract should handle multipage tiffs by itself
//let's confirm that
String xml = getXML("testTIFF_multipage.tif", getMetadata(MediaType.image("tiff"))).xml;
assertContains("Page 2", xml);
}
@Test
public void confirmRuntimeSkipOCR() throws Exception {
assumeTrue("can run OCR", canRun());
TesseractOCRConfig config = new TesseractOCRConfig();
config.setSkipOcr(true);
ParseContext context = new ParseContext();
context.set(TesseractOCRConfig.class, config);
String xml =
getXML("testTIFF_multipage.tif", getMetadata(MediaType.image("tiff")), context).xml;
assertNotContained("Page 2", xml);
}
@Test
public void testPositiveRotateOCR() throws Exception {
assumeTrue(canRun());
TesseractOCRParser p = new TesseractOCRParser();
assumeTrue(p.hasImageMagick());
TesseractOCRConfig config = new TesseractOCRConfig();
config.setApplyRotation(true);
config.setResize(100);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
Metadata metadata = getMetadata(MediaType.image("png"));
String ocr = getText("testRotated+10.png", metadata, parseContext);
assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
assertEquals(10.0, Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)),
0.01);
assertContains("Its had resolving otherwise she contented therefore", ocr);
}
@Test
public void testNegativeRotateOCR() throws Exception {
TesseractOCRParser p = new TesseractOCRParser();
assumeTrue(p.hasImageMagick());
TesseractOCRConfig config = new TesseractOCRConfig();
config.setApplyRotation(true);
config.setResize(100);
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, config);
assumeTrue(canRun());
Metadata metadata = getMetadata(MediaType.image("png"));
String ocr = getText("testRotated-10.png", metadata, parseContext);
assertEquals("true", metadata.get(TesseractOCRParser.IMAGE_MAGICK));
assertEquals(-10.0, Double.parseDouble(metadata.get(TesseractOCRParser.IMAGE_ROTATION)),
0.01);
assertContains("Its had resolving otherwise she contented therefore", ocr);
}
@Test
public void testConfig() throws Exception {
try (InputStream is = getResourceAsStream("/test-configs/TIKA-2705-tesseract.xml")) {
TikaConfig config = new TikaConfig(is);
Parser p = config.getParser();
Parser tesseractOCRParser =
findParser(p, org.apache.tika.parser.ocr.TesseractOCRParser.class);
assertNotNull(tesseractOCRParser);
TesseractOCRConfig tesseractOCRConfig =
((TesseractOCRParser) tesseractOCRParser).getDefaultConfig();
Assert.assertEquals(241, tesseractOCRConfig.getTimeoutSeconds());
Assert.assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR,
tesseractOCRConfig.getOutputType());
Assert.assertEquals("ceb", tesseractOCRConfig.getLanguage());
Assert.assertEquals(false, tesseractOCRConfig.isApplyRotation());
// assertContains("myspecial", tesseractOCRConfig.getTesseractPath());
}
}
@Test
public void testPreloadLangs() throws Exception {
assumeTrue(canRun());
TikaConfig config;
try (InputStream is = getResourceAsStream(
"/test-configs/tika-config-tesseract-load-langs.xml")) {
config = new TikaConfig(is);
}
Parser p = config.getParser();
Parser tesseractOCRParser =
findParser(p, org.apache.tika.parser.ocr.TesseractOCRParser.class);
assertNotNull(tesseractOCRParser);
Set<String> langs = ((TesseractOCRParser) tesseractOCRParser).getLangs();
assertTrue(langs.size() > 0);
TesseractOCRConfig tesseractOCRConfig = new TesseractOCRConfig();
tesseractOCRConfig.setLanguage("zzz");
ParseContext parseContext = new ParseContext();
parseContext.set(TesseractOCRConfig.class, tesseractOCRConfig);
try {
getRecursiveMetadata("testOCR_spacing.png", new AutoDetectParser(config),
getMetadata(MediaType.image("png")), parseContext, false);
fail("should have thrown exception");
} catch (TikaException e) {
//expected
}
}
@Test
public void testArbitraryParams() throws Exception {
try (InputStream is = getResourceAsStream(
"/test-configs/tika-config-tesseract-arbitrary.xml")) {
TikaConfig config = new TikaConfig(is);
Parser p = config.getParser();
Parser tesseractOCRParser =
findParser(p, org.apache.tika.parser.ocr.TesseractOCRParser.class);
assertNotNull(tesseractOCRParser);
TesseractOCRConfig tesseractOCRConfig =
((TesseractOCRParser) tesseractOCRParser).getDefaultConfig();
Assert.assertEquals("0.75",
tesseractOCRConfig.getOtherTesseractConfig().get("textord_initialx_ile"));
Assert.assertEquals("0.15625",
tesseractOCRConfig.getOtherTesseractConfig().get("textord_noise_hfract"));
}
}
//to be used to figure out a) what image media types don't have ocr coverage and
// b) what ocr media types don't have dedicated image parsers
//this obv requires that tesseract be installed
//TODO: convert to actual unit test
//@Test
public void showCoverage() throws Exception {
Set<MediaType> imageParserMimes = new HashSet<>();
for (Parser p : new Parser[]{new BPGParser(), new HeifParser(), new ICNSParser(),
new ImageParser(), new JpegParser(), new PSDParser(), new TiffParser(),
new WebPParser(),}) {
imageParserMimes.addAll(p.getSupportedTypes(new ParseContext()));
}
//mime types that Tesseract will cover if there is no existing parser
//that in turn will call tesseract .. e.g. the mime subtype doesn't start
//with ocr-
Set<MediaType> literalTesseractMimes = new HashSet<>();
//mimes whose subtimes start with ocr-
Set<MediaType> ocrTesseractMimes = new HashSet<>();
for (MediaType mt : new TesseractOCRParser().getSupportedTypes(new ParseContext())) {
if (mt.getSubtype().startsWith("ocr-")) {
ocrTesseractMimes.add(deOCR(mt));
} else {
literalTesseractMimes.add(mt);
}
}
for (MediaType mt : imageParserMimes) {
if (!ocrTesseractMimes.contains(mt)) {
System.out.println("tesseract isn't currently configured to handle: " + mt);
}
}
for (MediaType mt : literalTesseractMimes) {
System.out.println("We don't have dedicated image parsers " +
"for these formats, which are handled by tesseract: " + mt);
}
}
@Test
public void testTrailingSlashInPathBehavior() {
TesseractOCRParser parser = new TesseractOCRParser();
parser.setTesseractPath("blah");
assertEquals("blah" + File.separator, parser.getTesseractPath());
parser.setTesseractPath("blah" + File.separator);
assertEquals("blah" + File.separator, parser.getTesseractPath());
parser.setTesseractPath("");
assertEquals("", parser.getTesseractPath());
parser.setTessdataPath("blahdata");
assertEquals("blahdata" + File.separator, parser.getTessdataPath());
parser.setTessdataPath("blahdata" + File.separator);
assertEquals("blahdata" + File.separator, parser.getTessdataPath());
parser.setTessdataPath("");
assertEquals("", parser.getTessdataPath());
parser.setImageMagickPath("imagemagickpath");
assertEquals("imagemagickpath" + File.separator, parser.getImageMagickPath());
parser.setImageMagickPath("imagemagickpath" + File.separator);
assertEquals("imagemagickpath" + File.separator, parser.getImageMagickPath());
parser.setImageMagickPath("");
assertEquals("", parser.getImageMagickPath());
}
@Test
public void testBogusPathCheck() {
//allow path that doesn't actually exist
TesseractOCRParser parser = new TesseractOCRParser();
parser.setTesseractPath("blahdeblahblah");
assertEquals("blahdeblahblah" + File.separator, parser.getTesseractPath());
}
@Test
public void testThreadJoinInLoadingLangs() throws Exception {
assumeTrue(canRun());
//make sure that the stream is fully read and
//we're getting the same answers on several iterations
Set<String> langs = getLangs();
assumeTrue(langs.size() > 0);
for (int i = 0; i < 20; i++) {
assertEquals(langs, getLangs());
}
}
private Set<String> getLangs() throws Exception {
TesseractOCRParser p = new TesseractOCRParser();
p.setPreloadLangs(true);
p.initialize(Collections.EMPTY_MAP);
return p.getLangs();
}
}