blob: 3b2713eac5c8d7f5ae82d2a38a0e9d918d67b4cd [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ocr;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import org.junit.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.CompositeParser;
/**
 * Unit tests for {@link TesseractOCRConfig}: built-in defaults, values loaded from
 * tika-config XML resources, validation performed by the setters, and merging of two
 * configurations via {@code cloneAndUpdate}.
 */
public class TesseractOCRConfigTest extends TikaTest {

    /**
     * Builds a {@link TikaConfig} from the given classpath resource and returns the default
     * {@link TesseractOCRConfig} of the first component parser, which the test configs
     * are expected to declare as a {@link TesseractOCRParser}.
     *
     * @param configResource classpath location of the tika-config XML file
     * @return the default OCR config held by the configured parser
     * @throws Exception if the resource cannot be read or the config cannot be built
     */
    private TesseractOCRConfig loadDefaultConfig(String configResource) throws Exception {
        // try-with-resources so the config stream is always released, even on failure
        try (InputStream stream = getResourceAsStream(configResource)) {
            CompositeParser composite = (CompositeParser) new TikaConfig(stream).getParser();
            TesseractOCRParser parser =
                    (TesseractOCRParser) composite.getAllComponentParsers().get(0);
            return parser.getDefaultConfig();
        }
    }

    /** A freshly constructed config must expose the documented default values. */
    @Test
    public void testNoConfig() throws Exception {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertEquals("Invalid default language value", "eng", config.getLanguage());
        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
        assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE,
                config.getMaxFileSizeToOcr());
        assertEquals("Invalid default timeout value", 120, config.getTimeoutSeconds());
        assertEquals("Invalid default density value", 300, config.getDensity());
        assertEquals("Invalid default depth value", 4, config.getDepth());
        assertEquals("Invalid default colorspace value", "gray", config.getColorspace());
        assertEquals("Invalid default filter value", "triangle", config.getFilter());
        assertEquals("Invalid default resize value", 200, config.getResize());
        assertEquals("Invalid default applyRotation value", false, config.isApplyRotation());
    }

    /**
     * With the "partial" config file, the overridden settings take the file's values while
     * pageSegMode, maxFileSizeToOcr and applyRotation keep their defaults.
     */
    @Test
    public void testPartialConfig() throws Exception {
        TesseractOCRConfig config =
                loadDefaultConfig("/test-configs/tika-config-tesseract-partial.xml");
        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
        assertEquals("Invalid overridden minFileSizeToOcr value", 1,
                config.getMinFileSizeToOcr());
        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE,
                config.getMaxFileSizeToOcr());
        assertEquals("Invalid overridden timeout value", 240, config.getTimeoutSeconds());
        assertEquals("Invalid overridden density value", 200, config.getDensity());
        assertEquals("Invalid overridden depth value", 8, config.getDepth());
        assertEquals("Invalid overridden filter value", "box", config.getFilter());
        assertEquals("Invalid overridden resize value", 300, config.getResize());
        assertEquals("Invalid default applyRotation value", false, config.isApplyRotation());
    }

    /** With the "full" config file, every checked setting takes the file's value. */
    @Test
    public void testFullConfig() throws Exception {
        TesseractOCRConfig config =
                loadDefaultConfig("/test-configs/tika-config-tesseract-full.xml");
        assertEquals("Invalid overridden language value", "ceb", config.getLanguage());
        assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
        assertEquals("Invalid overridden minFileSizeToOcr value", 1,
                config.getMinFileSizeToOcr());
        assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000,
                config.getMaxFileSizeToOcr());
        assertEquals("Invalid overridden timeout value", 240, config.getTimeoutSeconds());
        assertEquals("Invalid overridden density value", 200, config.getDensity());
        assertEquals("Invalid overridden depth value", 8, config.getDepth());
        assertEquals("Invalid overridden filter value", "box", config.getFilter());
        assertEquals("Invalid overridden resize value", 300, config.getResize());
        assertTrue("Invalid overridden applyRotation value", config.isApplyRotation());
    }

    /** Every well-formed language expression is accepted and read back unchanged. */
    @Test
    public void testValidateValidLanguage() {
        List<String> validLanguages =
                Arrays.asList("eng", "slk_frak", "chi_tra", "eng+fra", "tgk+chi_tra+slk_frak",
                        "chi_tra_vert", "tgk+chi_tra_vert+slk_frak", "eng+script/Arabic",
                        "script/HanT_vert");
        TesseractOCRConfig config = new TesseractOCRConfig();
        for (String language : validLanguages) {
            config.setLanguage(language);
            assertEquals("Valid language not set", language, config.getLanguage());
        }
    }

    /** Every malformed language expression is rejected with IllegalArgumentException. */
    @Test
    public void testValidateInvalidLanguage() {
        List<String> invalidLanguages = Arrays.asList(
                //"", allow empty string
                "+", "en", "en+", "eng+fra+", "Arabic", "/script/Arabic", "rm -rf *");
        TesseractOCRConfig config = new TesseractOCRConfig();
        for (String language : invalidLanguages) {
            try {
                config.setLanguage(language);
                fail("Invalid language set: " + language);
            } catch (IllegalArgumentException e) {
                // expected exception thrown
            }
        }
    }

    /** Page segmentation modes "0" and "10" are accepted; "14" is rejected. */
    @Test
    public void testValidatePageSegMode() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        // Valid values: an exception thrown here propagates and fails the test.
        config.setPageSegMode("0");
        config.setPageSegMode("10");
        try {
            config.setPageSegMode("14");
            fail("Invalid pageSegMode set: 14");
        } catch (IllegalArgumentException e) {
            // expected exception thrown
        }
    }

    /** Densities 300 and 400 are accepted; 1 is rejected. */
    @Test
    public void testValidateDensity() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        // Valid values: an exception thrown here propagates and fails the test.
        config.setDensity(300);
        config.setDensity(400);
        try {
            config.setDensity(1);
            fail("Invalid density set: 1");
        } catch (IllegalArgumentException e) {
            // expected exception thrown
        }
    }

    /** Depths 4 and 8 are accepted; 6 is rejected. */
    @Test
    public void testValidateDepth() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        // Valid values: an exception thrown here propagates and fails the test.
        config.setDepth(4);
        config.setDepth(8);
        try {
            config.setDepth(6);
            fail("Invalid depth set: 6");
        } catch (IllegalArgumentException e) {
            // expected exception thrown
        }
    }

    /** Filters "Triangle" and "box" are accepted; "abc" is rejected. */
    @Test
    public void testValidateFilter() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        // Valid values: an exception thrown here propagates and fails the test.
        config.setFilter("Triangle");
        config.setFilter("box");
        try {
            config.setFilter("abc");
            fail("Invalid filter set: abc");
        } catch (IllegalArgumentException e) {
            // expected exception thrown
        }
    }

    /** Resize values 200 and 400 are accepted; 1000 is rejected. */
    @Test
    public void testValidateResize() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        // Valid values: an exception thrown here propagates and fails the test.
        config.setResize(200);
        config.setResize(400);
        try {
            config.setResize(1000);
            fail("Invalid resize set: 1000");
        } catch (IllegalArgumentException e) {
            // expected exception thrown
        }
    }

    /** A tessdata path containing a NUL character is rejected. */
    @Test(expected = IllegalArgumentException.class)
    public void testDataPathCheck() {
        TesseractOCRParser parser = new TesseractOCRParser();
        parser.setTessdataPath("blah\u0000deblah");
    }

    /** A tesseract path containing a NUL character is rejected. */
    @Test(expected = IllegalArgumentException.class)
    public void testPathCheck() {
        TesseractOCRParser parser = new TesseractOCRParser();
        parser.setTesseractPath("blah\u0000deblah");
    }

    /** An "other" config key containing a space is rejected. */
    @Test(expected = IllegalArgumentException.class)
    public void testBadOtherKey() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.addOtherTesseractConfig("bad bad", "bad");
    }

    /** An "other" config value containing a space is rejected. */
    @Test(expected = IllegalArgumentException.class)
    public void testBadOtherValue() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.addOtherTesseractConfig("bad", "bad bad");
    }

    /** An "other" config value containing a backslash is rejected. */
    @Test(expected = IllegalArgumentException.class)
    public void testBadOtherValueSlash() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.addOtherTesseractConfig("bad", "bad\\bad");
    }

    /** An "other" config value containing a control character is rejected. */
    @Test(expected = IllegalArgumentException.class)
    public void testBadOtherValueControl() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.addOtherTesseractConfig("bad", "bad\u0001bad");
    }

    /** A plain alphabetic key/value pair is accepted; the test passes if nothing is thrown. */
    @Test
    public void testGoodOtherParameters() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.addOtherTesseractConfig("good", "good");
    }

    /** A language string that is not a well-formed language code is rejected. */
    @Test(expected = IllegalArgumentException.class)
    public void testBadLanguageCode() throws Exception {
        TesseractOCRConfig tesseractOCRConfig = new TesseractOCRConfig();
        tesseractOCRConfig.setLanguage("kerplekistani");
    }

    /** A colorspace name containing a non-alphanumeric character is rejected. */
    @Test(expected = IllegalArgumentException.class)
    public void testBadColorSpace() {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setColorspace("someth!ng");
    }

    /**
     * {@code cloneAndUpdate} returns a config in which values explicitly set on the
     * argument replace the corresponding values of the receiver.
     */
    @Test
    public void testUpdatingConfigs() throws Exception {
        TesseractOCRConfig configA = new TesseractOCRConfig();
        configA.setLanguage("eng");
        configA.setMinFileSizeToOcr(100);
        configA.setOutputType(TesseractOCRConfig.OUTPUT_TYPE.TXT);
        configA.addOtherTesseractConfig("k1", "a1");
        configA.addOtherTesseractConfig("k2", "a2");

        TesseractOCRConfig configB = new TesseractOCRConfig();
        configB.setLanguage("fra");
        configB.setMinFileSizeToOcr(1000);
        configB.setOutputType(TesseractOCRConfig.OUTPUT_TYPE.HOCR);
        configB.addOtherTesseractConfig("k1", "b1");
        configB.addOtherTesseractConfig("k2", "b2");

        TesseractOCRConfig clone = configA.cloneAndUpdate(configB);

        assertEquals("fra", clone.getLanguage());
        assertEquals(1000, clone.getMinFileSizeToOcr());
        assertEquals(TesseractOCRConfig.OUTPUT_TYPE.HOCR, clone.getOutputType());
        assertEquals("b1", clone.getOtherTesseractConfig().get("k1"));
        assertEquals("b2", clone.getOtherTesseractConfig().get("k2"));
    }
}