// blob: f36106bcea2d5720fa627ef3e1cdf70bad5893a1 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.ocr;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assume.assumeTrue;
import java.util.List;
import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.external.ExternalParser;
import org.apache.tika.parser.image.ImageParser;
import org.apache.tika.parser.pdf.PDFParserConfig;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.junit.Assert;
import org.junit.Assume;
import org.junit.Test;
/**
 * Tests for {@link TesseractOCRParser}.
 *
 * <p>Tests that depend on OCR output are skipped through a JUnit assumption
 * when {@link #canRun()} returns {@code false}; the remaining tests run
 * unconditionally.
 */
public class TesseractOCRParserTest extends TikaTest {

    /**
     * Reports whether a freshly constructed {@link TesseractOCRParser} says
     * that Tesseract is available.
     *
     * @return the value of {@code hasTesseract()} on a new parser instance
     * @throws TikaConfigException if the parser cannot be constructed
     */
    public static boolean canRun() throws TikaConfigException {
        TesseractOCRParser p = new TesseractOCRParser();
        return p.hasTesseract();
    }

    /*
    Check that if Tesseract is told to skip OCR,
    the TesseractOCRParser claims to not support
    any file types. So, the standard image parser is called instead.
    */
    @Test
    public void offersNoTypesIfNotFound() throws Exception {
        TesseractOCRParser parser = new TesseractOCRParser();
        DefaultParser defaultParser = new DefaultParser();
        MediaType png = MediaType.image("png");

        // With OCR explicitly skipped in the config, the parser will offer no types
        TesseractOCRConfig skipOcrConfig = new TesseractOCRConfig();
        skipOcrConfig.setSkipOcr(true);
        ParseContext parseContext = new ParseContext();
        parseContext.set(TesseractOCRConfig.class, skipOcrConfig);

        // No types offered
        assertEquals(0, parser.getSupportedTypes(parseContext).size());
        // And DefaultParser won't use us
        assertEquals(ImageParser.class,
                defaultParser.getParsers(parseContext).get(png).getClass());
    }

    /** Runs the basic OCR check on a PDF; two metadata objects are expected. */
    @Test
    public void testPDFOCR() throws Exception {
        String resource = "testOCR.pdf";
        String[] nonOCRContains = new String[0];
        testBasicOCR(resource, nonOCRContains, 2);
    }

    /**
     * Runs the basic OCR check on a DOCX file; three metadata objects are
     * expected and the document's regular text must be present as well.
     */
    @Test
    public void testDOCXOCR() throws Exception {
        String resource = "testOCR.docx";
        String[] nonOCRContains = {
            "This is some text.",
            "Here is an embedded image:"
        };
        testBasicOCR(resource, nonOCRContains, 3);
    }

    /**
     * Runs the basic OCR check on a PPTX file; three metadata objects are
     * expected and the slide's regular text must be present as well.
     */
    @Test
    public void testPPTXOCR() throws Exception {
        String resource = "testOCR.pptx";
        String[] nonOCRContains = {
            "This is some text"
        };
        testBasicOCR(resource, nonOCRContains, 3);
    }

    /**
     * Checks that requesting the HOCR output type with an XML handler yields
     * hOCR word spans in the extracted content.
     */
    @Test
    public void testOCROutputsHOCR() throws Exception {
        assumeTrue("can run OCR", canRun());
        String resource = "testOCR.pdf";
        String[] nonOCRContains = new String[0];
        String contents = runOCR(resource, nonOCRContains, 2,
                BasicContentHandlerFactory.HANDLER_TYPE.XML,
                TesseractOCRConfig.OUTPUT_TYPE.HOCR);
        assertContains("<span class=\"ocrx_word\" id=\"word_1_1\"", contents);
        assertContains("Happy</span>", contents);
    }

    @Test
    public void testParserContentTypeOverride() throws Exception {
        assumeTrue("can run OCR", canRun());
        //this tests that the content-type is not overwritten by the ocr parser override content type
        List<Metadata> metadata = getRecursiveMetadata("testOCR.pdf", AUTO_DETECT_PARSER,
                BasicContentHandlerFactory.HANDLER_TYPE.XML);
        assertContains("<meta name=\"Content-Type\" content=\"application/pdf\" />",
                metadata.get(0).get(TikaCoreProperties.TIKA_CONTENT));
    }

    /**
     * Runs OCR on {@code resource} with plain-text output and checks that the
     * expected phrase appears in the combined content. Skipped when
     * {@link #canRun()} is {@code false}.
     *
     * @param resource       name of the test resource to parse
     * @param nonOCRContains strings that must appear in the content regardless of OCR
     * @param numMetadatas   expected number of metadata objects from the recursive parse
     */
    private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas)
            throws Exception {
        // A failed assumption aborts the test, so no further canRun() check is needed below.
        assumeTrue("can run OCR", canRun());
        String contents = runOCR(resource, nonOCRContains, numMetadatas,
                BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
                TesseractOCRConfig.OUTPUT_TYPE.TXT);
        // endsWith is safe for names without a '.', unlike substring(lastIndexOf('.')).
        String expectedText = resource.endsWith(".jpg") ? "Apache" : "Happy New Year 2003!";
        assertContains(expectedText, contents);
    }

    /**
     * Parses {@code resource} recursively with the given OCR output type and
     * PDF inline-image extraction enabled, then verifies the result.
     *
     * <p>The checks performed are: the number of metadata objects equals
     * {@code numMetadatas}; every string in {@code nonOCRContains} occurs in
     * the concatenated content; the first two metadata objects each carry more
     * than ten names; and the second one reports {@code deflate} compression.
     *
     * @param resource       name of the test resource to parse
     * @param nonOCRContains strings that must appear in the concatenated content
     * @param numMetadatas   expected number of metadata objects
     * @param handlerType    content handler type used for the recursive parse
     * @param outputType     output type set on the {@link TesseractOCRConfig}
     * @return the content of all metadata objects, concatenated in list order
     */
    private String runOCR(String resource, String[] nonOCRContains, int numMetadatas,
                          BasicContentHandlerFactory.HANDLER_TYPE handlerType,
                          TesseractOCRConfig.OUTPUT_TYPE outputType) throws Exception {
        TesseractOCRConfig config = new TesseractOCRConfig();
        config.setOutputType(outputType);

        PDFParserConfig pdfConfig = new PDFParserConfig();
        pdfConfig.setExtractInlineImages(true);

        ParseContext parseContext = new ParseContext();
        parseContext.set(TesseractOCRConfig.class, config);
        parseContext.set(PDFParserConfig.class, pdfConfig);

        List<Metadata> metadataList = getRecursiveMetadata(resource,
                AUTO_DETECT_PARSER, handlerType, parseContext);
        assertEquals(numMetadatas, metadataList.size());

        StringBuilder buffer = new StringBuilder();
        for (Metadata m : metadataList) {
            buffer.append(m.get(TikaCoreProperties.TIKA_CONTENT));
        }
        // Build the combined string once rather than once per expected string.
        String contents = buffer.toString();

        for (String needle : nonOCRContains) {
            assertContains(needle, contents);
        }
        assertTrue(metadataList.get(0).names().length > 10);
        assertTrue(metadataList.get(1).names().length > 10);
        //test at least one value
        assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
        return contents;
    }

    /**
     * Parses a single JPEG and checks the recognised text, one metadata
     * element, and that the XHTML output is well formed.
     */
    @Test
    public void testSingleImage() throws Exception {
        assumeTrue("can run OCR", canRun());
        String xml = getXML("testOCR.jpg").xml;
        assertContains("OCR Testing", xml);
        //test metadata extraction
        assertContains("<meta name=\"Image Width\" content=\"136 pixels\" />", xml);

        //TIKA-2169: each structural element must occur exactly once
        assertContainsCount("<html", xml, 1);
        assertContainsCount("<title", xml, 1);
        assertContainsCount("</title", xml, 1);
        assertContainsCount("<body", xml, 1);
        assertContainsCount("</body", xml, 1);
        assertContainsCount("</html", xml, 1);
    }

    /**
     * Checks that ordinary image metadata is extracted for several image
     * formats. This test carries no OCR assumption.
     */
    @Test
    public void getNormalMetadataToo() throws Exception {
        //this should be successful whether or not TesseractOCR is installed/active
        //If tesseract is installed, the internal metadata extraction parser should
        //work; and if tesseract isn't installed, the regular parsers should take over.

        //gif
        Metadata m = getXML("testGIF.gif").metadata;
        assertTrue(m.names().length > 20);
        assertEquals("RGB", m.get("Chroma ColorSpaceType"));

        //jpg
        m = getXML("testOCR.jpg").metadata;
        assertEquals("136", m.get(Metadata.IMAGE_WIDTH));
        assertEquals("66", m.get(Metadata.IMAGE_LENGTH));
        assertEquals("8", m.get(Metadata.BITS_PER_SAMPLE));
        assertNull(m.get(Metadata.SAMPLES_PER_PIXEL));
        assertContains("This is a test Apache Tika imag", m.get(TikaCoreProperties.COMMENTS));

        //bmp
        m = getXML("testBMP.bmp").metadata;
        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));

        //png
        m = getXML("testPNG.png").metadata;
        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
        assertEquals("UnsignedIntegral", m.get("Data SampleFormat"));

        //tiff
        m = getXML("testTIFF.tif").metadata;
        assertEquals("100", m.get(Metadata.IMAGE_WIDTH));
        assertEquals("75", m.get(Metadata.IMAGE_LENGTH));
        assertEquals("72 dots per inch", m.get("Exif IFD0:Y Resolution"));
    }

    //TODO: add unit tests for jp2/jpx/ppm TIKA-2174
}