blob: a0e3f26aac713f5e42a5c3ae5c7331a0099bc26d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.bundle;
import static java.nio.charset.StandardCharsets.UTF_8;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.ops4j.pax.exam.CoreOptions.bundle;
import static org.ops4j.pax.exam.CoreOptions.junitBundles;
import static org.ops4j.pax.exam.CoreOptions.mavenBundle;
import static org.ops4j.pax.exam.CoreOptions.options;
import static org.ops4j.pax.exam.CoreOptions.systemPackages;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.io.Writer;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;
import java.util.jar.Attributes;
import java.util.jar.JarInputStream;
import java.util.jar.Manifest;
import javax.inject.Inject;
import org.apache.tika.Tika;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.fork.ForkParser;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.internal.Activator;
import org.apache.tika.parser.ocr.TesseractOCRParser;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.ops4j.pax.exam.Configuration;
import org.ops4j.pax.exam.Option;
import org.ops4j.pax.exam.junit.PaxExam;
import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy;
import org.ops4j.pax.exam.spi.reactors.PerMethod;
import org.osgi.framework.Bundle;
import org.osgi.framework.BundleContext;
import org.osgi.framework.ServiceReference;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@RunWith(PaxExam.class)
@ExamReactorStrategy(PerMethod.class)
public class BundleIT {
// Relative "target" directory; configuration() resolves the "test-bundles" folder beneath it.
private final File TARGET = new File("target");
// Injected Parser (presumably supplied by the Pax Exam container -- confirm);
// used by testForkParser and testBundleParsers.
@Inject
private Parser defaultParser;
// Injected Detector; used by testBundleDetection and testForkParser.
@Inject
private Detector contentTypeDetector;
// Injected OSGi bundle context; used to list installed bundles and to look up
// the Detector and Parser services.
@Inject
private BundleContext bc;
/**
 * Builds the Pax Exam option list for the test container: the two Tika bundles
 * read from {@code target/test-bundles}, plus commons-io, pax-logging and the
 * JUnit bundles.
 *
 * @return the options describing which bundles and system packages to provision
 */
@Configuration
public Option[] configuration() throws IOException, URISyntaxException, ClassNotFoundException {
    File bundleDir = new File(TARGET, "test-bundles");
    String coreBundleUrl = new File(bundleDir, "tika-core.jar").toURI().toURL().toString();
    String standardBundleUrl =
            new File(bundleDir, "tika-bundle-standard.jar").toURI().toURL().toString();

    return options(
            systemPackages("javax.xml.bind"),
            bundle(coreBundleUrl),
            // commons-io is provisioned explicitly: no way was found to make the
            // bundle build pick it up via imports.
            mavenBundle("commons-io", "commons-io", "2.8.0"),
            mavenBundle("org.ops4j.pax.logging", "pax-logging-api", "1.8.5"),
            mavenBundle("org.ops4j.pax.logging", "pax-logging-service", "1.8.5"),
            junitBundles(),
            bundle(standardBundleUrl)
    );
}
/**
 * Checks that both the Tika core bundle and the standard Tika bundle are
 * installed in the framework and have reached the ACTIVE state.
 */
@Test
public void testBundleLoaded() throws Exception {
    boolean coreSeen = false;
    boolean standardSeen = false;

    for (Bundle installed : bc.getBundles()) {
        // The symbolic name may be null, so keep the literal on the left-hand side.
        String symbolicName = installed.getSymbolicName();
        if ("org.apache.tika.core".equals(symbolicName)) {
            coreSeen = true;
            assertEquals("Core not activated", Bundle.ACTIVE, installed.getState());
        } else if ("org.apache.tika.bundle-standard".equals(symbolicName)) {
            standardSeen = true;
            assertEquals("Bundle not activated", Bundle.ACTIVE, installed.getState());
        }
    }

    assertTrue("Core bundle not found", coreSeen);
    assertTrue("Bundle bundle not found", standardSeen);
}
/**
 * Reads the manifest of {@code tika-bundle-standard.jar} and verifies that its
 * {@code Import-Package} header does not mention junit.
 */
@Test
public void testManifestNoJUnit() throws Exception {
    File base = new File(TARGET, "test-bundles");
    File tikaBundle = new File(base, "tika-bundle-standard.jar");

    // Both streams are declared as resources so the file handle is released
    // even if the JarInputStream constructor itself fails.
    Manifest mf;
    try (InputStream fileIs = new FileInputStream(tikaBundle);
            JarInputStream jarIs = new JarInputStream(fileIs)) {
        mf = jarIs.getManifest();
    }

    // getManifest() returns null when the jar has no manifest; fail with a
    // readable message instead of a NullPointerException.
    assertTrue("No manifest found in " + tikaBundle, mf != null);
    Attributes main = mf.getMainAttributes();
    String importPackage = main.getValue("Import-Package");
    assertTrue("Import-Package header missing from manifest of " + tikaBundle,
            importPackage != null);

    boolean containsJunit = importPackage.contains("junit");
    assertFalse("The bundle should not import junit", containsJunit);
}
/**
 * Exercises detection driven only by the resource name: no stream is passed,
 * so the injected detector has just the file-name hint in the metadata.
 */
@Test
public void testBundleDetection() throws Exception {
    Metadata txtHints = new Metadata();
    txtHints.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt");
    MediaType detectedTxt = contentTypeDetector.detect(null, txtHints);
    assertEquals(MediaType.TEXT_PLAIN, detectedTxt);

    Metadata pdfHints = new Metadata();
    pdfHints.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.pdf");
    MediaType detectedPdf = contentTypeDetector.detect(null, pdfHints);
    assertEquals(MediaType.application("pdf"), detectedPdf);
}
/**
 * Parses a small HTML document through a {@link ForkParser} that wraps the
 * injected parser and checks both the detected type and the extracted text.
 */
@Test
public void testForkParser() throws Exception {
    String data = "<!DOCTYPE html>\n<html><body><p>test <span>content</span></p></body></html>";
    ForkParser parser = new ForkParser(Activator.class.getClassLoader(), defaultParser);
    try {
        InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8));
        Writer writer = new StringWriter();
        ContentHandler contentHandler = new BodyContentHandler(writer);
        Metadata metadata = new Metadata();

        MediaType type = contentTypeDetector.detect(stream, metadata);
        // Expected value goes first so a failure message reads the right way round.
        assertEquals("text/html", type.toString());
        metadata.add(Metadata.CONTENT_TYPE, type.toString());

        ParseContext parseCtx = new ParseContext();
        parser.parse(stream, contentHandler, metadata, parseCtx);
        writer.flush();

        String content = writer.toString();
        assertTrue(content.length() > 0);
        assertEquals("test content", content.trim());
    } finally {
        // Always shut the fork parser down so its forked process(es) do not
        // outlive the test, even when an assertion above fails.
        parser.close();
    }
}
/**
 * Extracts plain text from the module's own {@code pom.xml} via the Tika
 * facade and checks that the expected artifact name shows up in it.
 */
@Test
public void testBundleSimpleText() throws Exception {
    File pom = new File("pom.xml");
    String extracted = new Tika().parseToString(pom);
    assertTrue(extracted.contains("tika-bundle"));
}
/**
 * Compares the detector class names exposed by the OSGi Detector service with
 * those found by a directly constructed {@link DefaultDetector}.
 *
 * The OSGi-created detector holds a flat list, whereas the directly constructed
 * one nests a child DefaultDetector; the latter is flattened one level here so
 * the two sets are comparable. This test was once ignored pending an
 * explanation of why OverrideDetector is not loaded by OSGi; that class is
 * skipped on the reference side instead.
 */
@Test
public void testBundleDetectors() throws Exception {
    // Class names as seen through the OSGi service registry.
    ServiceReference<Detector> reference = bc.getServiceReference(Detector.class);
    DefaultDetector osgiDetector = (DefaultDetector) bc.getService(reference);
    Set<String> osgiDetectors = new HashSet<>();
    for (Detector member : osgiDetector.getDetectors()) {
        osgiDetectors.add(member.getClass().getName());
    }

    // Sanity check that the service is not (nearly) empty.
    int osgiCount = osgiDetectors.size();
    assertTrue("Should have several Detector names, found " + osgiCount,
            osgiCount > 3);

    // Class names from a directly constructed DefaultDetector, flattened one level.
    Set<String> rawDetectors = new HashSet<>();
    for (Detector member : new DefaultDetector().getDetectors()) {
        if (!(member instanceof DefaultDetector)) {
            String className = member.getClass().getName();
            //TODO: figure out how to get this loaded correctly from tika-core
            if (!"org.apache.tika.detect.OverrideDetector".equals(className)) {
                rawDetectors.add(className);
            }
            continue;
        }
        for (Detector nested : ((DefaultDetector) member).getDetectors()) {
            rawDetectors.add(nested.getClass().getName());
        }
    }

    assertEquals(rawDetectors, osgiDetectors);
}
/**
 * Compares the parser class names exposed by the OSGi Parser service with the
 * component parsers of the injected {@code defaultParser}, flattening any
 * nested {@link DefaultParser} one level.
 */
@Test
public void testBundleParsers() throws Exception {
    // Class names as seen through the OSGi service registry.
    ServiceReference<Parser> reference = bc.getServiceReference(Parser.class);
    DefaultParser osgiParser = (DefaultParser) bc.getService(reference);
    Set<String> osgiParsers = new HashSet<>();
    for (Parser component : osgiParser.getAllComponentParsers()) {
        osgiParsers.add(component.getClass().getName());
    }

    // Sanity check that the service is not (nearly) empty.
    int osgiCount = osgiParsers.size();
    assertTrue("Should have lots Parser names, found " + osgiCount,
            osgiCount > 15);

    // Reference list: component parsers of the injected parser.
    Set<String> rawParsers = new HashSet<>();
    CompositeParser composite = (CompositeParser) defaultParser;
    for (Parser component : composite.getAllComponentParsers()) {
        if (!(component instanceof DefaultParser)) {
            rawParsers.add(component.getClass().getName());
            continue;
        }
        for (Parser nested : ((DefaultParser) component).getAllComponentParsers()) {
            rawParsers.add(nested.getClass().getName());
        }
    }

    assertEquals(rawParsers, osgiParsers);
}
/**
 * Runs the Tesseract OCR parser over a sample JPEG. There are no assertions
 * on the output; the test passes as long as parsing does not throw.
 */
@Test
public void testTesseractParser() throws Exception {
    Parser ocrParser = new TesseractOCRParser();
    ContentHandler sink = new BodyContentHandler();
    ParseContext parseContext = new ParseContext();
    Metadata metadata = new Metadata();

    try (InputStream image = new FileInputStream("src/test/resources/testOCR.jpg")) {
        ocrParser.parse(image, sink, metadata, parseContext);
    }
}
/**
 * Parses a zip of sample documents with the Tika facade's parser (registered
 * in the context so embedded entries are parsed too) and checks that each
 * entry name and a snippet of its text appear in the combined output.
 */
@Test
public void testTikaBundle() throws Exception {
    Tika tika = new Tika();
    Parser parser = tika.getParser();
    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);

    // Package extraction
    ContentHandler handler = new BodyContentHandler();
    try (InputStream stream =
            TikaInputStream.get(Paths.get("src/test/resources/test-documents.zip"))) {
        parser.parse(stream, handler, new Metadata(), context);
    }
    String content = handler.toString();

    // Entry name followed by a text fragment expected from that entry.
    String[] expectedFragments = {
        "testEXCEL.xls", "Sample Excel Worksheet",
        "testHTML.html", "Test Indexation Html",
        "testOpenOffice2.odt", "This is a sample Open Office document",
        "testPDF.pdf", "Apache Tika",
        "testPPT.ppt", "Sample Powerpoint Slide",
        "testRTF.rtf", "indexation Word",
        "testTXT.txt", "Test d'indexation de Txt",
        "testWORD.doc", "This is a sample Microsoft Word Document",
        "testXML.xml", "Rida Benjelloun",
    };
    for (String expected : expectedFragments) {
        assertTrue(content.contains(expected));
    }
}
/**
 * Parses a PPTX sample with the Tika facade's parser (registered in the
 * context so embedded content is parsed too) and checks for text expected
 * from the presentation.
 */
@Test
public void testPoiTikaBundle() throws Exception {
    Tika tika = new Tika();
    Parser parser = tika.getParser();
    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);

    // Package extraction
    ContentHandler handler = new BodyContentHandler();
    try (InputStream stream =
            TikaInputStream.get(Paths.get("src/test/resources/testPPT.pptx"))) {
        parser.parse(stream, handler, new Metadata(), context);
    }

    String extracted = handler.toString();
    assertTrue(extracted.contains("Attachment Test"));
}
/**
 * Smoke test (currently ignored): attempts to parse every regular file in the
 * tika-parsers test-documents directory, reporting Tika failures on stderr
 * without failing the test.
 */
@Test
@Ignore
public void testAll() throws Exception {
    Tika tika = new Tika();
    Parser parser = tika.getParser();
    ParseContext context = new ParseContext();
    context.set(Parser.class, parser);

    // Names of files to skip while known problems are being fixed.
    Set<String> needToFix = new HashSet<>();
    //needToFix.add("testAccess2_encrypted.accdb");

    File testDir = getTestDir();
    System.out.println(testDir);

    // listFiles() returns null when the path is missing or not a directory;
    // fail with a clear message rather than a NullPointerException.
    File[] files = testDir.listFiles();
    assertTrue("Test document directory not found or unreadable: " + testDir, files != null);

    for (File f : files) {
        if (f.isDirectory() || needToFix.contains(f.getName())) {
            continue;
        }
        System.out.println("about to parse " + f);

        // Fresh handler per file: a shared BodyContentHandler accumulates text
        // and, once its default write limit is reached, every later parse
        // would fail with a SAXException and never really be exercised.
        ContentHandler handler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        try (InputStream is = TikaInputStream.get(f)) {
            parser.parse(is, handler, metadata, context);
        } catch (EncryptedDocumentException ignored) {
            // Encrypted samples are expected in the corpus; nothing to report.
        } catch (SAXException e) {
            // Best-effort run: note the failure but keep going.
            System.err.println("SAX Exception " + f.getName() + ": " + e.getMessage());
        } catch (TikaException e) {
            System.err.println("tika Exception " + f.getName());
            e.printStackTrace();
        }
    }
}
/**
 * Returns the test-documents directory of the sibling tika-parsers module,
 * as a path relative to the working directory.
 */
private File getTestDir() {
    String relativePath = "../tika-parsers/src/test/resources/test-documents";
    return new File(relativePath);
}
}