| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.bundle; |
| |
| import static java.nio.charset.StandardCharsets.UTF_8; |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertFalse; |
| import static org.junit.Assert.assertTrue; |
| import static org.ops4j.pax.exam.CoreOptions.bundle; |
| import static org.ops4j.pax.exam.CoreOptions.junitBundles; |
| import static org.ops4j.pax.exam.CoreOptions.mavenBundle; |
| import static org.ops4j.pax.exam.CoreOptions.options; |
| import static org.ops4j.pax.exam.CoreOptions.systemPackages; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.StringWriter; |
| import java.io.Writer; |
| import java.net.URISyntaxException; |
| import java.nio.file.Paths; |
| import java.util.HashSet; |
| import java.util.Set; |
| import java.util.jar.Attributes; |
| import java.util.jar.JarInputStream; |
| import java.util.jar.Manifest; |
| |
| import javax.inject.Inject; |
| |
| import org.apache.tika.Tika; |
| import org.apache.tika.detect.DefaultDetector; |
| import org.apache.tika.detect.Detector; |
| import org.apache.tika.exception.EncryptedDocumentException; |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.fork.ForkParser; |
| import org.apache.tika.io.TikaInputStream; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.TikaCoreProperties; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.parser.CompositeParser; |
| import org.apache.tika.parser.DefaultParser; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.parser.Parser; |
| import org.apache.tika.parser.internal.Activator; |
| import org.apache.tika.parser.ocr.TesseractOCRParser; |
| import org.apache.tika.sax.BodyContentHandler; |
| import org.junit.Ignore; |
| import org.junit.Test; |
| import org.junit.runner.RunWith; |
| import org.ops4j.pax.exam.Configuration; |
| import org.ops4j.pax.exam.Option; |
| import org.ops4j.pax.exam.junit.PaxExam; |
| import org.ops4j.pax.exam.spi.reactors.ExamReactorStrategy; |
| import org.ops4j.pax.exam.spi.reactors.PerMethod; |
| import org.osgi.framework.Bundle; |
| import org.osgi.framework.BundleContext; |
| import org.osgi.framework.ServiceReference; |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| |
| @RunWith(PaxExam.class) |
| @ExamReactorStrategy(PerMethod.class) |
| public class BundleIT { |
| |
| private final File TARGET = new File("target"); |
| |
| @Inject |
| private Parser defaultParser; |
| |
| @Inject |
| private Detector contentTypeDetector; |
| |
| @Inject |
| private BundleContext bc; |
| |
| @Configuration |
| public Option[] configuration() throws IOException, URISyntaxException, ClassNotFoundException { |
| File base = new File(TARGET, "test-bundles"); |
| return options( |
| systemPackages("javax.xml.bind"), |
| bundle(new File(base, "tika-core.jar").toURI().toURL().toString()), |
| //I couldn't find a way to get the build of bundle to work via imports |
| //for this one |
| mavenBundle("commons-io", "commons-io", "2.8.0"), |
| mavenBundle("org.ops4j.pax.logging", "pax-logging-api", "1.8.5"), |
| mavenBundle("org.ops4j.pax.logging", "pax-logging-service", "1.8.5"), |
| junitBundles(), |
| bundle(new File(base, "tika-bundle-standard.jar").toURI().toURL().toString()) |
| ); |
| } |
| |
| @Test |
| public void testBundleLoaded() throws Exception { |
| boolean hasCore = false, hasBundle = false; |
| for (Bundle b : bc.getBundles()) { |
| if ("org.apache.tika.core".equals(b.getSymbolicName())) { |
| hasCore = true; |
| assertEquals("Core not activated", Bundle.ACTIVE, b.getState()); |
| } |
| if ("org.apache.tika.bundle-standard".equals(b.getSymbolicName())) { |
| hasBundle = true; |
| assertEquals("Bundle not activated", Bundle.ACTIVE, b.getState()); |
| } |
| } |
| assertTrue("Core bundle not found", hasCore); |
| assertTrue("Bundle bundle not found", hasBundle); |
| } |
| |
| @Test |
| public void testManifestNoJUnit() throws Exception { |
| File TARGET = new File("target"); |
| File base = new File(TARGET, "test-bundles"); |
| File tikaBundle = new File(base, "tika-bundle-standard.jar"); |
| |
| JarInputStream jarIs = new JarInputStream(new FileInputStream(tikaBundle)); |
| Manifest mf = jarIs.getManifest(); |
| |
| Attributes main = mf.getMainAttributes(); |
| |
| String importPackage = main.getValue("Import-Package"); |
| |
| boolean containsJunit = importPackage.contains("junit"); |
| |
| assertFalse("The bundle should not import junit", containsJunit); |
| } |
| |
| @Test |
| public void testBundleDetection() throws Exception { |
| Metadata metadataTXT = new Metadata(); |
| metadataTXT.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt"); |
| |
| Metadata metadataPDF = new Metadata(); |
| metadataPDF.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.pdf"); |
| |
| // Simple type detection |
| assertEquals(MediaType.TEXT_PLAIN, contentTypeDetector.detect(null, metadataTXT)); |
| assertEquals(MediaType.application("pdf"), contentTypeDetector.detect(null, metadataPDF)); |
| } |
| |
| @Test |
| public void testForkParser() throws Exception { |
| ForkParser parser = new ForkParser(Activator.class.getClassLoader(), defaultParser); |
| String data = "<!DOCTYPE html>\n<html><body><p>test <span>content</span></p></body></html>"; |
| InputStream stream = new ByteArrayInputStream(data.getBytes(UTF_8)); |
| Writer writer = new StringWriter(); |
| ContentHandler contentHandler = new BodyContentHandler(writer); |
| Metadata metadata = new Metadata(); |
| MediaType type = contentTypeDetector.detect(stream, metadata); |
| assertEquals(type.toString(), "text/html"); |
| metadata.add(Metadata.CONTENT_TYPE, type.toString()); |
| ParseContext parseCtx = new ParseContext(); |
| parser.parse(stream, contentHandler, metadata, parseCtx); |
| writer.flush(); |
| String content = writer.toString(); |
| assertTrue(content.length() > 0); |
| assertEquals("test content", content.trim()); |
| } |
| |
| @Test |
| public void testBundleSimpleText() throws Exception { |
| Tika tika = new Tika(); |
| |
| // Simple text extraction |
| String xml = tika.parseToString(new File("pom.xml")); |
| assertTrue(xml.contains("tika-bundle")); |
| } |
| |
| @Test |
| //@Ignore("until we can figure out why OverrideDetector is not loaded by osgi") |
| public void testBundleDetectors() throws Exception { |
| //For some reason, the detector created by OSGi has a flat |
| //list of detectors, whereas the detector created by the traditional |
| //service loading method has children: DefaultDetector, MimeTypes. |
| //We have to flatten the service loaded DefaultDetector to get equivalence. |
| //Detection behavior should all be the same. |
| |
| // Get the classes found within OSGi |
| ServiceReference<Detector> detectorRef = bc.getServiceReference(Detector.class); |
| DefaultDetector detectorService = (DefaultDetector) bc.getService(detectorRef); |
| |
| Set<String> osgiDetectors = new HashSet<>(); |
| for (Detector d : detectorService.getDetectors()) { |
| osgiDetectors.add(d.getClass().getName()); |
| } |
| |
| // Check we did get a few, just in case... |
| assertTrue("Should have several Detector names, found " + osgiDetectors.size(), |
| osgiDetectors.size() > 3); |
| |
| // Get the raw detectors list from the traditional service loading mechanism |
| DefaultDetector detector = new DefaultDetector(); |
| Set<String> rawDetectors = new HashSet<>(); |
| for (Detector d : detector.getDetectors()) { |
| if (d instanceof DefaultDetector) { |
| for (Detector dChild : ((DefaultDetector) d).getDetectors()) { |
| rawDetectors.add(dChild.getClass().getName()); |
| } |
| } else { |
| //TODO: figure out how to get this loaded correctly from tika-core |
| if (!d.getClass().getName().equals("org.apache.tika.detect.OverrideDetector")) { |
| rawDetectors.add(d.getClass().getName()); |
| } |
| } |
| } |
| assertEquals(rawDetectors, osgiDetectors); |
| } |
| |
| @Test |
| public void testBundleParsers() throws Exception { |
| // Get the classes found within OSGi |
| ServiceReference<Parser> parserRef = bc.getServiceReference(Parser.class); |
| DefaultParser parserService = (DefaultParser) bc.getService(parserRef); |
| |
| Set<String> osgiParsers = new HashSet<>(); |
| for (Parser p : parserService.getAllComponentParsers()) { |
| osgiParsers.add(p.getClass().getName()); |
| } |
| |
| // Check we did get a few, just in case... |
| assertTrue("Should have lots Parser names, found " + osgiParsers.size(), |
| osgiParsers.size() > 15); |
| |
| // Get the raw parsers list from the traditional service loading mechanism |
| CompositeParser parser = (CompositeParser) defaultParser; |
| Set<String> rawParsers = new HashSet<>(); |
| for (Parser p : parser.getAllComponentParsers()) { |
| if (p instanceof DefaultParser) { |
| for (Parser pChild : ((DefaultParser) p).getAllComponentParsers()) { |
| rawParsers.add(pChild.getClass().getName()); |
| } |
| } else { |
| rawParsers.add(p.getClass().getName()); |
| } |
| } |
| assertEquals(rawParsers, osgiParsers); |
| } |
| |
| @Test |
| public void testTesseractParser() throws Exception { |
| ContentHandler handler = new BodyContentHandler(); |
| ParseContext context = new ParseContext(); |
| Parser tesseractParser = new TesseractOCRParser(); |
| try (InputStream stream = new FileInputStream("src/test/resources/testOCR.jpg")) { |
| tesseractParser.parse(stream, handler, new Metadata(), context); |
| } |
| |
| } |
| |
| @Test |
| public void testTikaBundle() throws Exception { |
| Tika tika = new Tika(); |
| |
| // Package extraction |
| ContentHandler handler = new BodyContentHandler(); |
| |
| Parser parser = tika.getParser(); |
| ParseContext context = new ParseContext(); |
| context.set(Parser.class, parser); |
| |
| try (InputStream stream = TikaInputStream.get(Paths.get("src/test/resources/test-documents.zip"))) { |
| parser.parse(stream, handler, new Metadata(), context); |
| } |
| |
| String content = handler.toString(); |
| assertTrue(content.contains("testEXCEL.xls")); |
| assertTrue(content.contains("Sample Excel Worksheet")); |
| assertTrue(content.contains("testHTML.html")); |
| assertTrue(content.contains("Test Indexation Html")); |
| assertTrue(content.contains("testOpenOffice2.odt")); |
| assertTrue(content.contains("This is a sample Open Office document")); |
| assertTrue(content.contains("testPDF.pdf")); |
| assertTrue(content.contains("Apache Tika")); |
| assertTrue(content.contains("testPPT.ppt")); |
| assertTrue(content.contains("Sample Powerpoint Slide")); |
| assertTrue(content.contains("testRTF.rtf")); |
| assertTrue(content.contains("indexation Word")); |
| assertTrue(content.contains("testTXT.txt")); |
| assertTrue(content.contains("Test d'indexation de Txt")); |
| assertTrue(content.contains("testWORD.doc")); |
| assertTrue(content.contains("This is a sample Microsoft Word Document")); |
| assertTrue(content.contains("testXML.xml")); |
| assertTrue(content.contains("Rida Benjelloun")); |
| } |
| |
| @Test |
| public void testPoiTikaBundle() throws Exception { |
| Tika tika = new Tika(); |
| |
| // Package extraction |
| ContentHandler handler = new BodyContentHandler(); |
| |
| Parser parser = tika.getParser(); |
| ParseContext context = new ParseContext(); |
| context.set(Parser.class, parser); |
| |
| try (InputStream stream = TikaInputStream.get(Paths.get("src/test/resources/testPPT.pptx"))) { |
| parser.parse(stream, handler, new Metadata(), context); |
| } |
| |
| String content = handler.toString(); |
| assertTrue(content.contains("Attachment Test")); |
| } |
| |
| @Test |
| @Ignore |
| public void testAll() throws Exception { |
| Tika tika = new Tika(); |
| |
| // Package extraction |
| ContentHandler handler = new BodyContentHandler(); |
| |
| Parser parser = tika.getParser(); |
| ParseContext context = new ParseContext(); |
| context.set(Parser.class, parser); |
| Set<String> needToFix = new HashSet<>(); |
| //needToFix.add("testAccess2_encrypted.accdb"); |
| System.out.println(getTestDir()); |
| for (File f : getTestDir().listFiles()) { |
| if (f.isDirectory()) { |
| continue; |
| } |
| if (needToFix.contains(f.getName())) { |
| continue; |
| } |
| System.out.println("about to parse " + f); |
| Metadata metadata = new Metadata(); |
| try (InputStream is = TikaInputStream.get(f)) { |
| parser.parse(is, handler, metadata, context); |
| } catch (EncryptedDocumentException e) { |
| //swallow |
| } catch (SAXException e) { |
| // |
| } catch (TikaException e) { |
| System.err.println("tika Exception " + f.getName()); |
| e.printStackTrace(); |
| } |
| } |
| } |
| |
| private File getTestDir() { |
| return new File("../tika-parsers/src/test/resources/test-documents"); |
| } |
| |
| |
| } |