| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.fork; |
| |
| import static org.junit.Assert.assertEquals; |
| import static org.junit.Assert.assertNotNull; |
| import static org.junit.Assert.fail; |
| |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.NotSerializableException; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.HashSet; |
| import java.util.Set; |
| |
| import org.junit.Ignore; |
| import org.junit.Test; |
| import org.xml.sax.ContentHandler; |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.MultiThreadedTikaTest; |
| import org.apache.tika.Tika; |
| import org.apache.tika.detect.Detector; |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.fork.ForkParser; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.mime.MediaType; |
| import org.apache.tika.parser.EmptyParser; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.parser.Parser; |
| import org.apache.tika.parser.RecursiveParserWrapper; |
| import org.apache.tika.sax.BodyContentHandler; |
| |
| /** |
| * Test that the ForkParser correctly behaves when |
| * wired in to the regular Parsers and their test data |
| */ |
| public class ForkParserIntegrationTest extends MultiThreadedTikaTest { |
| |
| private Tika tika = new Tika(); // TODO Use TikaConfig instead, when it works |
| |
| /** |
| * Simple text parsing |
| */ |
| @Test |
| public void testForkedTextParsing() throws Exception { |
| try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), |
| tika.getParser())) { |
| ContentHandler output = new BodyContentHandler(); |
| InputStream stream = getResourceAsStream("/test-documents/testTXT.txt"); |
| ParseContext context = new ParseContext(); |
| parser.parse(stream, output, new Metadata(), context); |
| |
| String content = output.toString(); |
| assertContains("Test d'indexation", content); |
| assertContains("http://www.apache.org", content); |
| } |
| } |
| |
| /** |
| * TIKA-831 Parsers throwing errors should be caught and |
| * properly reported |
| */ |
| @Test |
| public void testParsingErrorInForkedParserShouldBeReported() throws Exception { |
| BrokenParser brokenParser = new BrokenParser(); |
| ForkParser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser); |
| InputStream stream = getResourceAsStream("/test-documents/testTXT.txt"); |
| |
| // With a serializable error, we'll get that back |
| try { |
| ContentHandler output = new BodyContentHandler(); |
| ParseContext context = new ParseContext(); |
| parser.parse(stream, output, new Metadata(), context); |
| fail("Expected TikaException caused by Error"); |
| } catch (TikaException e) { |
| assertEquals(brokenParser.err, e.getCause()); |
| } finally { |
| parser.close(); |
| stream.close(); |
| } |
| |
| // With a non serializable one, we'll get something else |
| // TODO Fix this test |
| brokenParser = new BrokenParser(); |
| brokenParser.re = new WontBeSerializedError("Can't Serialize"); |
| parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser); |
| // try { |
| // ContentHandler output = new BodyContentHandler(); |
| // ParseContext context = new ParseContext(); |
| // parser.parse(stream, output, new Metadata(), context); |
| // fail("Expected TikaException caused by Error"); |
| // } catch (TikaException e) { |
| // assertEquals(TikaException.class, e.getCause().getClass()); |
| // assertEquals("Bang!", e.getCause().getMessage()); |
| // } |
| } |
| |
| /** |
| * If we supply a non serializable object on the ParseContext, |
| * check we get a helpful exception back |
| */ |
| @Test |
| public void testParserHandlingOfNonSerializable() throws Exception { |
| ForkParser parser = |
| new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser()); |
| |
| ParseContext context = new ParseContext(); |
| context.set(Detector.class, new Detector() { |
| public MediaType detect(InputStream input, Metadata metadata) { |
| return MediaType.OCTET_STREAM; |
| } |
| }); |
| |
| try { |
| ContentHandler output = new BodyContentHandler(); |
| InputStream stream = getResourceAsStream("/test-documents/testTXT.txt"); |
| parser.parse(stream, output, new Metadata(), context); |
| fail("Should have blown up with a non serializable ParseContext"); |
| } catch (TikaException e) { |
| // Check the right details |
| assertNotNull(e.getCause()); |
| assertEquals(NotSerializableException.class, e.getCause().getClass()); |
| assertEquals("Unable to serialize ParseContext to pass to the Forked Parser",e.getMessage()); |
| } finally { |
| parser.close(); |
| } |
| } |
| |
| /** |
| * TIKA-832 |
| */ |
| @Test |
| public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception { |
| ParseContext context = new ParseContext(); |
| context.set(Parser.class, tika.getParser()); |
| |
| ForkParser parser = |
| new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser()); |
| parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug", |
| "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n")); |
| try { |
| ContentHandler body = new BodyContentHandler(); |
| InputStream stream = getResourceAsStream("/test-documents/testTXT.txt"); |
| parser.parse(stream, body, new Metadata(), context); |
| String content = body.toString(); |
| assertContains("Test d'indexation", content); |
| assertContains("http://www.apache.org", content); |
| } finally { |
| parser.close(); |
| } |
| } |
| |
| /** |
| * TIKA-808 - Ensure that parsing of our test PDFs work under |
| * the Fork Parser, to ensure that complex parsing behaves |
| */ |
| @Test |
| public void testForkedPDFParsing() throws Exception { |
| try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), |
| tika.getParser())) { |
| ContentHandler output = new BodyContentHandler(); |
| InputStream stream = getResourceAsStream("/test-documents/testPDF.pdf"); |
| ParseContext context = new ParseContext(); |
| context.set(Parser.class, new EmptyParser()); |
| parser.parse(stream, output, new Metadata(), context); |
| |
| String content = output.toString(); |
| assertContains("Apache Tika", content); |
| assertContains("Tika - Content Analysis Toolkit", content); |
| assertContains("incubator", content); |
| assertContains("Apache Software Foundation", content); |
| } |
| } |
| |
| @Test |
| public void testForkedPackageParsing() throws Exception { |
| try (ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), |
| tika.getParser())) { |
| ContentHandler output = new BodyContentHandler(); |
| InputStream stream = getResourceAsStream("/test-documents/moby.zip"); |
| ParseContext context = new ParseContext(); |
| parser.parse(stream, output, new Metadata(), context); |
| assertContains("Moby Dick", output.toString()); |
| } |
| } |
| |
| @Test |
| @Ignore("use for development/one off testing. This is a beast and takes enormous " + |
| "resources and time") |
| public void smokeTest() throws Exception { |
| RecursiveParserWrapper wrapper = new RecursiveParserWrapper(tika.getParser()); |
| int numThreads = 5; |
| ForkParser parser = |
| new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), wrapper); |
| parser.setServerPulseMillis(500); |
| parser.setServerParseTimeoutMillis(1000); |
| parser.setPoolSize(numThreads); |
| ParseContext[] parseContexts = new ParseContext[numThreads]; |
| for (int i = 0; i < numThreads; i++) { |
| parseContexts[i] = new ParseContext(); |
| } |
| try { |
| super.testMultiThreaded(parser, parseContexts, numThreads, 5, pathname -> { |
| return pathname.getAbsolutePath().contains("mock"); |
| /* |
| if (pathname.getName().contains("11_hang.rar") || |
| pathname.getName().contains("radar_profiles_2009.mat") || |
| pathname.getAbsolutePath().contains("mock")) { |
| //return false; |
| } |
| return true;*/ |
| }); |
| } catch (Throwable t) { |
| t.printStackTrace(); |
| } |
| } |
| |
| /** |
| * This error has a message and an equals() implementation as to be able |
| * to match it against the serialized version of itself. |
| */ |
| static class AnError extends Error { |
| private static final long serialVersionUID = -6197267350768803348L; |
| private String message; |
| |
| AnError(String message) { |
| super(message); |
| this.message = message; |
| } |
| |
| @Override |
| public boolean equals(Object o) { |
| if (this == o) { |
| return true; |
| } |
| if (o == null || getClass() != o.getClass()) { |
| return false; |
| } |
| |
| AnError anError = (AnError) o; |
| |
| if (!message.equals(anError.message)) { |
| return false; |
| } |
| |
| return true; |
| } |
| |
| @Override |
| public int hashCode() { |
| return message.hashCode(); |
| } |
| } |
| |
| /** |
| * This error isn't serializable on the server, so can't be sent back |
| * to the Fork Client once it has occured |
| */ |
| static class WontBeSerializedError extends RuntimeException { |
| private static final long serialVersionUID = 1L; |
| |
| WontBeSerializedError(String message) { |
| super(message); |
| } |
| |
| private void writeObject(java.io.ObjectOutputStream out) { |
| RuntimeException e = new RuntimeException("Bang!"); |
| boolean found = false; |
| for (StackTraceElement ste : e.getStackTrace()) { |
| if (ste.getClassName().equals(ForkParser.class.getName())) { |
| found = true; |
| break; |
| } |
| } |
| if (!found) { |
| throw e; |
| } |
| } |
| } |
| |
| static class BrokenParser implements Parser { |
| private static final long serialVersionUID = 995871497930817839L; |
| public Error err = new AnError("Simulated fail"); |
| public RuntimeException re = null; |
| |
| public Set<MediaType> getSupportedTypes(ParseContext context) { |
| return new HashSet<>(Collections.singletonList(MediaType.TEXT_PLAIN)); |
| } |
| |
| public void parse(InputStream stream, ContentHandler handler, Metadata metadata, |
| ParseContext context) throws IOException, SAXException, TikaException { |
| if (re != null) { |
| throw re; |
| } |
| throw err; |
| } |
| } |
| |
| } |