blob: f6d37de227a68b85cb9f9cf4aef925b9ccb1dca5 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* <p>
* http://www.apache.org/licenses/LICENSE-2.0
* <p>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.server.core;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import javax.ws.rs.core.Response;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.FileUtils;
import org.apache.cxf.jaxrs.JAXRSServerFactoryBean;
import org.apache.cxf.jaxrs.client.WebClient;
import org.apache.cxf.jaxrs.lifecycle.ResourceProvider;
import org.apache.cxf.jaxrs.lifecycle.SingletonResourceProvider;
import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.serialization.JsonFetchEmitTuple;
import org.apache.tika.metadata.serialization.JsonMetadataList;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.HandlerConfig;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.pipes.fetcher.FetcherManager;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.server.core.resource.PipesResource;
import org.apache.tika.server.core.writer.JSONObjWriter;
import org.apache.tika.utils.ProcessUtils;
/**
 * This offers basic integration tests with fetchers and emitters.
 * We use file system fetchers and emitters.
 * <p>
 * Before the class runs, a temporary directory is created containing an
 * {@code input} directory (populated with mock documents copied from the
 * classpath), an {@code output} directory, and a generated tika-config file
 * that declares a file system fetcher named {@code fsf} and a file system
 * emitter named {@code fse}. Each test serializes a {@link FetchEmitTuple}
 * to JSON, POSTs it to the {@code /pipes} endpoint and then inspects the
 * JSON response and/or the file written to the output directory.
 */
public class TikaPipesTest extends CXFTestBase {

    private static final String PIPES_PATH = "/pipes";
    private static final String MOCK_RESOURCE_DIR = "/test-documents/mock/";
    private static final String HELLO_WORLD = "hello_world.xml";
    private static final String HELLO_WORLD_JSON = "hello_world.xml.json";
    private static final String NULL_POINTER = "null_pointer.xml";
    private static final String NPE_JSON = "null_pointer.xml.json";
    private static final String[] VALUE_ARRAY =
            new String[]{"my-value-1", "my-value-2", "my-value-3"};

    // The following are assigned once in setUpBeforeClass().
    private static Path TMP_DIR;
    private static Path TMP_OUTPUT_DIR;
    private static Path TMP_OUTPUT_FILE;
    private static Path TMP_NPE_OUTPUT_FILE;
    private static Path TIKA_CONFIG_PATH;
    private static String TIKA_CONFIG_XML;

    // NOTE(review): this field is never assigned anywhere in this class, so
    // getInputStreamFactory() hands null to FetcherStreamFactory. Confirm
    // whether it should be loaded from TIKA_CONFIG_PATH or can be dropped.
    private static FetcherManager FETCHER_MANAGER;

    /**
     * Creates the temporary input/output directories, copies the mock
     * documents into the input directory and writes the tika-config file
     * used by the fetcher, the emitter and the pipes resource.
     *
     * @throws Exception if the temporary files cannot be created or written
     */
    @BeforeClass
    public static void setUpBeforeClass() throws Exception {
        TMP_DIR = Files.createTempDirectory("tika-pipes-test-");
        Path inputDir = TMP_DIR.resolve("input");
        TMP_OUTPUT_DIR = TMP_DIR.resolve("output");
        TMP_OUTPUT_FILE = TMP_OUTPUT_DIR.resolve(HELLO_WORLD_JSON);
        TMP_NPE_OUTPUT_FILE = TMP_OUTPUT_DIR.resolve(NPE_JSON);

        Files.createDirectories(inputDir);
        Files.createDirectories(TMP_OUTPUT_DIR);

        for (String mockFile : new String[]{HELLO_WORLD, NULL_POINTER}) {
            // Files.copy(InputStream, Path) does not close the stream, so close it here.
            try (InputStream is = TikaPipesTest.class
                    .getResourceAsStream(MOCK_RESOURCE_DIR + mockFile)) {
                // Fail with a readable message instead of an NPE inside Files.copy.
                assertNotNull("missing test resource: " + MOCK_RESOURCE_DIR + mockFile, is);
                Files.copy(is, inputDir.resolve(mockFile));
            }
        }

        TIKA_CONFIG_PATH = Files.createTempFile(TMP_DIR, "tika-pipes-", ".xml");

        // The config references its own path in <tikaConfig>, so the file has
        // to exist (and its path be known) before the XML can be assembled.
        TIKA_CONFIG_XML =
                "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<properties>" +
                "<fetchers>" +
                "<fetcher class=\"org.apache.tika.pipes.fetcher.fs.FileSystemFetcher\">" +
                "<params>" + "<name>fsf</name>" +
                "<basePath>" + inputDir.toAbsolutePath() + "</basePath>" +
                "</params>" + "</fetcher>" + "</fetchers>" +
                "<emitters>" +
                "<emitter class=\"org.apache.tika.pipes.emitter.fs.FileSystemEmitter\">" +
                "<params>" + "<name>fse</name>" +
                "<basePath>" + TMP_OUTPUT_DIR.toAbsolutePath() + "</basePath>" +
                "</params>" + "</emitter>" + "</emitters>" +
                "<pipes><params><tikaConfig>" +
                ProcessUtils.escapeCommandLine(TIKA_CONFIG_PATH.toAbsolutePath().toString()) +
                "</tikaConfig><numClients>10</numClients><forkedJvmArgs><arg>-Xmx256m" +
                "</arg></forkedJvmArgs>" +
                "</params></pipes>" + "</properties>";

        Files.write(TIKA_CONFIG_PATH, TIKA_CONFIG_XML.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * Removes the temporary directory tree created by {@link #setUpBeforeClass()}.
     *
     * @throws Exception if the directory cannot be deleted
     */
    @AfterClass
    public static void tearDownAfterClass() throws Exception {
        // TMP_DIR stays null if setUpBeforeClass() failed on its first statement.
        if (TMP_DIR != null) {
            FileUtils.deleteDirectory(TMP_DIR.toFile());
        }
    }

    /**
     * Deletes any output left behind by a previous test so that each test
     * only observes files emitted by its own request.
     *
     * @throws Exception if an existing output file cannot be deleted
     */
    @Before
    public void setUpEachTest() throws Exception {
        Files.deleteIfExists(TMP_OUTPUT_FILE);
        Files.deleteIfExists(TMP_NPE_OUTPUT_FILE);
        assertFalse(Files.isRegularFile(TMP_OUTPUT_FILE));
        assertFalse(Files.isRegularFile(TMP_NPE_OUTPUT_FILE));
    }

    /**
     * Registers a single {@link PipesResource} built from the generated config file.
     *
     * @param sf the server factory to configure
     */
    @Override
    protected void setUpResources(JAXRSServerFactoryBean sf) {
        List<ResourceProvider> rCoreProviders = new ArrayList<>();
        try {
            rCoreProviders.add(
                    new SingletonResourceProvider(new PipesResource(TIKA_CONFIG_PATH)));
        } catch (IOException | TikaConfigException e) {
            throw new RuntimeException(e);
        }
        sf.setResourceProviders(rCoreProviders);
    }

    /**
     * Registers the parse-exception mapper and the JSON object writer.
     *
     * @param sf the server factory to configure
     */
    @Override
    protected void setUpProviders(JAXRSServerFactoryBean sf) {
        List<Object> providers = new ArrayList<>();
        providers.add(new TikaServerParseExceptionMapper(true));
        providers.add(new JSONObjWriter());
        sf.setProviders(providers);
    }

    /**
     * @return a stream over the UTF-8 bytes of the generated tika-config XML
     */
    @Override
    protected InputStream getTikaConfigInputStream() {
        return new ByteArrayInputStream(TIKA_CONFIG_XML.getBytes(StandardCharsets.UTF_8));
    }

    /**
     * @param tikaConfig unused by this implementation
     * @return a {@link FetcherStreamFactory} wrapping {@code FETCHER_MANAGER}
     */
    @Override
    protected InputStreamFactory getInputStreamFactory(TikaConfig tikaConfig) {
        return new FetcherStreamFactory(FETCHER_MANAGER);
    }

    /**
     * Posts the hello-world document with user metadata and checks the
     * emitted metadata, including the single- and multi-valued user keys.
     */
    @Test
    public void testPost() throws Exception {
        FetchEmitTuple t =
                new FetchEmitTuple("myId",
                        new FetchKey("fsf", HELLO_WORLD), new EmitKey("fse", ""),
                        newUserMetadata());

        Response response = postTuple(t);
        assertEquals(200, response.getStatus());

        List<Metadata> metadataList = readMetadataList(TMP_OUTPUT_FILE);
        assertEquals(1, metadataList.size());
        Metadata metadata = metadataList.get(0);
        assertEquals("hello world", metadata.get(TikaCoreProperties.TIKA_CONTENT).trim());
        assertEquals("Nikolai Lobachevsky", metadata.get("author"));
        assertEquals("你好,世界", metadata.get("title"));
        assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("my-value", metadata.get("my-key"));
        assertArrayEquals(VALUE_ARRAY, metadata.getValues("my-key-multi"));
    }

    /**
     * Posts the hello-world document requesting the XML handler type and
     * checks that the emitted content contains XHTML markup.
     */
    @Test
    public void testPostXML() throws Exception {
        FetchEmitTuple t =
                new FetchEmitTuple("myId",
                        new FetchKey("fsf", HELLO_WORLD),
                        new EmitKey("fse", ""),
                        newUserMetadata(),
                        new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1, -1),
                        FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT);

        Response response = postTuple(t);
        assertEquals(200, response.getStatus());

        List<Metadata> metadataList = readMetadataList(TMP_OUTPUT_FILE);
        assertEquals(1, metadataList.size());
        Metadata metadata = metadataList.get(0);
        assertContains("<p>hello world</p>",
                metadata.get(TikaCoreProperties.TIKA_CONTENT).trim());
    }

    /**
     * Posts the null-pointer mock document and checks that the response
     * reports the parse exception, that the result was still emitted, and
     * that the emitted metadata carries the user metadata and the exception.
     */
    @Test
    public void testPostNPE() throws Exception {
        FetchEmitTuple t =
                new FetchEmitTuple("myId",
                        new FetchKey("fsf", NULL_POINTER),
                        new EmitKey("fse", ""),
                        newUserMetadata());

        Response response = postTuple(t);
        assertEquals(200, response.getStatus());

        JsonNode jsonResponse = readJsonEntity(response);
        assertContains("NullPointerException", requireParseException(jsonResponse));
        assertEquals(true, jsonResponse.get("emitted").asBoolean());

        List<Metadata> metadataList = readMetadataList(TMP_NPE_OUTPUT_FILE);
        assertEquals(1, metadataList.size());
        Metadata metadata = metadataList.get(0);
        assertEquals("application/mock+xml", metadata.get(Metadata.CONTENT_TYPE));
        assertEquals("my-value", metadata.get("my-key"));
        assertArrayEquals(VALUE_ARRAY, metadata.getValues("my-key-multi"));
        assertContains("NullPointerException",
                metadata.get(TikaCoreProperties.CONTAINER_EXCEPTION));
    }

    /**
     * Posts the null-pointer mock document with the SKIP parse-exception
     * policy and checks that the exception is reported but nothing is emitted.
     */
    @Test
    public void testPostNPENoEmit() throws Exception {
        FetchEmitTuple t =
                new FetchEmitTuple("myId",
                        new FetchKey("fsf", NULL_POINTER),
                        new EmitKey("fse", ""),
                        FetchEmitTuple.ON_PARSE_EXCEPTION.SKIP);

        Response response = postTuple(t);
        assertEquals(200, response.getStatus());

        JsonNode jsonResponse = readJsonEntity(response);
        assertContains("NullPointerException", requireParseException(jsonResponse));
        assertEquals(false, jsonResponse.get("emitted").asBoolean());
        assertFalse(Files.isRegularFile(TMP_NPE_OUTPUT_FILE));
    }

    /** Builds the user metadata sent with a request: one single-valued and one multi-valued key. */
    private static Metadata newUserMetadata() {
        Metadata userMetadata = new Metadata();
        userMetadata.set("my-key", "my-value");
        for (String s : VALUE_ARRAY) {
            userMetadata.add("my-key-multi", s);
        }
        return userMetadata;
    }

    /** Serializes the tuple to JSON and POSTs it to the pipes endpoint, accepting JSON. */
    private Response postTuple(FetchEmitTuple tuple) throws Exception {
        StringWriter writer = new StringWriter();
        JsonFetchEmitTuple.toJson(tuple, writer);
        String url = endPoint + PIPES_PATH;
        return WebClient.create(url).accept("application/json").post(writer.toString());
    }

    /** Reads the metadata list from an emitted JSON file. */
    private static List<Metadata> readMetadataList(Path jsonFile) throws Exception {
        try (Reader reader = Files.newBufferedReader(jsonFile)) {
            return JsonMetadataList.fromJson(reader);
        }
    }

    /** Parses the response entity as a UTF-8 JSON tree. */
    private static JsonNode readJsonEntity(Response response) throws Exception {
        try (Reader reader = new InputStreamReader((InputStream) response.getEntity(),
                StandardCharsets.UTF_8)) {
            return new ObjectMapper().readTree(reader);
        }
    }

    /**
     * Returns the text of the {@code parse_exception} field, failing the test
     * with an assertion (rather than an NPE) when the field is missing.
     */
    private static String requireParseException(JsonNode jsonResponse) {
        JsonNode parseException = jsonResponse.get("parse_exception");
        assertNotNull("response is missing the parse_exception field", parseException);
        return parseException.asText();
    }
}