TIKA-4506 -- remove tika-fuzzing module (#2358)
diff --git a/CHANGES.txt b/CHANGES.txt
index 06d7373..861b820 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -7,7 +7,7 @@
* Headers are no longer injected into the body/content of MSG files (TIKA-4345). Please open
a ticket if you need this behavior across email formats.
- * The tika-batch module has been removed (TIKA-4333).
+ * Removed the tika-batch module (TIKA-4333).
* Remove snaps deployment (TIKA-4502).
@@ -15,7 +15,9 @@
* Removed the advanced media module (TIKA-4500).
- * Remove the tika-dl module (TIKA-4499).
+ * Removed the tika-dl module (TIKA-4499).
+
+ * Removed the tika-fuzzing module (TIKA-4506).
OTHER CHANGES
diff --git a/pom.xml b/pom.xml
index 58635e4..9e59f6c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -50,7 +50,6 @@
<module>tika-server</module>
<module>tika-integration-tests</module>
<module>tika-eval</module>
- <module>tika-fuzzing</module>
<module>tika-translate</module>
<module>tika-example</module>
<module>tika-java7</module>
diff --git a/tika-bom/pom.xml b/tika-bom/pom.xml
index d57fad5..f37a5ae 100644
--- a/tika-bom/pom.xml
+++ b/tika-bom/pom.xml
@@ -78,11 +78,6 @@
<artifactId>tika-eval-core</artifactId>
<version>4.0.0-SNAPSHOT</version>
</dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-fuzzing</artifactId>
- <version>4.0.0-SNAPSHOT</version>
- </dependency>
<!-- Tika language detection modules -->
<dependency>
diff --git a/tika-fuzzing/pom.xml b/tika-fuzzing/pom.xml
deleted file mode 100644
index a02d322..0000000
--- a/tika-fuzzing/pom.xml
+++ /dev/null
@@ -1,126 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd">
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parent</artifactId>
- <version>4.0.0-SNAPSHOT</version>
- <relativePath>../tika-parent/pom.xml</relativePath>
- </parent>
-
- <artifactId>tika-fuzzing</artifactId>
- <name>Apache Tika fuzzing</name>
- <url>https://tika.apache.org/</url>
-
- <modelVersion>4.0.0</modelVersion>
-
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-pipes-core</artifactId>
- <version>${project.version}</version>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>commons-cli</groupId>
- <artifactId>commons-cli</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-pkg-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-pdf-module</artifactId>
- <version>${project.version}</version>
- </dependency>
- <!-- logging -->
- <dependency>
- <groupId>org.apache.logging.log4j</groupId>
- <artifactId>log4j-slf4j2-impl</artifactId>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>jcl-over-slf4j</artifactId>
- </dependency>
- <!-- test -->
- <dependency>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-digest-commons</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <!-- bring in the mock parser -->
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- <type>test-jar</type>
- <scope>test</scope>
- </dependency>
- </dependencies>
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-checkstyle-plugin</artifactId>
- <version>${checkstyle.plugin.version}</version>
- <dependencies>
- <dependency>
- <groupId>com.puppycrawl.tools</groupId>
- <artifactId>checkstyle</artifactId>
- <version>${puppycrawl.version}</version>
- </dependency>
- </dependencies>
- <executions>
- <execution>
- <id>validate</id>
- <phase>validate</phase>
- <configuration>
- <configLocation>checkstyle.xml</configLocation>
- <inputEncoding>UTF-8</inputEncoding>
- <consoleOutput>false</consoleOutput>
- <includeTestSourceDirectory>true</includeTestSourceDirectory>
- <testSourceDirectories>${project.basedir}/src/test/java</testSourceDirectories>
- <violationSeverity>error</violationSeverity>
- <failOnViolation>true</failOnViolation>
- </configuration>
- <goals>
- <goal>check</goal>
- </goals>
- </execution>
- </executions>
- </plugin>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-jar-plugin</artifactId>
- <configuration>
- <archive>
- <manifestEntries>
- <Automatic-Module-Name>org.apache.tika.fuzzing</Automatic-Module-Name>
- </manifestEntries>
- </archive>
- </configuration>
- </plugin>
- </plugins>
- </build>
-</project>
\ No newline at end of file
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java
deleted file mode 100644
index 05bf5e2..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/AutoDetectTransformer.java
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.fuzzing.general.GeneralTransformer;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MediaTypeRegistry;
-
-public class AutoDetectTransformer implements Transformer {
-
- private static final ServiceLoader DEFAULT_LOADER =
- new ServiceLoader(AutoDetectTransformer.class.getClassLoader());
-
- TikaConfig config = TikaConfig.getDefaultConfig();
- MediaTypeRegistry registry = config.getMediaTypeRegistry();
- Detector detector = TikaConfig.getDefaultConfig().getDetector();
-
- Transformer fallback = new GeneralTransformer();
- Map<MediaType, Transformer> transformerMap = new HashMap<>();
-
- public AutoDetectTransformer() {
- this(DEFAULT_LOADER.loadServiceProviders(org.apache.tika.fuzzing.Transformer.class));
- }
-
- public AutoDetectTransformer(List<Transformer> transformers) {
- for (Transformer t : transformers) {
- for (MediaType mediaType : t.getSupportedTypes()) {
- transformerMap.put(mediaType, t);
- }
- }
- }
-
- @Override
- public Set<MediaType> getSupportedTypes() {
- return transformerMap.keySet();
- }
-
- @Override
- public void transform(InputStream is, OutputStream os) throws IOException, TikaException {
- try (TikaInputStream tis = TikaInputStream.get(is)) {
- // Automatically detect the MIME type of the document
- Metadata metadata = new Metadata();
- MediaType type = detector.detect(tis, metadata);
- Transformer transformer = getTransformer(type);
- transformer.transform(tis, os);
- }
- }
-
- private Transformer getTransformer(MediaType type) {
- if (type == null) {
- return fallback;
- }
- // We always work on the normalised, canonical form
- type = registry.normalize(type);
-
- while (type != null) {
- // Try finding a parser for the type
- Transformer transformer = transformerMap.get(type);
- if (transformer != null) {
- return transformer;
- }
-
- // Failing that, try for the parent of the type
- type = registry.getSupertype(type);
- }
- return fallback;
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java
deleted file mode 100644
index 57a710f..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/Transformer.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaType;
-
-public interface Transformer {
-
- /**
- * Returns the set of media types supported by this parser when used
- * with the given parse context.
- *
- * @return immutable set of media types
- * @since Apache Tika 1.24.1
- */
- Set<MediaType> getSupportedTypes();
-
-
- void transform(InputStream is, OutputStream os) throws IOException, TikaException;
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java
deleted file mode 100644
index 7e55dbf..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzOne.java
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.cli;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.TimeoutException;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.DefaultParser;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.ParseException;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.fuzzing.AutoDetectTransformer;
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.fuzzing.exceptions.CantFuzzException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.ExceptionUtils;
-
-/**
- * Forked process that runs against a single input file
- */
-public class FuzzOne {
- private static final Logger LOG = LoggerFactory.getLogger(FuzzOne.class);
-
- static Options OPTIONS;
-
- static {
- //By the time this commandline is parsed, there should be both an extracts and an inputDir
- Option extracts = new Option("extracts", true, "directory for extract files");
- extracts.setRequired(true);
-
-
- OPTIONS = new Options().addOption(
- Option.builder("i").longOpt("inputFile").desc("input directory for seed files")
- .hasArg(true).required(true).get()).addOption(
- Option.builder("o").longOpt("outputFile").desc("output file base").hasArg(true)
- .required(true).get()).addOption(Option.builder("m").longOpt("timeoutMs")
- .desc("timeout in ms -- max time allowed to parse a file").hasArg(true)
- .required(true).get()).addOption(
- Option.builder("n").desc("thread id (thread number)").hasArg(true).required(true)
- .get()).addOption(Option.builder("p").longOpt("perFile")
- .desc("number of iterations to run per seed file").hasArg(true).required(true)
- .get()).addOption(Option.builder("t").longOpt("maxTransformers")
- .desc("maximum number of transformers to run per iteration").hasArg(true)
- .required(true).get()).addOption(
- Option.builder("r").longOpt("retryId").desc("which retry is this").hasArg(true)
- .required(true).get());
- }
-
- Parser parser = new AutoDetectParser();
-
- public static void main(String[] args) throws Exception {
- FuzzOneConfig config = FuzzOneConfig.parse(args);
- FuzzOne fuzzOne = new FuzzOne();
- fuzzOne.execute(config);
- }
-
- private void execute(FuzzOneConfig config) {
- Path src = config.inputFile;
- Path targetDir = config.outputFileBase;
- AutoDetectTransformer transformer = new AutoDetectTransformer();
- for (int i = 0; i < config.perFileIterations; i++) {
- try {
- String ext = "-" + config.threadNum + "-" + config.retryNum + "-" + i;
- fuzz(ext, src, targetDir, transformer, config.timeoutMs);
- } catch (IOException e) {
- LOG.warn("problem transforming file", e);
- } catch (CantFuzzException e) {
- LOG.warn("can't fuzz this file " + src, e);
- return;
- } catch (TikaException e) {
- e.printStackTrace();
- }
- }
- }
-
- private void fuzz(String ext, Path src, Path targetFileBase, Transformer transformer,
- long timeoutMs) throws IOException, TikaException {
-
- Path target =
- targetFileBase.getParent().resolve(targetFileBase.getFileName().toString() + ext);
-
- try {
- transformFile(transformer, src, target);
- } catch (Throwable t) {
- LOG.warn("failed to transform: " + src.toString());
- Files.delete(target);
- throw t;
- }
- ExecutorService executor = Executors.newFixedThreadPool(1);
- Future<Integer> future = executor.submit(new ParseTask(target));
-
- try {
- int result = future.get(timeoutMs, TimeUnit.MILLISECONDS);
- if (result == 1 && Files.exists(target)) {
- LOG.warn("failed to delete target: " + target);
- }
- } catch (TimeoutException e) {
- LOG.warn("timeout exception:" + target);
- future.cancel(true);
- writeErrFile(target, ".timeout");
- System.exit(1);
- } catch (InterruptedException | ExecutionException e) {
- LOG.warn("problem parsing " + target, e);
- System.exit(1);
- } finally {
- executor.shutdownNow();
- }
- }
-
- private void writeErrFile(Path target, String ext) {
- try {
- Path err = target.getParent().resolve(target.getFileName().toString() + ext);
- Files.write(err, new byte[0]);
- } catch (IOException e) {
- LOG.warn("things aren't going right today.", e);
- }
- }
-
- private void handleThrowable(Path target, Throwable t) {
-
- try {
- Path errMsg =
- target.getParent().resolve(target.getFileName().toString() + ".stacktrace");
- Files.write(errMsg, ExceptionUtils.getStackTrace(t).getBytes(StandardCharsets.UTF_8));
- } catch (IOException e) {
- LOG.warn("things aren't going right today.", t);
- }
-
- }
-
- private void transformFile(Transformer transformer, Path src, Path target)
- throws IOException, TikaException {
- try (InputStream is = Files.newInputStream(src);
- OutputStream os = Files.newOutputStream(target)) {
- transformer.transform(is, os);
- }
- }
-
- private static class FuzzOneConfig {
- int perFileIterations;
- int maxTransformers;
- int threadNum;
- int retryNum;
- long timeoutMs;
- private Path inputFile;
- private Path outputFileBase;
-
- static FuzzOneConfig parse(String[] args) throws ParseException {
- CommandLineParser parser = new DefaultParser();
- CommandLine commandLine = parser.parse(OPTIONS, args);
- FuzzOneConfig config = new FuzzOneConfig();
- config.inputFile = Paths.get(commandLine.getOptionValue("i"));
- config.outputFileBase = Paths.get(commandLine.getOptionValue("o"));
- config.perFileIterations = Integer.parseInt(commandLine.getOptionValue("p"));
- config.maxTransformers = Integer.parseInt(commandLine.getOptionValue("t"));
- config.threadNum = Integer.parseInt(commandLine.getOptionValue("n"));
- config.retryNum = Integer.parseInt(commandLine.getOptionValue("r"));
- config.timeoutMs = Integer.parseInt(commandLine.getOptionValue("m"));
- return config;
- }
-
- }
-
- private class ParseTask implements Callable<Integer> {
- private final Path target;
-
- public ParseTask(Path target) {
- this.target = target;
- }
-
- /**
- * @return 1 if success
- * @throws Exception
- */
- @Override
- public Integer call() throws Exception {
- boolean success = false;
- try (InputStream is = Files.newInputStream(target)) {
- LOG.debug("parsing " + target);
- parser.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
- success = true;
- } catch (TikaException e) {
- if (e.getCause() instanceof RuntimeException) {
- //handleThrowable(target, e.getCause());
- success = true;
- } else {
- success = true;
- }
- } catch (SAXException | IOException e) {
- success = true;
- } catch (Throwable t) {
- handleThrowable(target, t);
- } finally {
- if (success) {
- try {
- Files.delete(target);
- } catch (IOException e) {
- LOG.warn("couldn't delete: " + target.toAbsolutePath());
- }
- } else {
- LOG.info("FOUND PROBLEM: " + target);
- }
- }
- return success ? 1 : 0;
- }
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java
deleted file mode 100644
index baa0e14..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLI.java
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.cli;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.Locale;
-import java.util.UUID;
-import java.util.concurrent.ArrayBlockingQueue;
-import java.util.concurrent.Callable;
-import java.util.concurrent.ExecutionException;
-import java.util.concurrent.ExecutorCompletionService;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-import java.util.concurrent.atomic.AtomicInteger;
-
-import org.apache.commons.io.FileUtils;
-import org.apache.commons.io.FilenameUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.fuzzing.general.ByteDeleter;
-import org.apache.tika.fuzzing.general.ByteFlipper;
-import org.apache.tika.fuzzing.general.ByteInjector;
-import org.apache.tika.fuzzing.general.GeneralTransformer;
-import org.apache.tika.fuzzing.general.SpanSwapper;
-import org.apache.tika.fuzzing.general.Truncator;
-import org.apache.tika.pipes.core.FetchEmitTuple;
-import org.apache.tika.pipes.core.PipesConfig;
-import org.apache.tika.pipes.core.PipesParser;
-import org.apache.tika.pipes.core.PipesResult;
-import org.apache.tika.pipes.core.emitter.EmitKey;
-import org.apache.tika.pipes.core.fetcher.FetchKey;
-import org.apache.tika.pipes.core.fetcher.FetcherManager;
-import org.apache.tika.pipes.core.pipesiterator.PipesIterator;
-
-public class FuzzingCLI {
-
- private static final Logger LOG = LoggerFactory.getLogger(FuzzingCLI.class);
- private static final String TEMP_FETCHER_NAME = "temp";
- private static final String TEMP_EMITTER_NAME = "temp";
-
- public static void main(String[] args) throws Exception {
- FuzzingCLIConfig config = FuzzingCLIConfig.parse(args);
- if (config.getMaxTransformers() == 0) {
- LOG.warn("max transformers == 0!");
- }
-
- FuzzingCLI fuzzingCLI = new FuzzingCLI();
- Files.createDirectories(config.getProblemsDirectory());
- fuzzingCLI.execute(config);
- }
-
-
- private void execute(FuzzingCLIConfig config) throws Exception {
- ArrayBlockingQueue<FetchEmitTuple> q = new ArrayBlockingQueue(10000);
-
- PipesConfig pipesConfig = PipesConfig.load(config.getTikaConfig());
- FetcherManager fetcherManager = FetcherManager.load(config.getTikaConfig());
-
- int totalThreads = pipesConfig.getNumClients() + 1;
-
- ExecutorService executorService = Executors.newFixedThreadPool(totalThreads);
- ExecutorCompletionService executorCompletionService =
- new ExecutorCompletionService(executorService);
- PipesIterator pipesIterator = PipesIterator.build(config.getTikaConfig());
-
- FileAdder fileAdder = new FileAdder(pipesIterator, q);
- executorCompletionService.submit(fileAdder);
- try (PipesParser parser = new PipesParser(pipesConfig)) {
-
- for (int i = 0; i < pipesConfig.getNumClients(); i++) {
- executorCompletionService.submit(new Fuzzer(q, config, parser, fetcherManager));
- }
- int finished = 0;
- while (finished < totalThreads) {
- Future<Integer> future = null;
- try {
- future = executorCompletionService.poll(1, TimeUnit.SECONDS);
- if (future != null) {
- future.get();
- finished++;
- }
- LOG.info("Finished thread {} threads of {}", finished, totalThreads);
- } catch (InterruptedException | ExecutionException e) {
- e.printStackTrace();
- break;
- }
- }
- executorService.shutdown();
- executorService.shutdownNow();
- }
-
- }
-
- private static class Fuzzer implements Callable<Integer> {
- static AtomicInteger COUNTER = new AtomicInteger();
- static AtomicInteger FUZZED = new AtomicInteger();
- static AtomicInteger SOURCE_FILES = new AtomicInteger();
- private final int threadId = COUNTER.getAndIncrement();
- private final ArrayBlockingQueue<FetchEmitTuple> q;
- private final FuzzingCLIConfig config;
-
- private final PipesParser pipesParser;
-
- private final Transformer transformer;
-
- private final FetcherManager fetcherManager;
-
- public Fuzzer(ArrayBlockingQueue<FetchEmitTuple> q, FuzzingCLIConfig config,
- PipesParser pipesParser, FetcherManager fetcherManager) {
- this.q = q;
- this.config = config;
- this.pipesParser = pipesParser;
- //TODO - parameterize this
- this.transformer =
- new GeneralTransformer(config.getMaxTransformers(), new ByteDeleter(),
- new ByteFlipper(), new ByteInjector(), new Truncator(),
- new SpanSwapper());
- this.fetcherManager = fetcherManager;
- }
-
- @Override
- public Integer call() throws Exception {
- while (true) {
- FetchEmitTuple fetchEmitTuple = q.take();
- if (fetchEmitTuple.equals(PipesIterator.COMPLETED_SEMAPHORE)) {
- LOG.debug("Thread " + threadId + " stopping");
- q.put(PipesIterator.COMPLETED_SEMAPHORE);
- return 1;
- }
- int inputFiles = SOURCE_FILES.getAndIncrement();
- if (inputFiles % 100 == 0) {
- LOG.info("Processed {} source files", inputFiles);
- }
- for (int i = 0; i < config.perFileIterations; i++) {
- try {
- fuzzIt(fetchEmitTuple);
- } catch (InterruptedException e) {
- throw e;
- } catch (Exception e) {
- LOG.warn("serious problem with", e);
- }
- }
- }
- }
-
- private void fuzzIt(FetchEmitTuple fetchEmitTuple)
- throws IOException, InterruptedException, TikaException {
- Path cwd = Files.createTempDirectory("tika-fuzz-");
- try {
- Path fuzzedPath = fuzz(fetchEmitTuple, cwd);
- Path extract = Files.createTempFile(cwd, "tika-extract-", ".json");
- FetchEmitTuple fuzzedTuple = new FetchEmitTuple(fetchEmitTuple.getId(),
- new FetchKey(TEMP_FETCHER_NAME, fuzzedPath.toAbsolutePath().toString()),
- new EmitKey(TEMP_EMITTER_NAME, extract.toAbsolutePath().toString()));
- int count = FUZZED.getAndIncrement();
- if (count % 100 == 0) {
- LOG.info("processed {} fuzzed files", count);
- }
- boolean tryAgain = true;
- int tries = 0;
- while (tryAgain && tries < config.getRetries()) {
- tries++;
- try {
- PipesResult result = pipesParser.parse(fuzzedTuple);
- tryAgain = handleResult(result.getStatus(),
- fetchEmitTuple.getFetchKey().getFetchKey(), fuzzedPath, tries,
- config.getRetries());
- } catch (InterruptedException e) {
- throw e;
- } catch (Exception e) {
- tryAgain = handleResult(PipesResult.STATUS.UNSPECIFIED_CRASH,
- fetchEmitTuple.getFetchKey().getFetchKey(), fuzzedPath, tries,
- config.getRetries());
- }
- }
- } finally {
- try {
- FileUtils.deleteDirectory(cwd.toFile());
- } catch (IOException e) {
- e.printStackTrace();
- LOG.warn("Couldn't delete " + cwd.toAbsolutePath(), e);
- }
- }
- }
-
- private Path fuzz(FetchEmitTuple fetchEmitTuple, Path cwd)
- throws IOException, TikaException {
- Path target = Files.createTempFile(cwd, "tika-fuzz-target-",
- "." + FilenameUtils.getExtension(fetchEmitTuple.getFetchKey().getFetchKey()));
- try (InputStream is = fetcherManager.getFetcher(
- fetchEmitTuple.getFetchKey().getFetcherName())
- .fetch(fetchEmitTuple.getFetchKey().getFetchKey(), fetchEmitTuple.getMetadata(),
- fetchEmitTuple.getParseContext())) {
- try (OutputStream os = Files.newOutputStream(target)) {
- transformer.transform(is, os);
- }
- }
- return target;
- }
-
- private boolean handleResult(PipesResult.STATUS status, String origFetchKey,
- Path fuzzedPath, int tries, int maxRetries)
- throws IOException {
- switch (status) {
- case OOM:
- case TIMEOUT:
- case UNSPECIFIED_CRASH:
- if (tries < maxRetries) {
- LOG.info("trying again ({} of {}): {}", tries, maxRetries,
- status.name());
- return true;
- }
- Path problemFilePath = getProblemFile(status, origFetchKey);
- LOG.info("found a problem {} -> {} : {}", origFetchKey, problemFilePath,
- status.name());
- Files.copy(fuzzedPath, problemFilePath);
- return false;
- default:
- //if there wasn't a problem
- return false;
- }
- }
-
- private Path getProblemFile(PipesResult.STATUS status, String origFetchKey)
- throws IOException {
- String name = FilenameUtils.getName(origFetchKey) + "-" + UUID.randomUUID();
- Path problemFile =
- config.getProblemsDirectory().resolve(status.name().toLowerCase(Locale.US))
- .resolve(name);
- Files.createDirectories(problemFile.getParent());
- return problemFile;
- }
-
- }
-
- private static class FileAdder implements Callable<Integer> {
- private final PipesIterator pipesIterator;
- private final ArrayBlockingQueue<FetchEmitTuple> queue;
- private int added = 0;
-
- public FileAdder(PipesIterator pipesIterator, ArrayBlockingQueue<FetchEmitTuple> queue) {
- this.pipesIterator = pipesIterator;
- this.queue = queue;
- }
-
- @Override
- public Integer call() throws Exception {
- int added = 0;
- for (FetchEmitTuple tuple : pipesIterator) {
- //hang forever -- should offer and timeout
- queue.put(tuple);
- added++;
- }
- queue.put(PipesIterator.COMPLETED_SEMAPHORE);
- LOG.info("file adder finished " + added);
- return 1;
- }
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java
deleted file mode 100644
index f06688c..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/cli/FuzzingCLIConfig.java
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.cli;
-
-import java.nio.file.Path;
-import java.nio.file.Paths;
-
-import org.apache.commons.cli.CommandLine;
-import org.apache.commons.cli.CommandLineParser;
-import org.apache.commons.cli.DefaultParser;
-import org.apache.commons.cli.Option;
-import org.apache.commons.cli.Options;
-import org.apache.commons.cli.ParseException;
-
-public class FuzzingCLIConfig {
-
- private static final int DEFAULT_NUM_ITERATIONS = 100;
-
- //allow all transformers to operate
- private static final int DEFAULT_MAX_TRANSFORMERS = 1;
-
- private static final int DEFAULT_RETRIES = 1;
-
- static Options OPTIONS;
-
- static {
- Option problems = new Option("o", "output", true, "directory for problems files");
- problems.setRequired(true);
-
-
- OPTIONS = new Options().addOption(problems)
- .addOption(Option.builder("c").longOpt("config").hasArg(true)
- .desc("tika config " +
- "file with " +
- "specs for pipes parser, pipes iterator, fetchers and emitters")
- .required(true).get())
- .addOption(Option.builder("p").longOpt("perFile")
- .desc("number of iterations to run per seed file").hasArg(true).required(false)
- .get())
- .addOption(Option.builder("t").longOpt("maxTransformers")
- .desc("maximum number of transformers to run per iteration").hasArg(true)
- .required(false).get())
- .addOption(Option.builder("r").longOpt("retries")
- .desc("number of times to retry a seed file if there's a catastrophic failure")
- .hasArg(true).required(false).get());
-
- }
- //number of variants tried per file
- int perFileIterations = DEFAULT_NUM_ITERATIONS;
- //maxTransformers per file
- int maxTransformers = DEFAULT_MAX_TRANSFORMERS;
- //max time allowed to process each file in milliseconds
- long timeoutMS;
- //times to retry a seed file after a catastrophic failure
- int retries = DEFAULT_RETRIES;
-
- Path tikaConfig;
-
- Path problemsDir;
-
- public static FuzzingCLIConfig parse(String[] args) throws ParseException {
- CommandLineParser parser = new DefaultParser();
- CommandLine commandLine = parser.parse(OPTIONS, args);
- FuzzingCLIConfig config = new FuzzingCLIConfig();
- config.tikaConfig = Paths.get(commandLine.getOptionValue("c"));
- config.problemsDir = Paths.get(commandLine.getOptionValue("o"));
- config.retries =
- (commandLine.hasOption("r")) ? Integer.parseInt(commandLine.getOptionValue("r")) :
- DEFAULT_RETRIES;
- config.maxTransformers = (commandLine.hasOption("t")) ?
- Integer.parseInt(commandLine.getOptionValue("t")) : DEFAULT_MAX_TRANSFORMERS;
- return config;
- }
-
- public Path getProblemsDirectory() {
- return problemsDir;
- }
-
- public Path getTikaConfig() {
- return tikaConfig;
- }
-
- public int getMaxTransformers() {
- return maxTransformers;
- }
-
- public int getPerFileIterations() {
- return perFileIterations;
- }
-
- public int getRetries() {
- return retries;
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java
deleted file mode 100644
index 3540822..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/exceptions/CantFuzzException.java
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.exceptions;
-
-import org.apache.tika.exception.TikaException;
-
-public class CantFuzzException extends TikaException {
- public CantFuzzException(String msg) {
- super(msg);
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java
deleted file mode 100644
index 43ba46b..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteDeleter.java
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.general;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Collections;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.mime.MediaType;
-
-public class ByteDeleter implements Transformer {
- static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM);
- Random random = new Random();
- float percentDeleted = 0.01f;
-
- @Override
- public Set<MediaType> getSupportedTypes() {
- return SUPPORTED_TYPES;
- }
-
- @Override
- public void transform(InputStream is, OutputStream os) throws IOException {
- int c = is.read();
- while (c != -1) {
- if (random.nextFloat() >= percentDeleted) {
- os.write(c);
- } else {
- //skip
- }
- c = is.read();
- }
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java
deleted file mode 100644
index b830c7a..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteFlipper.java
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.general;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Collections;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.mime.MediaType;
-
-public class ByteFlipper implements Transformer {
-
- static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM);
- //TODO add something about protecting first x bytes?
- private final Random random = new Random();
- private float percentCorrupt = 0.01f;
-
- @Override
- public Set<MediaType> getSupportedTypes() {
- return SUPPORTED_TYPES;
- }
-
- @Override
- public void transform(InputStream is, OutputStream os) throws IOException {
- //TODO -- don't load the full thing into memory
- byte[] input = IOUtils.toByteArray(is);
- if (input.length == 0) {
- return;
- }
- byte[] singleByte = new byte[1];
- //make sure that there's at least one change, even in short files
- int atLeastOneIndex = random.nextInt(input.length);
-
- for (int i = 0; i < input.length; i++) {
- if (random.nextFloat() <= percentCorrupt || i == atLeastOneIndex) {
- random.nextBytes(singleByte);
- os.write(singleByte[0]);
- } else {
- os.write(input[i]);
- }
- }
- }
-
- public void setPercentCorrupt(float percentCorrupt) {
- this.percentCorrupt = percentCorrupt;
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java
deleted file mode 100644
index b6a5cd0..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/ByteInjector.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.general;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.mime.MediaType;
-
-public class ByteInjector implements Transformer {
- static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM);
- Random random = new Random();
- float injectionFrequency = 0.01f;
- int maxSpan = 100;
-
- @Override
- public Set<MediaType> getSupportedTypes() {
- return SUPPORTED_TYPES;
- }
-
- @Override
- public void transform(InputStream is, OutputStream os) throws IOException {
- //TODO -- don't load the full thing into memory
- byte[] input = IOUtils.toByteArray(is);
- int numInjections = (int) Math.floor((double) injectionFrequency * (double) input.length);
- //at least one injection
- numInjections = numInjections == 0 ? 1 : numInjections;
- int[] starts = new int[numInjections];
- if (numInjections > 1) {
- for (int i = 0; i < numInjections; i++) {
- starts[i] = random.nextInt(input.length - 1);
- }
- } else {
- starts[0] = 0;
- }
- Arrays.sort(starts);
- int startIndex = 0;
-
- for (int i = 0; i < input.length; i++) {
- os.write(input[i]);
- if (startIndex < starts.length && starts[startIndex] == i) {
- inject(os);
- startIndex++;
- }
- }
- }
-
- private void inject(OutputStream os) throws IOException {
- int len = random.nextInt(maxSpan);
- byte[] randBytes = new byte[len];
- random.nextBytes(randBytes);
- os.write(randBytes);
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java
deleted file mode 100644
index 20ca55f..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/GeneralTransformer.java
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.general;
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.mime.MediaType;
-
-public class GeneralTransformer implements Transformer {
-
- private static final Logger LOG = LoggerFactory.getLogger(GeneralTransformer.class);
- private final int maxTransforms;
- private final Transformer[] transformers;
- private final Set<MediaType> supportedTypes;
- Random random = new Random();
-
- public GeneralTransformer() {
- this(new ByteDeleter(), new ByteFlipper(), new ByteInjector(), new Truncator(),
- new SpanSwapper());
- }
-
- public GeneralTransformer(Transformer... transformers) {
- this(transformers.length, transformers);
- }
-
- public GeneralTransformer(int maxTransforms, Transformer... transformers) {
- this.maxTransforms = (maxTransforms < 0) ? transformers.length : maxTransforms;
- this.transformers = transformers;
- Set<MediaType> tmpTypes = new HashSet<>();
- for (Transformer transformer : transformers) {
- tmpTypes.addAll(transformer.getSupportedTypes());
- }
- supportedTypes = Collections.unmodifiableSet(tmpTypes);
- }
-
- @Override
- public Set<MediaType> getSupportedTypes() {
- return supportedTypes;
- }
-
- @Override
- public void transform(InputStream is, OutputStream os) throws IOException, TikaException {
- //used for debugging
- if (maxTransforms == 0) {
- IOUtils.copy(is, os);
- return;
- }
- int transformerCount = (maxTransforms == 1) ? 1 : 1 + random.nextInt(maxTransforms);
- int[] transformerIndices = new int[transformerCount];
- for (int i = 0; i < transformerCount; i++) {
- transformerIndices[i] = random.nextInt(transformers.length);
- }
- //TODO -- make this actually streaming
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- IOUtils.copy(is, bos);
- for (int transformerIndex : transformerIndices) {
- byte[] bytes = bos.toByteArray();
- bos = new ByteArrayOutputStream();
- transformers[transformerIndex].transform(new ByteArrayInputStream(bytes), bos);
- bos.flush();
- if (bos.toByteArray().length == 0) {
- LOG.warn("zero length: " + transformers[transformerIndex]);
- }
- }
- os.write(bos.toByteArray());
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java
deleted file mode 100644
index a15a750..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/SpanSwapper.java
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.general;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Collections;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.mime.MediaType;
-
-/**
- * randomly swaps spans from the input
- */
-public class SpanSwapper implements Transformer {
-
- static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM);
- Random random = new Random();
- int maxSpanLength = 10000;
- private final float swapProbability = 0.01f;
-
- @Override
- public Set<MediaType> getSupportedTypes() {
- return SUPPORTED_TYPES;
- }
-
- @Override
- public void transform(InputStream is, OutputStream os) throws IOException {
- byte[] input = IOUtils.toByteArray(is);
- int numSwaps = (int) Math.floor(swapProbability * input.length);
- //at least one swap
- numSwaps = numSwaps == 0 ? 1 : numSwaps;
- byte[] ret = new byte[input.length];
- System.arraycopy(input, 0, ret, 0, input.length);
- for (int i = 0; i < numSwaps; i++) {
- ret = swap(ret);
- }
- os.write(ret);
- }
-
- private byte[] swap(byte[] ret) {
- if (ret.length == 0) {
- return new byte[0];
- }
- int srcStart = random.nextInt(ret.length);
- int targStart = random.nextInt(ret.length);
- //these spans can overlap;
-
- int len = random.nextInt(maxSpanLength);
- int maxStart = Math.max(srcStart, targStart);
- len = (len + maxStart < ret.length) ? len : ret.length - maxStart;
-
- byte[] landingBytes = new byte[len];
- //copy the landing zone
- System.arraycopy(ret, targStart, landingBytes, 0, len);
- //now copy the src onto the targ
- System.arraycopy(ret, srcStart, ret, targStart, len);
- //now copy the targ over to the src
- System.arraycopy(landingBytes, 0, ret, srcStart, len);
- return ret;
- }
-
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java
deleted file mode 100644
index bf55836..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/general/Truncator.java
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.general;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Collections;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.mime.MediaType;
-
-public class Truncator implements Transformer {
-
- static Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM);
- Random random = new Random();
-
- @Override
- public Set<MediaType> getSupportedTypes() {
- return SUPPORTED_TYPES;
- }
-
- @Override
- public void transform(InputStream is, OutputStream os) throws IOException {
- //TODO -- redo streaming
- byte[] input = IOUtils.toByteArray(is);
- if (input.length == 0) {
- return;
- }
- int len = 1 + random.nextInt(input.length);
- //at least one
- if (len >= input.length) {
- len = input.length - 2;
- if (len < 0) {
- len = 0;
- }
- }
-
- byte[] ret = new byte[len];
- System.arraycopy(input, 0, ret, 0, len);
- os.write(ret);
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java
deleted file mode 100644
index 4e88f14..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/EvilCOSWriter.java
+++ /dev/null
@@ -1,1486 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.pdf;
-
-import java.io.BufferedOutputStream;
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.Closeable;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.SequenceInputStream;
-import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
-import java.text.DecimalFormat;
-import java.text.DecimalFormatSymbols;
-import java.text.NumberFormat;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Deque;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Hashtable;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
-import org.apache.pdfbox.cos.COSArray;
-import org.apache.pdfbox.cos.COSBase;
-import org.apache.pdfbox.cos.COSBoolean;
-import org.apache.pdfbox.cos.COSDictionary;
-import org.apache.pdfbox.cos.COSDocument;
-import org.apache.pdfbox.cos.COSFloat;
-import org.apache.pdfbox.cos.COSInteger;
-import org.apache.pdfbox.cos.COSName;
-import org.apache.pdfbox.cos.COSNull;
-import org.apache.pdfbox.cos.COSNumber;
-import org.apache.pdfbox.cos.COSObject;
-import org.apache.pdfbox.cos.COSObjectKey;
-import org.apache.pdfbox.cos.COSStream;
-import org.apache.pdfbox.cos.COSString;
-import org.apache.pdfbox.cos.COSUpdateInfo;
-import org.apache.pdfbox.cos.ICOSVisitor;
-import org.apache.pdfbox.filter.Filter;
-import org.apache.pdfbox.filter.FilterFactory;
-import org.apache.pdfbox.io.IOUtils;
-import org.apache.pdfbox.io.RandomAccessInputStream;
-import org.apache.pdfbox.io.RandomAccessRead;
-import org.apache.pdfbox.pdfparser.PDFXRefStream;
-import org.apache.pdfbox.pdfparser.xref.FreeXReference;
-import org.apache.pdfbox.pdfparser.xref.NormalXReference;
-import org.apache.pdfbox.pdfparser.xref.XReferenceEntry;
-import org.apache.pdfbox.pdfwriter.COSStandardOutputStream;
-import org.apache.pdfbox.pdfwriter.COSWriter;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.common.PDStream;
-import org.apache.pdfbox.pdmodel.encryption.SecurityHandler;
-import org.apache.pdfbox.pdmodel.fdf.FDFDocument;
-import org.apache.pdfbox.pdmodel.interactive.digitalsignature.COSFilterInputStream;
-import org.apache.pdfbox.pdmodel.interactive.digitalsignature.SignatureInterface;
-import org.apache.pdfbox.util.Hex;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-
-//TODO PDFBOX30 replace COSWriterXRefEntry with XReferenceEntry (and much more)
-
-public class EvilCOSWriter implements ICOSVisitor, Closeable {
-
- /**
- * The dictionary open token.
- */
- public static final byte[] DICT_OPEN = "<<".getBytes(StandardCharsets.US_ASCII);
- /**
- * The dictionary close token.
- */
- public static final byte[] DICT_CLOSE = ">>".getBytes(StandardCharsets.US_ASCII);
- /**
- * space character.
- */
- public static final byte[] SPACE = {' '};
- /**
- * The start to a PDF comment.
- */
- public static final byte[] COMMENT = {'%'};
- /**
- * The output version of the PDF.
- */
- public static final byte[] VERSION = "PDF-1.4".getBytes(StandardCharsets.US_ASCII);
- /**
- * Garbage bytes used to create the PDF header.
- */
- public static final byte[] GARBAGE =
- new byte[]{(byte) 0xf6, (byte) 0xe4, (byte) 0xfc, (byte) 0xdf};
- /**
- * The EOF constant.
- */
- public static final byte[] EOF = "%%EOF".getBytes(StandardCharsets.US_ASCII);
- /**
- * The reference token.
- */
- public static final byte[] REFERENCE = "R".getBytes(StandardCharsets.US_ASCII);
- // pdf tokens
- /**
- * The XREF token.
- */
- public static final byte[] XREF = "xref".getBytes(StandardCharsets.US_ASCII);
- /**
- * The xref free token.
- */
- public static final byte[] XREF_FREE = "f".getBytes(StandardCharsets.US_ASCII);
- /**
- * The xref used token.
- */
- public static final byte[] XREF_USED = "n".getBytes(StandardCharsets.US_ASCII);
- /**
- * The trailer token.
- */
- public static final byte[] TRAILER = "trailer".getBytes(StandardCharsets.US_ASCII);
- /**
- * The start xref token.
- */
- public static final byte[] STARTXREF = "startxref".getBytes(StandardCharsets.US_ASCII);
- /**
- * The starting object token.
- */
- public static final byte[] OBJ = "obj".getBytes(StandardCharsets.US_ASCII);
- /**
- * The end object token.
- */
- public static final byte[] ENDOBJ = "endobj".getBytes(StandardCharsets.US_ASCII);
- /**
- * The array open token.
- */
- public static final byte[] ARRAY_OPEN = "[".getBytes(StandardCharsets.US_ASCII);
- /**
- * The array close token.
- */
- public static final byte[] ARRAY_CLOSE = "]".getBytes(StandardCharsets.US_ASCII);
- /**
- * The open stream token.
- */
- public static final byte[] STREAM = "stream".getBytes(StandardCharsets.US_ASCII);
- /**
- * The close stream token.
- */
- public static final byte[] ENDSTREAM = "endstream".getBytes(StandardCharsets.US_ASCII);
- private static final Logger LOG = LoggerFactory.getLogger(EvilCOSWriter.class);
- private final NumberFormat formatXrefOffset =
- new DecimalFormat("0000000000", DecimalFormatSymbols.getInstance(Locale.US));
-
- // the decimal format for the xref object generation number data
- private final NumberFormat formatXrefGeneration =
- new DecimalFormat("00000", DecimalFormatSymbols.getInstance(Locale.US));
- // maps the object to the keys generated in the writer
- // these are used for indirect references in other objects
- //A hashtable is used on purpose over a hashmap
- //so that null entries will not get added.
- @SuppressWarnings({"squid:S1149"})
- private final Map<COSBase, COSObjectKey> objectKeys = new Hashtable<>();
- private final Map<COSObjectKey, COSBase> keyObject = new HashMap<>();
- // the list of x ref entries to be made so far
- private final List<XReferenceEntry> xRefEntries = new ArrayList<>();
- private final Set<COSBase> objectsToWriteSet = new HashSet<>();
- //A list of objects to write.
- private final Deque<COSBase> objectsToWrite = new LinkedList<>();
- //a list of objects already written
- private final Set<COSBase> writtenObjects = new HashSet<>();
- //An 'actual' is any COSBase that is not a COSObject.
- //need to keep a list of the actuals that are added
- //as well as the objects because there is a problem
- //when adding a COSObject and then later adding
- //the actual for that object, so we will track
- //actuals separately.
- private final Set<COSBase> actualsAdded = new HashSet<>();
- private final PDFTransformerConfig config;
- private final Random random = new Random();
- // the stream where we create the pdf output
- private OutputStream output;
- // the stream used to write standard cos data
- private COSStandardOutputStream standardOutput;
- // the start position of the x ref section
- private long startxref = 0;
- // the current object number
- private long number = 0;
- private int roughNumberOfObjects = 0;
- private COSObjectKey currentObjectKey = null;
- private PDDocument pdDocument = null;
- private FDFDocument fdfDocument = null;
- private boolean willEncrypt = false;
- // signing
- private final boolean incrementalUpdate = false;
- private boolean reachedSignature = false;
- private long signatureOffset;
- private long signatureLength;
- private long byteRangeOffset;
- private long byteRangeLength;
- private RandomAccessRead incrementalInput;
- private OutputStream incrementalOutput;
- private SignatureInterface signatureInterface;
- private byte[] incrementPart;
- private COSArray byteRangeArray;
- private final FilterFactory filterFactory = FilterFactory.INSTANCE;
-
- /**
- * COSWriter constructor.
- *
- * @param outputStream The output stream to write the PDF. It will be closed when this object is
- * closed.
- */
- public EvilCOSWriter(OutputStream outputStream, PDFTransformerConfig config) {
- setOutput(outputStream);
- setStandardOutput(new COSStandardOutputStream(output));
- this.config = config;
- }
-
- /**
- * This will output the given byte getString as a PDF object.
- *
- * @param string COSString to be written
- * @param output The stream to write to.
- * @throws IOException If there is an error writing to the stream.
- */
- public static void writeString(COSString string, OutputStream output) throws IOException {
- writeString(string.getBytes(), string.getForceHexForm(), output);
- }
-
- /**
- * This will output the given text/byte getString as a PDF object.
- *
- * @param bytes byte array representation of a string to be written
- * @param output The stream to write to.
- * @throws IOException If there is an error writing to the stream.
- */
- public static void writeString(byte[] bytes, OutputStream output) throws IOException {
- writeString(bytes, false, output);
- }
-
- /**
- * This will output the given text/byte string as a PDF object.
- *
- * @param output The stream to write to.
- * @throws IOException If there is an error writing to the stream.
- */
- private static void writeString(byte[] bytes, boolean forceHex, OutputStream output)
- throws IOException {
- // check for non-ASCII characters
- boolean isASCII = true;
- if (!forceHex) {
- for (byte b : bytes) {
- // if the byte is negative then it is an eight bit byte and is outside the ASCII range
- if (b < 0) {
- isASCII = false;
- break;
- }
- // PDFBOX-3107 EOL markers within a string are troublesome
- if (b == 0x0d || b == 0x0a) {
- isASCII = false;
- break;
- }
- }
- }
-
- if (isASCII && !forceHex) {
- // write ASCII string
- output.write('(');
- for (byte b : bytes) {
- switch (b) {
- case '(':
- case ')':
- case '\\':
- output.write('\\');
- output.write(b);
- break;
- default:
- output.write(b);
- break;
- }
- }
- output.write(')');
- } else {
- // write hex string
- output.write('<');
- Hex.writeHexBytes(bytes, output);
- output.write('>');
- }
- }
-
- private void prepareIncrement(PDDocument doc) throws IOException {
- if (doc != null) {
- COSDocument cosDoc = doc.getDocument();
-
- Map<COSObjectKey, Long> xrefTable = cosDoc.getXrefTable();
- Set<COSObjectKey> keySet = xrefTable.keySet();
- long highestNumber = doc.getDocument().getHighestXRefObjectNumber();
- for (COSObjectKey cosObjectKey : keySet) {
- COSBase object = cosDoc.getObjectFromPool(cosObjectKey).getObject();
- if (object != null && cosObjectKey != null && !(object instanceof COSNumber)) {
- objectKeys.put(object, cosObjectKey);
- keyObject.put(cosObjectKey, object);
- }
-
- if (cosObjectKey != null) {
- long num = cosObjectKey.getNumber();
- if (num > highestNumber) {
- highestNumber = num;
- }
- }
- }
- setNumber(highestNumber);
- }
- }
-
- /**
- * add an entry in the x ref table for later dump.
- *
- * @param entry The new entry to add.
- */
- protected void addXRefEntry(XReferenceEntry entry) {
- getXRefEntries().add(entry);
- }
-
- /**
- * This will close the stream.
- *
- * @throws IOException If the underlying stream throws an exception.
- */
- @Override
- public void close() throws IOException {
- if (getStandardOutput() != null) {
- getStandardOutput().close();
- }
- if (incrementalOutput != null) {
- incrementalOutput.close();
- }
- }
-
- /**
- * This will get the current object number.
- *
- * @return The current object number.
- */
- protected long getNumber() {
- return number;
- }
-
- /**
- * This will set the current object number.
- *
- * @param newNumber The new object number.
- */
- protected void setNumber(long newNumber) {
- number = newNumber;
-
- }
-
- /**
- * This will get all available object keys.
- *
- * @return A map of all object keys.
- */
- public Map<COSBase, COSObjectKey> getObjectKeys() {
- return objectKeys;
- }
-
- /**
- * This will get the output stream.
- *
- * @return The output stream.
- */
- protected java.io.OutputStream getOutput() {
- return output;
- }
-
- /**
- * This will set the output stream.
- *
- * @param newOutput The new output stream.
- */
- private void setOutput(OutputStream newOutput) {
- output = newOutput;
- }
-
- /**
- * This will get the standard output stream.
- *
- * @return The standard output stream.
- */
- protected COSStandardOutputStream getStandardOutput() {
- return standardOutput;
- }
-
- /**
- * This will set the standard output stream.
- *
- * @param newStandardOutput The new standard output stream.
- */
- private void setStandardOutput(COSStandardOutputStream newStandardOutput) {
- standardOutput = newStandardOutput;
- }
-
- /**
- * This will get the current start xref.
- *
- * @return The current start xref.
- */
- protected long getStartxref() {
- return startxref;
- }
-
- /**
- * This will set the start xref.
- *
- * @param newStartxref The new start xref attribute.
- */
- protected void setStartxref(long newStartxref) {
- startxref = newStartxref;
- }
-
- /**
- * This will get the xref entries.
- *
- * @return All available xref entries.
- */
- protected List<XReferenceEntry> getXRefEntries() {
- return xRefEntries;
- }
-
- /**
- * This will write the body of the document.
- *
- * @param doc The document to write the body for.
- * @throws IOException If there is an error writing the data.
- */
- protected void doWriteBody(COSDocument doc) throws IOException {
- COSDictionary trailer = doc.getTrailer();
- COSDictionary root = trailer.getCOSDictionary(COSName.ROOT);
- COSDictionary info = trailer.getCOSDictionary(COSName.INFO);
- COSDictionary encrypt = trailer.getCOSDictionary(COSName.ENCRYPT);
- roughNumberOfObjects = doc.getXrefTable().size();
- if (root != null) {
- addObjectToWrite(root);
- }
- if (info != null) {
- addObjectToWrite(info);
- }
-
- doWriteObjects();
- willEncrypt = false;
- if (encrypt != null) {
- addObjectToWrite(encrypt);
- }
-
- doWriteObjects();
- }
-
- private void doWriteObjects() throws IOException {
- while (objectsToWrite.size() > 0) {
- COSBase nextObject = objectsToWrite.removeFirst();
- objectsToWriteSet.remove(nextObject);
- doWriteObject(nextObject);
- }
- }
-
- private void addObjectToWrite(COSBase object) {
- COSBase actual = object;
- if (actual instanceof COSObject) {
- actual = ((COSObject) actual).getObject();
- }
-
- if (!writtenObjects.contains(object) && !objectsToWriteSet.contains(object) &&
- !actualsAdded.contains(actual)) {
- COSBase cosBase = null;
- COSObjectKey cosObjectKey = null;
- if (actual != null) {
- cosObjectKey = objectKeys.get(actual);
- }
- if (cosObjectKey != null) {
- cosBase = keyObject.get(cosObjectKey);
- }
- if (actual != null && objectKeys.containsKey(actual) &&
- object instanceof COSUpdateInfo &&
- !((COSUpdateInfo) object).isNeedToBeUpdated() &&
- cosBase instanceof COSUpdateInfo &&
- !((COSUpdateInfo) cosBase).isNeedToBeUpdated()) {
- return;
- }
- objectsToWrite.add(object);
- objectsToWriteSet.add(object);
- if (actual != null) {
- actualsAdded.add(actual);
- }
- }
- }
-
- public void doWriteObject( COSBase obj ) throws IOException {
- writtenObjects.add( obj );
- // find the physical reference
- currentObjectKey = getObjectKey( obj );
- doWriteObject(currentObjectKey, obj);
- }
-
- public void doWriteObject(COSObjectKey key, COSBase obj) throws IOException
- {
- // don't write missing objects to avoid broken xref tables
- if (obj == null || (obj instanceof COSObject && ((COSObject) obj).getObject() == null))
- {
- return;
- }
- writtenObjects.add(obj);
- // find the physical reference
- currentObjectKey = getObjectKey(obj);
-
- // add a x ref entry
- addXRefEntry(new NormalXReference(getStandardOutput().getPos(), key, obj));
- long objectNumber = currentObjectKey.getNumber();
- if (config.getRandomizeObjectNumbers() > 0.0f &&
- random.nextFloat() < config.getRandomizeObjectNumbers()) {
- objectNumber = random.nextInt(((int) objectNumber) * 2);
- }
- // write the object
- getStandardOutput()
- .write(Long.toString(objectNumber).getBytes(StandardCharsets.ISO_8859_1));
- getStandardOutput().write(SPACE);
- getStandardOutput()
- .write(String.valueOf(key.getGeneration()).getBytes(StandardCharsets.ISO_8859_1));
- getStandardOutput().write(SPACE);
- getStandardOutput().write(OBJ);
- getStandardOutput().writeEOL();
- mutate(obj);
- if (obj != null) {
- writeObjContents(obj);
- }
- getStandardOutput().writeEOL();
- getStandardOutput().write(ENDOBJ);
- getStandardOutput().writeEOL();
- }
-
- private void writeObjContents(COSBase obj) throws IOException {
- if (!(obj instanceof COSObject)) {
- obj.accept(this);
- return;
- }
-
- COSObject cosObject = (COSObject) obj;
- COSBase underlyingObject = cosObject.getObject();
- if (underlyingObject instanceof COSStream &&
- config.getUnfilteredStreamTransformer() != null) {
- COSStream cosStream = (COSStream) underlyingObject;
- Transformer unfilteredStreamTransformer = config.getUnfilteredStreamTransformer();
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- try (InputStream is = cosStream.createRawInputStream()) {
- IOUtils.copy(is, bos);
- }
- ByteArrayOutputStream transformed = new ByteArrayOutputStream();
- try {
- unfilteredStreamTransformer.transform(new ByteArrayInputStream(bos.toByteArray()),
- transformed);
- } catch (TikaException e) {
- throw new IOException(e);
- }
- try (OutputStream os = cosStream.createRawOutputStream()) {
- IOUtils.copy(new ByteArrayInputStream(transformed.toByteArray()), os);
- }
- //stream automatically sets the length correctly
- obj.accept(this);
- } else {
- obj.accept(this);
- }
- }
-
- private void mutate(COSBase obj) throws IOException {
-
- //stub
- if (obj instanceof COSStream) {
- COSStream stream = (COSStream) obj;
- //get the raw unfiltered bytes
- byte[] bytes = new PDStream(stream).toByteArray();
- //transform the underlying stream _before_ filters are applied
- if (config.getStreamTransformer() != null) {
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- try {
- config.getStreamTransformer().transform(new ByteArrayInputStream(bytes), bos);
- } catch (TikaException e) {
- throw new IOException(e);
- }
- bytes = bos.toByteArray();
- }
- COSBase filters = getFilters(stream.getFilters());
- if (filters instanceof COSNull) {
- stream.removeItem(COSName.FILTER);
- } else {
- List<COSName> usedFilters = new ArrayList<>();
- long length = -1;
- try (TikaInputStream rawBytes = TikaInputStream.get(bytes)) {
- try (TikaInputStream filtered = runFilters(filters, rawBytes, usedFilters)) {
- //rewrite raw bytes after running own filters
- try (OutputStream streamOut = stream.createRawOutputStream()) {
- IOUtils.copy(filtered, streamOut);
- }
- length = filtered.getLength();
- }
- }
- Collections.reverse(usedFilters);
- COSArray actualFilters = new COSArray();
- for (COSName f : usedFilters) {
- actualFilters.add(f);
- }
- //TODO: parameterize wonkifying length and filters
- stream.setLong(COSName.LENGTH, length);
- stream.setItem(COSName.FILTER, actualFilters);
- }
- } else if (obj instanceof COSObject) {
- COSBase underlyingObject = ((COSObject) obj).getObject();
- mutate(underlyingObject);
-
- }
- }
-
- private TikaInputStream runFilters(COSBase filters, TikaInputStream is,
- List<COSName> usedFilters) throws IOException {
- if (filters instanceof COSNull) {
- } else if (filters instanceof COSName) {
- is = runFilter((COSName) filters, is, new COSDictionary(), 0);
- usedFilters.add((COSName) filters);
- LOG.debug("filter:" + filters + " " + 0 + " : " + is.getLength());
- } else if (filters instanceof COSArray) {
- COSArray filterArray = (COSArray) filters;
- //need to apply them in reverse order!
- boolean transformed = false;
- for (int i = filterArray.size() - 1; i >= 0; i--) {
- COSName filter = (COSName) filterArray.get(i);
- is = runFilter(filter, is, new COSDictionary(), 0);
- if (random.nextFloat() > 0.1 && transformed == false) {
- is = transformRawStream(is);
- transformed = true;
- }
- usedFilters.add(filter);
- LOG.debug("filter:" + filter.toString() + " " + i + " : " + is.getLength());
- if (is.getLength() > config.getMaxFilteredStreamLength()) {
- LOG.debug("stopping early");
- return is;
- }
- }
- return is;
- } else {
- throw new IllegalArgumentException(
- "Can't handle this class here: " + filters.getClass());
- }
- return transformRawStream(is);
- }
-
- private TikaInputStream transformRawStream(TikaInputStream is) throws IOException {
- if (config.getUnfilteredStreamTransformer() != null) {
- if (is.getLength() < 10000000) {
- try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
- config.getUnfilteredStreamTransformer().transform(is, bos);
- bos.flush();
- bos.close();
- return TikaInputStream.get(bos.toByteArray());
- } catch (TikaException e) {
- throw new IOException(e);
- }
- } else {
- TemporaryResources tmp = new TemporaryResources();
- Path p = tmp.createTempFile();
- try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(p))) {
- config.getUnfilteredStreamTransformer().transform(is, os);
- os.flush();
- } catch (TikaException e) {
- throw new IOException(e);
- }
- return TikaInputStream.get(p, new Metadata(), tmp);
- }
- }
- return is;
- }
-
- private TikaInputStream runFilter(COSName filterCOSName, TikaInputStream tis,
- COSDictionary filterParameters, int filterIndex)
- throws IOException {
-
- Filter filter = filterFactory.getFilter(filterCOSName);
- if (tis.getLength() < 100000000) {
- try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
- filter.encode(tis, bos, filterParameters, filterIndex);
- bos.flush();
- bos.close();
- return TikaInputStream.get(bos.toByteArray());
- } finally {
- tis.close();
- }
- } else {
- TemporaryResources tmp = new TemporaryResources();
- Path p = tmp.createTempFile();
- try (OutputStream os = new BufferedOutputStream(Files.newOutputStream(p))) {
- filter.encode(tis, os, filterParameters, filterIndex);
- } finally {
- tis.close();
- }
- return TikaInputStream.get(p, new Metadata(), tmp);
- }
- }
-
- private COSBase getFilters(COSBase existingFilters) {
- List<COSName> filters = config.getFilters(existingFilters);
- if (filters.size() == 0) {
- return COSNull.NULL;
- } else if (filters.size() == 1) {
- return filters.get(0);
- } else {
- COSArray arr = new COSArray();
- for (COSName n : filters) {
- arr.add(n);
- }
- return arr;
- }
- }
-
- /**
- * This will write the header to the PDF document.
- *
- * @param doc The document to get the data from.
- * @throws IOException If there is an error writing to the stream.
- */
- protected void doWriteHeader(COSDocument doc) throws IOException {
- String headerString;
- if (fdfDocument != null) {
- headerString = "%FDF-" + doc.getVersion();
- } else {
- headerString = "%PDF-" + doc.getVersion();
- }
- getStandardOutput().write(headerString.getBytes(StandardCharsets.ISO_8859_1));
-
- getStandardOutput().writeEOL();
- getStandardOutput().write(COMMENT);
- getStandardOutput().write(GARBAGE);
- getStandardOutput().writeEOL();
- }
-
- /**
- * This will write the trailer to the PDF document.
- *
- * @param doc The document to create the trailer for.
- * @throws IOException If there is an IOError while writing the document.
- */
- protected void doWriteTrailer(COSDocument doc) throws IOException {
- getStandardOutput().write(TRAILER);
- getStandardOutput().writeEOL();
-
- COSDictionary trailer = doc.getTrailer();
- //sort xref, needed only if object keys not regenerated
- Collections.sort(getXRefEntries());
- XReferenceEntry lastEntry = getXRefEntries().get(getXRefEntries().size() - 1);
-
- trailer.setLong(COSName.SIZE, lastEntry.getReferencedKey().getNumber() + 1);
- // Only need to stay, if an incremental update will be performed
- if (!incrementalUpdate) {
- trailer.removeItem(COSName.PREV);
- }
- if (!doc.isXRefStream()) {
- trailer.removeItem(COSName.XREF_STM);
- }
- // Remove a checksum if present
- trailer.removeItem(COSName.DOC_CHECKSUM);
-
- COSArray idArray = trailer.getCOSArray(COSName.ID);
- if (idArray != null) {
- idArray.setDirect(true);
- }
-
- trailer.accept(this);
- }
-
- private void doWriteXRefInc(COSDocument doc, long hybridPrev) throws IOException {
- if (doc.isXRefStream() || hybridPrev != -1) {
- // the file uses XrefStreams, so we need to update
- // it with an xref stream. We create a new one and fill it
- // with data available here
-
- // create a new XRefStrema object
- PDFXRefStream pdfxRefStream = new PDFXRefStream(doc);
-
- // add all entries from the incremental update.
- List<XReferenceEntry> xRefEntries2 = getXRefEntries();
- for (XReferenceEntry cosWriterXRefEntry : xRefEntries2) {
- pdfxRefStream.addEntry(cosWriterXRefEntry);
- }
-
- COSDictionary trailer = doc.getTrailer();
- if (incrementalUpdate) {
- // use previous startXref value as new PREV value
- trailer.setLong(COSName.PREV, doc.getStartXref());
- } else {
- trailer.removeItem(COSName.PREV);
- }
- pdfxRefStream.addTrailerInfo(trailer);
- // the size is the highest object number+1. we add one more
- // for the xref stream object we are going to write
- pdfxRefStream.setSize(getNumber() + 2);
-
- setStartxref(getStandardOutput().getPos());
- COSStream stream2 = pdfxRefStream.getStream();
- doWriteObject(stream2);
- }
-
- if (!doc.isXRefStream() || hybridPrev != -1) {
- COSDictionary trailer = doc.getTrailer();
- trailer.setLong(COSName.PREV, doc.getStartXref());
- if (hybridPrev != -1) {
- COSName xrefStm = COSName.XREF_STM;
- trailer.removeItem(xrefStm);
- trailer.setLong(xrefStm, getStartxref());
- }
- doWriteXRefTable();
- doWriteTrailer(doc);
- }
- }
-
- // writes the "xref" table
- private void doWriteXRefTable() throws IOException {
- addXRefEntry(FreeXReference.NULL_ENTRY);
-
- // sort xref, needed only if object keys not regenerated
- Collections.sort(getXRefEntries());
-
- // remember the position where x ref was written
- setStartxref(getStandardOutput().getPos());
-
- getStandardOutput().write(XREF);
- getStandardOutput().writeEOL();
- // write start object number and object count for this x ref section
- // we assume starting from scratch
-
- Long[] xRefRanges = getXRefRanges(getXRefEntries());
- int xRefLength = xRefRanges.length;
- int x = 0;
- int j = 0;
- while (x < xRefLength && (xRefLength % 2) == 0) {
- writeXrefRange(xRefRanges[x], xRefRanges[x + 1]);
-
- for (int i = 0; i < xRefRanges[x + 1]; ++i) {
- writeXrefEntry(xRefEntries.get(j++));
- }
- x += 2;
- }
- }
-
- /**
- * Write an incremental update for a non signature case. This can be used for e.g. augmenting
- * signatures.
- *
- * @throws IOException
- */
- private void doWriteIncrement() throws IOException {
- // write existing PDF
- IOUtils.copy(new RandomAccessInputStream(incrementalInput), incrementalOutput);
- // write the actual incremental update
- incrementalOutput.write(getBytes(output));
- }
-
- private void doWriteSignature() throws IOException {
- // calculate the ByteRange values
- long inLength = incrementalInput.length();
- long beforeLength = signatureOffset;
- long afterOffset = signatureOffset + signatureLength;
- long afterLength = getStandardOutput().getPos() - (inLength + signatureLength) -
- (signatureOffset - inLength);
-
- String byteRange = "0 " + beforeLength + " " + afterOffset + " " + afterLength + "]";
-
- // Assign the values to the actual COSArray, so that the user can access it before closing
- byteRangeArray.set(0, COSInteger.ZERO);
- byteRangeArray.set(1, COSInteger.get(beforeLength));
- byteRangeArray.set(2, COSInteger.get(afterOffset));
- byteRangeArray.set(3, COSInteger.get(afterLength));
-
- if (byteRange.length() > byteRangeLength) {
- throw new IOException("Can't write new byteRange '" + byteRange +
- "' not enough space: byteRange.length(): " + byteRange.length() +
- ", byteRangeLength: " + byteRangeLength);
- }
-
- // copy the new incremental data into a buffer (e.g. signature dict, trailer)
- output.flush();
- incrementPart = getBytes(output);
-
- // overwrite the ByteRange in the buffer
- byte[] byteRangeBytes = byteRange.getBytes(StandardCharsets.ISO_8859_1);
- for (int i = 0; i < byteRangeLength; i++) {
- if (i >= byteRangeBytes.length) {
- incrementPart[(int) (byteRangeOffset + i - inLength)] = 0x20; // SPACE
- } else {
- incrementPart[(int) (byteRangeOffset + i - inLength)] = byteRangeBytes[i];
- }
- }
-
- if (signatureInterface != null) {
- // data to be signed
- try (InputStream dataToSign = getDataToSign()) {
- // sign the bytes
- byte[] signatureBytes = signatureInterface.sign(dataToSign);
- writeExternalSignature(signatureBytes);
- }
- }
- // else signature should created externally and set via writeSignature()
- }
-
- /**
- * Return the stream of PDF data to be signed. Clients should use this method only to create
- * signatures externally. {@link #write(PDDocument)} method should have been called prior. The
- * created signature should be set using {@link #writeExternalSignature(byte[])}.
- * <p>
- * When {@link SignatureInterface} instance is used, COSWriter obtains and writes the signature
- * itself.
- * </p>
- *
- * @return data stream to be signed
- * @throws IllegalStateException if PDF is not prepared for external signing
- * @throws IOException if input data is closed
- */
- public InputStream getDataToSign() throws IOException {
- if (incrementPart == null || incrementalInput == null) {
- throw new IllegalStateException("PDF not prepared for signing");
- }
- // range of incremental bytes to be signed (includes /ByteRange but not /Contents)
- int incPartSigOffset = (int) (signatureOffset - incrementalInput.length());
- int afterSigOffset = incPartSigOffset + (int) signatureLength;
- int[] range = {0, incPartSigOffset, afterSigOffset, incrementPart.length - afterSigOffset};
-
- return new SequenceInputStream(new RandomAccessInputStream(incrementalInput),
- new COSFilterInputStream(incrementPart, range));
- }
-
- /**
- * Write externally created signature of PDF data obtained via {@link #getDataToSign()} method.
- *
- * @param cmsSignature CMS signature byte array
- * @throws IllegalStateException if PDF is not prepared for external signing
- * @throws IOException if source data stream is closed
- */
- public void writeExternalSignature(byte[] cmsSignature) throws IOException {
-
- if (incrementPart == null || incrementalInput == null) {
- throw new IllegalStateException("PDF not prepared for setting signature");
- }
- byte[] signatureBytes = Hex.getBytes(cmsSignature);
-
- // subtract 2 bytes because of the enclosing "<>"
- if (signatureBytes.length > signatureLength - 2) {
- throw new IOException("Can't write signature, not enough space");
- }
-
- // overwrite the signature Contents in the buffer
- int incPartSigOffset = (int) (signatureOffset - incrementalInput.length());
- System.arraycopy(signatureBytes, 0, incrementPart, incPartSigOffset + 1,
- signatureBytes.length);
-
- // write the data to the incremental output stream
- IOUtils.copy(new RandomAccessInputStream(incrementalInput), incrementalOutput);
- incrementalOutput.write(incrementPart);
-
- // prevent further use
- incrementPart = null;
- }
-
- private void writeXrefRange(long x, long y) throws IOException {
- getStandardOutput().write(String.valueOf(x).getBytes(StandardCharsets.ISO_8859_1));
- getStandardOutput().write(SPACE);
- getStandardOutput().write(String.valueOf(y).getBytes(StandardCharsets.ISO_8859_1));
- getStandardOutput().writeEOL();
- }
-
- private void writeXrefEntry(XReferenceEntry entry) throws IOException
- {
- String offset = formatXrefOffset.format(entry.getSecondColumnValue());
- String generation = formatXrefGeneration.format(entry.getThirdColumnValue());
- getStandardOutput().write(offset.getBytes(StandardCharsets.ISO_8859_1));
- getStandardOutput().write(SPACE);
- getStandardOutput().write(generation.getBytes(StandardCharsets.ISO_8859_1));
- getStandardOutput().write(SPACE);
- getStandardOutput().write(entry instanceof FreeXReference ? XREF_FREE : XREF_USED);
- getStandardOutput().writeCRLF();
- }
-
- /**
- * check the xref entries and write out the ranges. The format of the
- * returned array is exactly the same as the pdf specification. See section
- * 7.5.4 of ISO32000-1:2008, example 1 (page 40) for reference.
- * <p>
- * example: 0 1 2 5 6 7 8 10
- * <p>
- * will create a array with follow ranges
- * <p>
- * 0 3 5 4 10 1
- * <p>
- * this mean that the element 0 is followed by two other related numbers
- * that represent a cluster of the size 3. 5 is follow by three other
- * related numbers and create a cluster of size 4. etc.
- *
- * @param xRefEntriesList list with the xRef entries that was written
- * @return a integer array with the ranges
- */
- protected Long[] getXRefRanges(List<XReferenceEntry> xRefEntriesList) {
- long last = -2;
- long count = 1;
-
- List<Long> list = new ArrayList<>();
- for (XReferenceEntry object : xRefEntriesList) {
- long nr = (int) object.getReferencedKey().getNumber();
- if (nr == last + 1) {
- ++count;
- last = nr;
- } else if (last == -2) {
- last = nr;
- } else {
- list.add(last - count + 1);
- list.add(count);
- last = nr;
- count = 1;
- }
- }
- // If no new entry is found, we need to write out the last result
- if (xRefEntriesList.size() > 0) {
- list.add(last - count + 1);
- list.add(count);
- }
- return list.toArray(new Long[0]);
- }
-
- /**
- * This will get the object key for the object.
- *
- * @param obj The object to get the key for.
- * @return The object key for the object.
- */
- private COSObjectKey getObjectKey(COSBase obj) {
- COSBase actual = obj;
- if (actual instanceof COSObject) {
- actual = ((COSObject) obj).getObject();
- }
- // PDFBOX-4540: because objectKeys is accessible from outside, it is possible
- // that a COSObject obj is already in the objectKeys map.
- COSObjectKey key = objectKeys.get(obj);
- if (key == null && actual != null) {
- key = objectKeys.get(actual);
- }
- if (key == null) {
- setNumber(getNumber() + 1);
- key = new COSObjectKey(getNumber(), 0);
- objectKeys.put(obj, key);
- if (actual != null) {
- objectKeys.put(actual, key);
- }
- }
- return key;
- }
-
- @Override
- public void visitFromArray(COSArray obj) throws IOException {
- int count = 0;
- getStandardOutput().write(ARRAY_OPEN);
- for (Iterator<COSBase> i = obj.iterator(); i.hasNext(); ) {
- COSBase current = i.next();
- if (current instanceof COSDictionary) {
- if (current.isDirect()) {
- visitFromDictionary((COSDictionary) current);
- } else {
- addObjectToWrite(current);
- writeReference(current);
- }
- } else if (current instanceof COSObject) {
- COSBase subValue = ((COSObject) current).getObject();
- if (willEncrypt || incrementalUpdate || subValue instanceof COSDictionary ||
- subValue == null) {
- // PDFBOX-4308: added willEncrypt to prevent an object
- // that is referenced several times from being written
- // direct and indirect, thus getting encrypted
- // with wrong object number or getting encrypted twice
- addObjectToWrite(current);
- writeReference(current);
- } else {
- subValue.accept(this);
- }
- } else if (current == null) {
- COSNull.NULL.accept(this);
- } else {
- current.accept(this);
- }
- count++;
- if (i.hasNext()) {
- if (count % 10 == 0) {
- getStandardOutput().writeEOL();
- } else {
- getStandardOutput().write(SPACE);
- }
- }
- }
- getStandardOutput().write(ARRAY_CLOSE);
- getStandardOutput().writeEOL();
- }
-
- @Override
- public void visitFromBoolean(COSBoolean obj) throws IOException {
- obj.writePDF(getStandardOutput());
- }
-
- @Override
- public void visitFromDictionary(COSDictionary obj) throws IOException {
- if (!reachedSignature) {
- COSBase itemType = obj.getItem(COSName.TYPE);
- if (COSName.SIG.equals(itemType) || COSName.DOC_TIME_STAMP.equals(itemType)) {
- reachedSignature = true;
- }
- }
- getStandardOutput().write(DICT_OPEN);
- getStandardOutput().writeEOL();
- for (Map.Entry<COSName, COSBase> entry : obj.entrySet()) {
- COSBase value = entry.getValue();
- if (value != null) {
- entry.getKey().accept(this);
- getStandardOutput().write(SPACE);
- if (value instanceof COSDictionary) {
- COSDictionary dict = (COSDictionary) value;
-
- if (!incrementalUpdate) {
- // write all XObjects as direct objects, this will save some size
- // PDFBOX-3684: but avoid dictionary that references itself
- COSBase item = dict.getItem(COSName.XOBJECT);
- if (item != null && !COSName.XOBJECT.equals(entry.getKey())) {
- item.setDirect(true);
- }
- item = dict.getItem(COSName.RESOURCES);
- if (item != null && !COSName.RESOURCES.equals(entry.getKey())) {
- item.setDirect(true);
- }
- }
-
- if (dict.isDirect()) {
- // If the object should be written direct, we need
- // to pass the dictionary to the visitor again.
- visitFromDictionary(dict);
- } else {
- addObjectToWrite(dict);
- writeReference(dict);
- }
- } else if (value instanceof COSObject) {
- COSBase subValue = ((COSObject) value).getObject();
- if (willEncrypt || incrementalUpdate || subValue instanceof COSDictionary ||
- subValue == null) {
- // PDFBOX-4308: added willEncrypt to prevent an object
- // that is referenced several times from being written
- // direct and indirect, thus getting encrypted
- // with wrong object number or getting encrypted twice
- addObjectToWrite(value);
- writeReference(value);
- } else {
- subValue.accept(this);
- }
- } else {
- // If we reach the pdf signature, we need to determinate the position of the
- // content and byterange
- if (reachedSignature && COSName.CONTENTS.equals(entry.getKey())) {
- signatureOffset = getStandardOutput().getPos();
- value.accept(this);
- signatureLength = getStandardOutput().getPos() - signatureOffset;
- } else if (reachedSignature && COSName.BYTERANGE.equals(entry.getKey())) {
- byteRangeArray = (COSArray) entry.getValue();
- byteRangeOffset = getStandardOutput().getPos() + 1;
- value.accept(this);
- byteRangeLength = getStandardOutput().getPos() - 1 - byteRangeOffset;
- reachedSignature = false;
- } else {
- value.accept(this);
- }
- }
- getStandardOutput().writeEOL();
-
- } else {
- //then we won't write anything, there are a couple cases
- //were the value of an entry in the COSDictionary will
- //be a dangling reference that points to nothing
- //so we will just not write out the entry if that is the case
- }
- }
- getStandardOutput().write(DICT_CLOSE);
- getStandardOutput().writeEOL();
- }
-
- @Override
- public void visitFromDocument(COSDocument doc) throws IOException {
- if (!incrementalUpdate) {
- doWriteHeader(doc);
- } else {
- // Sometimes the original file will be missing a newline at the end
- // In order to avoid having %%EOF the first object on the same line
- // as the %%EOF, we put a newline here. If there's already one at
- // the end of the file, an extra one won't hurt. PDFBOX-1051
- getStandardOutput().writeCRLF();
- }
-
- doWriteBody(doc);
-
- // get the previous trailer
- COSDictionary trailer = doc.getTrailer();
- long hybridPrev = -1;
-
- if (trailer != null) {
- hybridPrev = trailer.getLong(COSName.XREF_STM);
- }
-
- if (incrementalUpdate || doc.isXRefStream()) {
- doWriteXRefInc(doc, hybridPrev);
- } else {
- doWriteXRefTable();
- doWriteTrailer(doc);
- }
-
- // write endof
- getStandardOutput().write(STARTXREF);
- getStandardOutput().writeEOL();
- getStandardOutput().write(
- String.valueOf(getStartxref()).getBytes(StandardCharsets.ISO_8859_1));
- getStandardOutput().writeEOL();
- getStandardOutput().write(EOF);
- getStandardOutput().writeEOL();
-
- if (incrementalUpdate) {
- if (signatureOffset == 0 || byteRangeOffset == 0) {
- doWriteIncrement();
- } else {
- doWriteSignature();
- }
- }
- }
-
- @Override
- public void visitFromFloat(COSFloat obj) throws IOException {
- obj.writePDF(getStandardOutput());
-
- }
-
- @Override
- public void visitFromInt(COSInteger obj) throws IOException {
- obj.writePDF(getStandardOutput());
- }
-
- @Override
- public void visitFromName(COSName obj) throws IOException {
- obj.writePDF(getStandardOutput());
- }
-
- @Override
- public void visitFromNull(COSNull obj) throws IOException {
- obj.writePDF(getStandardOutput());
- }
-
- /**
- * visitFromObjRef method comment.
- *
- * @param obj The object that is being visited.
- * @throws IOException If there is an exception while visiting this object.
- */
- public void writeReference(COSBase obj) throws IOException {
- COSObjectKey key = getObjectKey(obj);
- float randomThreshold = config.getRandomizeRefNumbers();
- float r = random.nextFloat();
- if (randomThreshold > 0.0f && r < randomThreshold) {
- long num = random.nextInt(roughNumberOfObjects);
- LOG.debug("corrupting ref number: " + key.getNumber() + " -> " + num);
- getStandardOutput().write(String.valueOf(num).getBytes(StandardCharsets.ISO_8859_1));
- } else {
- getStandardOutput().write(
- String.valueOf(key.getNumber()).getBytes(StandardCharsets.ISO_8859_1));
-
- }
- getStandardOutput().write(SPACE);
- getStandardOutput().write(
- String.valueOf(key.getGeneration()).getBytes(StandardCharsets.ISO_8859_1));
- getStandardOutput().write(SPACE);
- getStandardOutput().write(REFERENCE);
- }
-
- @Override
- public void visitFromStream(COSStream obj) throws IOException {
- if (willEncrypt) {
- pdDocument.getEncryption().getSecurityHandler()
- .encryptStream(obj, currentObjectKey.getNumber(),
- currentObjectKey.getGeneration());
- }
-
- InputStream input = null;
- try {
- // write the stream content
- visitFromDictionary(obj);
- getStandardOutput().write(STREAM);
- getStandardOutput().writeCRLF();
-
- input = obj.createRawInputStream();
- IOUtils.copy(input, getStandardOutput());
-
- getStandardOutput().writeCRLF();
- getStandardOutput().write(ENDSTREAM);
- getStandardOutput().writeEOL();
- } finally {
- if (input != null) {
- input.close();
- }
- }
-
- }
-
- @Override
- public void visitFromString(COSString obj) throws IOException {
- if (willEncrypt) {
- pdDocument.getEncryption().getSecurityHandler()
- .encryptString(obj, currentObjectKey.getNumber(),
- currentObjectKey.getGeneration());
- }
- COSWriter.writeString(obj, getStandardOutput());
- }
-
- /**
- * This will write the pdf document. }
- *
- * @param doc The document to write.
- * @throws IOException If an error occurs while generating the data.
- */
- public void write(COSDocument doc) throws IOException {
- PDDocument pdDoc = new PDDocument(doc);
- write(pdDoc);
- }
-
- /**
- * This will write the pdf document. If signature should be created externally,
- * {@link #writeExternalSignature(byte[])} should be invoked to set signature after calling this method.
- *
- * @param doc The document to write.
- * @throws IOException If an error occurs while generating the data.
- */
- public void write(PDDocument doc) throws IOException {
- write(doc, null);
- }
-
- /**
- * This will write the pdf document. If signature should be created externally,
- * {@link #writeExternalSignature(byte[])} should be invoked to set signature after calling this method.
- *
- * @param doc The document to write.
- * @param signInterface class to be used for signing; {@code null} if external signing would be performed
- * or there will be no signing at all
- * @throws IOException If an error occurs while generating the data.
- * @throws IllegalStateException If the document has an encryption dictionary but no protection
- * policy.
- */
- public void write(PDDocument doc, SignatureInterface signInterface) throws IOException {
- long idTime =
- doc.getDocumentId() == null ? System.currentTimeMillis() : doc.getDocumentId();
-
- pdDocument = doc;
- signatureInterface = signInterface;
-
- if (incrementalUpdate) {
- prepareIncrement(doc);
- }
-
- // if the document says we should remove encryption, then we shouldn't encrypt
- if (doc.isAllSecurityToBeRemoved()) {
- willEncrypt = false;
- // also need to get rid of the "Encrypt" in the trailer so readers
- // don't try to decrypt a document which is not encrypted
- COSDocument cosDoc = doc.getDocument();
- COSDictionary trailer = cosDoc.getTrailer();
- trailer.removeItem(COSName.ENCRYPT);
- } else {
- if (pdDocument.getEncryption() != null) {
- if (!incrementalUpdate) {
- SecurityHandler securityHandler =
- pdDocument.getEncryption().getSecurityHandler();
- if (!securityHandler.hasProtectionPolicy()) {
- throw new IllegalStateException(
- "PDF contains an encryption dictionary, please remove it with " +
- "setAllSecurityToBeRemoved() or set a protection policy with protect()");
- }
- securityHandler.prepareDocumentForEncryption(pdDocument);
- }
- willEncrypt = true;
- } else {
- willEncrypt = false;
- }
- }
-
- COSDocument cosDoc = pdDocument.getDocument();
- COSDictionary trailer = cosDoc.getTrailer();
- COSArray idArray;
- boolean missingID = true;
- COSBase base = trailer.getDictionaryObject(COSName.ID);
- if (base instanceof COSArray) {
- idArray = (COSArray) base;
- if (idArray.size() == 2) {
- missingID = false;
- }
- } else {
- idArray = new COSArray();
- }
- if (missingID || incrementalUpdate) {
- MessageDigest md5;
- try {
- md5 = MessageDigest.getInstance("MD5");
- } catch (NoSuchAlgorithmException e) {
- // should never happen
- throw new RuntimeException(e);
- }
-
- // algorithm says to use time/path/size/values in doc to generate the id.
- // we don't have path or size, so do the best we can
- md5.update(Long.toString(idTime).getBytes(StandardCharsets.ISO_8859_1));
-
- COSDictionary info = trailer.getCOSDictionary(COSName.INFO);
- if (info != null) {
- for (COSBase cosBase : info.getValues()) {
- md5.update(cosBase.toString().getBytes(StandardCharsets.ISO_8859_1));
- }
- }
- // reuse origin documentID if available as first value
- COSString firstID =
- missingID ? new COSString(md5.digest()) : (COSString) idArray.get(0);
- // it's ok to use the same ID for the second part if the ID is created for the first time
- COSString secondID = missingID ? firstID : new COSString(md5.digest());
- idArray = new COSArray();
- idArray.add(firstID);
- idArray.add(secondID);
- trailer.setItem(COSName.ID, idArray);
- }
- cosDoc.accept(this);
- }
-
- /**
- * This will write the fdf document.
- *
- * @param doc The document to write.
- * @throws IOException If an error occurs while generating the data.
- */
- public void write(FDFDocument doc) throws IOException {
- fdfDocument = doc;
- willEncrypt = false;
- COSDocument cosDoc = fdfDocument.getDocument();
- cosDoc.accept(this);
- }
-
- private byte[] getBytes(OutputStream stream) throws IOException {
- if (stream instanceof ByteArrayOutputStream) {
- return ((ByteArrayOutputStream) stream).toByteArray();
- } else if (stream instanceof UnsynchronizedByteArrayOutputStream) {
- return ((UnsynchronizedByteArrayOutputStream) stream).toByteArray();
- }
- throw new IOException("OutputStream " + stream.getClass().getName() + " is not supported");
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java
deleted file mode 100644
index d4edac7..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformer.java
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.pdf;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.pdfbox.Loader;
-import org.apache.pdfbox.io.RandomAccessReadBuffer;
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.fuzzing.exceptions.CantFuzzException;
-import org.apache.tika.mime.MediaType;
-
-public class PDFTransformer implements Transformer {
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("pdf"));
- private PDFTransformerConfig config = new PDFTransformerConfig();
-
- @Override
- public Set<MediaType> getSupportedTypes() {
- return SUPPORTED_TYPES;
- }
-
- @Override
- public void transform(InputStream is, OutputStream os) throws IOException, TikaException {
- try (PDDocument pdDocument = Loader.loadPDF(new RandomAccessReadBuffer(is))) {
- //some docs have security which prevents mods and writing
- //given our purposes here, we should remove security
- pdDocument.setAllSecurityToBeRemoved(true);
- try (EvilCOSWriter cosWriter = new EvilCOSWriter(os, config)) {
- cosWriter.write(pdDocument);
- }
- } catch (InvalidPasswordException e) {
- throw new CantFuzzException("encrypted doc");
- }
- }
-
- public void setConfig(PDFTransformerConfig pdfTransformerConfig) {
- this.config = pdfTransformerConfig;
- }
-}
diff --git a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java b/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java
deleted file mode 100644
index a494d4a..0000000
--- a/tika-fuzzing/src/main/java/org/apache/tika/fuzzing/pdf/PDFTransformerConfig.java
+++ /dev/null
@@ -1,192 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.fuzzing.pdf;
-
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Random;
-import java.util.Set;
-
-import org.apache.pdfbox.cos.COSArray;
-import org.apache.pdfbox.cos.COSBase;
-import org.apache.pdfbox.cos.COSName;
-
-import org.apache.tika.fuzzing.Transformer;
-import org.apache.tika.fuzzing.general.ByteDeleter;
-import org.apache.tika.fuzzing.general.ByteFlipper;
-import org.apache.tika.fuzzing.general.ByteInjector;
-import org.apache.tika.fuzzing.general.GeneralTransformer;
-import org.apache.tika.fuzzing.general.SpanSwapper;
-import org.apache.tika.fuzzing.general.Truncator;
-
-public class PDFTransformerConfig {
-
- private final Random random = new Random();
-
- private float randomizeObjectNumbers = -1.0f;
-
- private float randomizeRefNumbers = -1.0f;
-
- private int maxFilters = 1;
- private int minFilters = 1;
-
- private long maxFilteredStreamLength = -1;
-
- private Set<COSName> allowableFilters = new HashSet<>();
-
- private Transformer streamTransformer =
- new GeneralTransformer(1, new ByteDeleter(), new ByteFlipper(), new ByteInjector(),
- new SpanSwapper(), new Truncator());
-
- private Transformer unfilteredStreamTransformer =
- new GeneralTransformer(1, new ByteDeleter(), new ByteFlipper(), new ByteInjector(),
- new SpanSwapper(), new Truncator());
-
- public float getRandomizeObjectNumbers() {
- return randomizeObjectNumbers;
- }
-
- /**
- * @param randomizeObjectNumbers probability that a given object number will be randomized.
- * If < 0, this will be ignored.
- */
- public void setRandomizeObjectNumbers(float randomizeObjectNumbers) {
- this.randomizeObjectNumbers = randomizeObjectNumbers;
- }
-
- public float getRandomizeRefNumbers() {
- return randomizeRefNumbers;
- }
-
- /**
- * @param randomizeRefNumbers probability that a given reference number will be randomized.
- * If < 0, this will be ignored.
- */
- public void setRandomizeRefNumbers(float randomizeRefNumbers) {
- this.randomizeRefNumbers = randomizeRefNumbers;
- }
-
- public Transformer getUnfilteredStreamTransformer() {
- return unfilteredStreamTransformer;
- }
-
- /**
- * This transformer is applied to the stream _before_ any filters
- * are applied.
- *
- * @param transformer
- */
- public void setUnfilteredStreamTransformer(Transformer transformer) {
- this.unfilteredStreamTransformer = transformer;
- }
-
- public Transformer getStreamTransformer() {
- return streamTransformer;
- }
-
- /**
- * This transformer is applied to the stream _after_ each filter has been applied.
- *
- * @param transformer
- */
- public void setStreamTransformer(Transformer transformer) {
- this.streamTransformer = transformer;
- }
-
- /**
- * @param maxFilters maximum number of filters to apply
- */
- public void setMaxFilters(int maxFilters) {
- this.maxFilters = maxFilters;
- }
-
- /**
- * Which filters are allowed
- *
- * @return
- */
- public Set<COSName> getAllowableFilters() {
- return allowableFilters;
- }
-
- public void setAllowableFilters(Set<COSName> allowableFilters) {
- this.allowableFilters = allowableFilters;
- }
-
- /**
- * If {@link #maxFilters} > 0, this will randomly select filters given
- * the {@link #maxFilters} and {@link #minFilters}. If {@link #maxFilters} < 0,
- * this will return the existing filters.
- *
- * @param existingFilters
- * @return
- */
- public List<COSName> getFilters(COSBase existingFilters) {
- if (maxFilters < 0) {
- List<COSName> ret = new ArrayList<>();
- if (existingFilters instanceof COSArray) {
- for (COSBase obj : ((COSArray) existingFilters)) {
- ret.add((COSName) obj);
- }
- } else if (existingFilters instanceof COSName) {
- ret.add((COSName) existingFilters);
- }
- return ret;
- }
-
- int numFilters;
- if (maxFilters - minFilters == 0) {
- numFilters = maxFilters;
- } else {
- numFilters = minFilters + random.nextInt(maxFilters - minFilters);
- }
-
- List<COSName> allowable = new ArrayList<>(allowableFilters);
-
- List<COSName> filters = new ArrayList<>();
- for (int i = 0; i < numFilters; i++) {
- int index = random.nextInt(allowable.size());
- filters.add(allowable.get(index));
- }
- return filters;
- }
-
- /**
- * Minimum number of filters to apply to streams.
- *
- * @param minFilters
- */
- public void setMinFilters(int minFilters) {
- this.minFilters = minFilters;
- }
-
- public long getMaxFilteredStreamLength() {
- return maxFilteredStreamLength;
- }
-
- /**
- * Maximum filtered stream length. AsciiHex doubles the size of the stream with
- * each encoding. This is used as a circuit breaker to stop adding filters
- * if the stream goes above a given length.
- *
- * @param maxFilteredStreamLength
- */
- public void setMaxFilteredStreamLength(long maxFilteredStreamLength) {
- this.maxFilteredStreamLength = maxFilteredStreamLength;
- }
-}
diff --git a/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer b/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer
deleted file mode 100644
index 07390de..0000000
--- a/tika-fuzzing/src/main/resources/META-INF/services/org.apache.tika.fuzzing.Transformer
+++ /dev/null
@@ -1,17 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements. See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-org.apache.tika.fuzzing.general.GeneralTransformer
-#org.apache.tika.fuzzing.pdf.PDFTransformer
\ No newline at end of file
diff --git a/tika-fuzzing/src/main/resources/log4j2.xml b/tika-fuzzing/src/main/resources/log4j2.xml
deleted file mode 100644
index 94ac22b..0000000
--- a/tika-fuzzing/src/main/resources/log4j2.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<Configuration status="WARN">
- <Appenders>
- <Console name="Console" target="SYSTEM_ERR">
- <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
- </Console>
- </Appenders>
- <Loggers>
- <Root level="info">
- <AppenderRef ref="Console"/>
- </Root>
- <Logger name="org.apache.tika.pipes" level="error" additivity="false">
- <AppenderRef ref="Console"/>
- </Logger>
- <Logger name="com.github.junrar" level="error" additivity="false">
- <AppenderRef ref="Console"/>
- </Logger>
- </Loggers>
-</Configuration>
diff --git a/tika-fuzzing/src/test/java/TestFuzzingCLI.java b/tika-fuzzing/src/test/java/TestFuzzingCLI.java
deleted file mode 100644
index 9e3e49d..0000000
--- a/tika-fuzzing/src/test/java/TestFuzzingCLI.java
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-
-import org.apache.commons.io.FileUtils;
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.fuzzing.cli.FuzzingCLI;
-import org.apache.tika.utils.ProcessUtils;
-
-public class TestFuzzingCLI {
-
- @Test
- @Disabled
- public void testBasic() throws Exception {
- //convert to actual unit test
- String inputDir = "";// fill in
- String outputDir = "";//fill in
- String[] args = new String[]{"-i", inputDir, "-o", outputDir, "-n", "8", // num threads
- "-t", "1", //max transformers
- "-p", "100", //per file iterations
- "-r", "3"};
- FuzzingCLI.main(args);
- }
-
- @Test
- @Disabled
- public void testMock() throws Exception {
- //convert to actual unit test
- Path inputDir = Paths.get(getClass().getResource("/test-documents").toURI());
- Path outputDir = Files.createTempDirectory("tika-fuzzing-");
- String[] args = new String[]{"-i",
- ProcessUtils.escapeCommandLine(inputDir.toAbsolutePath().toString()), "-o",
- ProcessUtils.escapeCommandLine(outputDir.toAbsolutePath().toString()), "-n", "8",
- // num threads
- "-t", "0", //max transformers
- "-p", "10", //per file iterations
- "-m", "10000", //max ms per file
- "-r", "3"};
- try {
- FuzzingCLI.main(args);
- } finally {
- FileUtils.deleteDirectory(outputDir.toFile());
- }
- }
-}
diff --git a/tika-fuzzing/src/test/java/TestTransformer.java b/tika-fuzzing/src/test/java/TestTransformer.java
deleted file mode 100644
index 3adc4e3..0000000
--- a/tika-fuzzing/src/test/java/TestTransformer.java
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.ByteArrayInputStream;
-import java.io.ByteArrayOutputStream;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.nio.file.Paths;
-import java.util.Arrays;
-
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-
-import org.apache.tika.fuzzing.general.GeneralTransformer;
-
-public class TestTransformer {
-
- @Test
- @Disabled
- public void testBasic() throws Exception {
- //turn into actual unit test
- Path path = Paths.get("");//put something meaningful here
-
- GeneralTransformer transformer = new GeneralTransformer();
- byte[] bytes = Files.readAllBytes(path);
-
- for (int i = 0; i < 100; i++) {
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- transformer.transform(new ByteArrayInputStream(bytes), bos);
-
- if (Arrays.equals(bos.toByteArray(), bytes)) {
- System.out.println("SAME");
- }
- }
- }
-}
diff --git a/tika-fuzzing/src/test/resources/configs/tika-fuzzing-config.xml b/tika-fuzzing/src/test/resources/configs/tika-fuzzing-config.xml
deleted file mode 100644
index 2210ae6..0000000
--- a/tika-fuzzing/src/test/resources/configs/tika-fuzzing-config.xml
+++ /dev/null
@@ -1,57 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<!-- this is an example configuration file to run the fuzzer against
- an input directory. Make sure to specify the input file directory
- in the base paths. We need the "empty" fetchers and emitters to
- handle the temp files that are created via fuzzing-->
-<properties>
- <fetchers>
- <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
- <name>fsf</name>
- <basePath>{FILL_IN_HERE}</basePath>
- </fetcher>
- <fetcher class="org.apache.tika.pipes.fetcher.fs.FileSystemFetcher">
- <name>temp</name>
- </fetcher>
- </fetchers>
- <emitters>
- <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
- <name>fse</name>
- <basePath>{FILL_IN_HERE}</basePath>
- </emitter>
- <emitter class="org.apache.tika.pipes.emitter.fs.FileSystemEmitter">
- <name>temp</name>
- </emitter>
- </emitters>
- <pipesIterator class="org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator">
- <basePath>{FILL_IN_HERE}</basePath>
- <fetcherName>fsf</fetcherName>
- <emitterName>fse</emitterName>
- </pipesIterator>
- <pipes>
- <numClients>5</numClients>
- <forkedJvmArgs>
- <arg>-Xmx1g</arg>
- <arg>-XX:ParallelGCThreads=2</arg>
- <arg>-Dlog4j.configurationFile={FILL_IN_HERE}</arg>
- </forkedJvmArgs>
- <timeoutMillis>10000</timeoutMillis>
- </pipes>
-</properties>
\ No newline at end of file
diff --git a/tika-fuzzing/src/test/resources/log4j2.xml b/tika-fuzzing/src/test/resources/log4j2.xml
deleted file mode 100644
index eaeca67..0000000
--- a/tika-fuzzing/src/test/resources/log4j2.xml
+++ /dev/null
@@ -1,42 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no" ?>
-
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-<Configuration status="WARN">
- <Appenders>
- <Console name="Console" target="SYSTEM_ERR">
- <PatternLayout pattern="%-5p [%t] %d{HH:mm:ss,SSS} %c %m%n"/>
- </Console>
- </Appenders>
- <Loggers>
- <Root level="info">
- <AppenderRef ref="Console"/>
- </Root>
- <Logger name="org.apache.tika.pipes" level="error" additivity="false">
- <AppenderRef ref="Console"/>
- </Logger>
- <Logger name="com.github.junrar" level="error" additivity="false">
- <AppenderRef ref="Console"/>
- </Logger>
- <Logger name="org.apache.pdfbox" level="fatal" additivity="false">
- <AppenderRef ref="Console"/>
- </Logger>
-
- </Loggers>
-</Configuration>
\ No newline at end of file
diff --git a/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml b/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml
deleted file mode 100644
index c9e028a..0000000
--- a/tika-fuzzing/src/test/resources/test-documents/heavy_hang.xml
+++ /dev/null
@@ -1,25 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-<mock>
- <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
- <write element="p">some content</write>
- <hang millis="30000" heavy="true" pulse_millis="100"/>
-</mock>
\ No newline at end of file
diff --git a/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml b/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml
deleted file mode 100644
index e497da5..0000000
--- a/tika-fuzzing/src/test/resources/test-documents/null_pointer.xml
+++ /dev/null
@@ -1,25 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-<mock>
- <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
- <write element="p">some content</write>
- <throw class="java.lang.NullPointerException">another null pointer exception</throw>
-</mock>
\ No newline at end of file
diff --git a/tika-fuzzing/src/test/resources/test-documents/system_exit.xml b/tika-fuzzing/src/test/resources/test-documents/system_exit.xml
deleted file mode 100644
index 52feede..0000000
--- a/tika-fuzzing/src/test/resources/test-documents/system_exit.xml
+++ /dev/null
@@ -1,25 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one
- or more contributor license agreements. See the NOTICE file
- distributed with this work for additional information
- regarding copyright ownership. The ASF licenses this file
- to you under the Apache License, Version 2.0 (the
- "License"); you may not use this file except in compliance
- with the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing,
- software distributed under the License is distributed on an
- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- KIND, either express or implied. See the License for the
- specific language governing permissions and limitations
- under the License.
--->
-
-<mock>
- <metadata action="add" name="author">Nikolai Lobachevsky</metadata>
- <write element="p">some content</write>
- <system_exit/>
-</mock>
\ No newline at end of file