Merge branch 'master' of https://github.com/apache/tika into multiple-parsers
diff --git a/.gitignore b/.gitignore
index d8e7384..7c3e3e6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,6 +13,7 @@
*.DS_Store
*.tmp-inception
*.snap
+.*.swp
tika-deployment/tika-snap-app/parts/
tika-deployment/tika-snap-app/prime/
tika-deployment/tika-snap-app/snap/
diff --git a/tika-core/src/main/java/org/apache/tika/config/Param.java b/tika-core/src/main/java/org/apache/tika/config/Param.java
index 112955b..232521d 100644
--- a/tika-core/src/main/java/org/apache/tika/config/Param.java
+++ b/tika-core/src/main/java/org/apache/tika/config/Param.java
@@ -17,6 +17,7 @@
package org.apache.tika.config;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.multiple.AbstractMultipleParser;
import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
@@ -53,6 +54,7 @@
private static final Map<Class<?>, String> map = new HashMap<>();
private static final Map<String, Class<?>> reverseMap = new HashMap<>();
+ private static final Map<String, Class<?>> wellKnownMap = new HashMap<>();
static {
map.put(Boolean.class, "bool");
@@ -70,6 +72,7 @@
for (Map.Entry<Class<?>, String> entry : map.entrySet()) {
reverseMap.put(entry.getValue(), entry.getKey());
}
+ wellKnownMap.put("metadataPolicy", AbstractMultipleParser.MetadataPolicy.class);
}
private Class<T> type;
@@ -87,6 +90,10 @@
this.name = name;
this.type = type;
this.value = value.toString();
+
+ if (this.type == null) {
+ this.type = (Class<T>)wellKnownMap.get(name);
+ }
}
public Param(String name, T value){
@@ -183,11 +190,19 @@
Node nameAttr = node.getAttributes().getNamedItem("name");
Node typeAttr = node.getAttributes().getNamedItem("type");
+ Node valueAttr = node.getAttributes().getNamedItem("value");
Node value = node.getFirstChild();
+ if (valueAttr != null && (value == null || value.getTextContent() == null)) {
+ value = valueAttr;
+ }
Param<T> ret = new Param<T>();
ret.name = nameAttr.getTextContent();
- ret.setTypeString(typeAttr.getTextContent());
+ if (typeAttr != null) {
+ ret.setTypeString(typeAttr.getTextContent());
+ } else {
+ ret.type = (Class<T>)wellKnownMap.get(ret.name);
+ }
ret.value = value.getTextContent();
return ret;
@@ -205,6 +220,11 @@
private static <T> T getTypedValue(Class<T> type, String value) {
try {
+ if (type.isEnum()) {
+ Object val = Enum.valueOf((Class)type, value);
+ return (T)val;
+ }
+
Constructor<T> constructor = type.getConstructor(String.class);
constructor.setAccessible(true);
return constructor.newInstance(value);
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index 5b1b1ab..466d2bc 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -63,6 +63,7 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.multiple.AbstractMultipleParser;
import org.apache.tika.utils.AnnotationUtils;
import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
@@ -537,8 +538,8 @@
abstract CT createDefault(MimeTypes mimeTypes, ServiceLoader loader);
abstract CT createComposite(List<T> loaded, MimeTypes mimeTypes, ServiceLoader loader);
abstract T createComposite(Class<? extends T> compositeClass,
- List<T> children, Set<Class<? extends T>> excludeChildren,
- MimeTypes mimeTypes, ServiceLoader loader)
+ List<T> children, Set<Class<? extends T>> excludeChildren,
+ Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException, InstantiationException;
abstract T decorate(T created, Element element)
throws IOException, TikaException; // eg explicit mime types
@@ -596,6 +597,9 @@
loaded = preLoadOne(loadedClass, name, mimeTypes);
if (loaded != null) return loaded;
+ // Get any parameters / settings for the parser
+ Map<String, Param> params = getParams(element);
+
// Is this a composite or decorated class? If so, support recursion
if (isComposite(loadedClass)) {
// Get the child objects for it
@@ -621,7 +625,7 @@
}
// Create the Composite
- loaded = createComposite(loadedClass, children, excludeChildren, mimeTypes, loader);
+ loaded = createComposite(loadedClass, children, excludeChildren, params, mimeTypes, loader);
// Default constructor fallback
if (loaded == null) {
@@ -634,7 +638,6 @@
// See the thread "Configuring parsers and translators" for details
}
- Map<String, Param> params = getParams(element);
//Assigning the params to bean fields/setters
AnnotationUtils.assignFieldParams(loaded, params);
if (loaded instanceof Initializable) {
@@ -738,6 +741,7 @@
@Override
boolean isComposite(Class<? extends Parser> loadedClass) {
if (CompositeParser.class.isAssignableFrom(loadedClass) ||
+ AbstractMultipleParser.class.isAssignableFrom(loadedClass) ||
ParserDecorator.class.isAssignableFrom(loadedClass)) {
return true;
}
@@ -755,7 +759,7 @@
@Override
Parser createComposite(Class<? extends Parser> parserClass,
List<Parser> childParsers, Set<Class<? extends Parser>> excludeParsers,
- MimeTypes mimeTypes, ServiceLoader loader)
+ Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException, InstantiationException {
Parser parser = null;
Constructor<? extends Parser> c = null;
@@ -785,6 +789,12 @@
}
if (parser == null) {
try {
+ c = parserClass.getConstructor(MediaTypeRegistry.class, Collection.class, Map.class);
+ parser = c.newInstance(registry, childParsers, params);
+ } catch (NoSuchMethodException me) {}
+ }
+ if (parser == null) {
+ try {
c = parserClass.getConstructor(MediaTypeRegistry.class, List.class);
parser = c.newInstance(registry, childParsers);
} catch (NoSuchMethodException me) {}
@@ -878,7 +888,7 @@
Detector createComposite(Class<? extends Detector> detectorClass,
List<Detector> childDetectors,
Set<Class<? extends Detector>> excludeDetectors,
- MimeTypes mimeTypes, ServiceLoader loader)
+ Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException,
InstantiationException {
Detector detector = null;
@@ -951,7 +961,7 @@
Translator createComposite(Class<? extends Translator> compositeClass,
List<Translator> children,
Set<Class<? extends Translator>> excludeChildren,
- MimeTypes mimeTypes, ServiceLoader loader)
+ Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException,
InstantiationException {
throw new InstantiationException("Only one translator supported");
@@ -968,7 +978,7 @@
Class<? extends ConfigurableThreadPoolExecutor> compositeClass,
List<ConfigurableThreadPoolExecutor> children,
Set<Class<? extends ConfigurableThreadPoolExecutor>> excludeChildren,
- MimeTypes mimeTypes, ServiceLoader loader)
+ Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException,
InstantiationException {
throw new InstantiationException("Only one executor service supported");
@@ -1090,7 +1100,7 @@
EncodingDetector createComposite(Class<? extends EncodingDetector> encodingDetectorClass,
List<EncodingDetector> childEncodingDetectors,
Set<Class<? extends EncodingDetector>> excludeDetectors,
- MimeTypes mimeTypes, ServiceLoader loader)
+ Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException,
InstantiationException {
EncodingDetector encodingDetector = null;
diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index ea3968e..c5c95a6 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -23,6 +23,7 @@
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.utils.ParserUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -271,11 +272,7 @@
TikaInputStream taggedStream = TikaInputStream.get(stream, tmp);
TaggedContentHandler taggedHandler =
handler != null ? new TaggedContentHandler(handler) : null;
- if (parser instanceof ParserDecorator){
- metadata.add("X-Parsed-By", ((ParserDecorator) parser).getWrappedParser().getClass().getName());
- } else {
- metadata.add("X-Parsed-By", parser.getClass().getName());
- }
+ ParserUtils.recordParserDetails(parser, metadata);
try {
parser.parse(taggedStream, taggedHandler, metadata, context);
} catch (RuntimeException e) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java b/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
index 55c33e9..3cc07c1 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
@@ -23,10 +23,11 @@
import java.util.Set;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TemporaryResources;
-import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
+import org.apache.tika.parser.multiple.FallbackParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -98,59 +99,17 @@
/**
* Decorates the given parsers into a virtual parser, where they'll
* be tried in preference order until one works without error.
- * TODO Is this the right name?
- * TODO Is this the right place to put this? Should it be in CompositeParser? Elsewhere?
- * TODO Should we reset the Metadata if we try another parser?
- * TODO Should we reset the ContentHandler if we try another parser?
- * TODO Should we log/report failures anywhere?
- * @deprecated Do not use until the TODOs are resolved, see TIKA-1509
+ * @deprecated This has been replaced by {@link FallbackParser}
*/
public static final Parser withFallbacks(
final Collection<? extends Parser> parsers, final Set<MediaType> types) {
- Parser parser = EmptyParser.INSTANCE;
- if (!parsers.isEmpty()) parser = parsers.iterator().next();
+ // Delegate to the new FallbackParser for now, until people upgrade
+ // Keep old behaviour on metadata, which was to preserve all
+ MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry();
+ Parser p = new FallbackParser(registry, MetadataPolicy.KEEP_ALL, parsers);
- return new ParserDecorator(parser) {
- private static final long serialVersionUID = 1625187131782069683L;
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return types;
- }
- @Override
- public void parse(InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- // Must have a TikaInputStream, so we can re-use it if parsing fails
- // Need to close internally created tstream to release resources
- TemporaryResources tmp = (TikaInputStream.isTikaInputStream(stream)) ? null
- : new TemporaryResources();
- try {
- TikaInputStream tstream =
- TikaInputStream.get(stream, tmp);
- tstream.getFile();
- // Try each parser in turn
- for (Parser p : parsers) {
- tstream.mark(-1);
- try {
- p.parse(tstream, handler, metadata, context);
- return;
- } catch (Exception e) {
- // TODO How to log / record this failure?
- }
- // Prepare for the next parser, if present
- tstream.reset();
- }
- } finally {
- if (tmp != null) {
- tmp.dispose();
- }
- }
- }
- @Override
- public String getDecorationName() {
- return "With Fallback";
- }
- };
+ if (types == null || types.isEmpty()) return p;
+ return withTypes(p, types);
}
/**
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index a80bf15..ad0ff3f 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -31,7 +31,7 @@
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.ContentHandlerFactory;
-import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.ParserUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -84,8 +84,7 @@
public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED =
Property.internalBoolean(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");
- public final static Property EMBEDDED_EXCEPTION =
- Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+ public final static Property EMBEDDED_EXCEPTION = ParserUtils.EMBEDDED_EXCEPTION;
//move this to TikaCoreProperties?
public final static Property EMBEDDED_RESOURCE_PATH =
Property.internalText(TikaCoreProperties.TIKA_META_PREFIX+"embedded_resource_path");
@@ -170,7 +169,7 @@
if (hitMaxEmbeddedResources) {
metadata.set(EMBEDDED_RESOURCE_LIMIT_REACHED, "true");
}
- metadatas.add(0, deepCopy(metadata));
+ metadatas.add(0, ParserUtils.cloneMetadata(metadata));
}
}
@@ -227,23 +226,6 @@
}
}
- //defensive copy
- private Metadata deepCopy(Metadata m) {
- Metadata clone = new Metadata();
-
- for (String n : m.names()){
- if (! m.isMultiValued(n)) {
- clone.set(n, m.get(n));
- } else {
- String[] vals = m.getValues(n);
- for (int i = 0; i < vals.length; i++) {
- clone.add(n, vals[i]);
- }
- }
- }
- return clone;
- }
-
private String getResourceName(Metadata metadata) {
String objectName = "";
if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) {
@@ -321,16 +303,14 @@
metadata.add(WRITE_LIMIT_REACHED, "true");
} else {
if (catchEmbeddedExceptions) {
- String trace = ExceptionUtils.getStackTrace(e);
- metadata.set(EMBEDDED_EXCEPTION, trace);
+ ParserUtils.recordParserFailure(this, e, metadata);
} else {
throw e;
}
}
} catch (TikaException e) {
if (catchEmbeddedExceptions) {
- String trace = ExceptionUtils.getStackTrace(e);
- metadata.set(EMBEDDED_EXCEPTION, trace);
+ ParserUtils.recordParserFailure(this, e, metadata);
} else {
throw e;
}
@@ -349,9 +329,7 @@
return;
}
addContent(localHandler, metadata);
- metadatas.add(deepCopy(metadata));
+ metadatas.add(ParserUtils.cloneMetadata(metadata));
}
}
-
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
new file mode 100644
index 0000000..d687e41
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -0,0 +1,357 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import static org.apache.tika.utils.ParserUtils.cloneMetadata;
+import static org.apache.tika.utils.ParserUtils.recordParserDetails;
+import static org.apache.tika.utils.ParserUtils.recordParserFailure;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.tika.config.Param;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.sax.ContentHandlerFactory;
+import org.apache.tika.utils.ParserUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Abstract base class for parser wrappers which may / will
+ * process a given stream multiple times, merging the results
+ * of the various parsers used.
+ * End users should normally use {@link FallbackParser} or
+ * {@link SupplementingParser} along with a Strategy.
+ * Note that unless you give a {@link ContentHandlerFactory},
+ * you'll get content from every parser tried mushed together!
+ *
+ * @since Apache Tika 1.18
+ */
+public abstract class AbstractMultipleParser extends AbstractParser {
+ /**
+ * Serial version UID.
+ */
+ private static final long serialVersionUID = 5383668090329836559L;
+
+ /**
+ * The various strategies for handling metadata emitted by
+ * multiple parsers.
+ * Note that not all will be supported by all subclasses.
+ */
+ public enum MetadataPolicy {
+ /**
+ * Before moving onto another parser, throw away
+ * all previously seen metadata
+ */
+ DISCARD_ALL,
+ /**
+ * The first parser to output a given key wins,
+ * merge in non-clashing other keys
+ */
+ FIRST_WINS,
+ /**
+ * The last parser to output a given key wins,
+ * overriding previous parser values for a
+ * clashing key.
+ */
+ LAST_WINS,
+ /**
+ * Where multiple parsers output a given key,
+ * store all their different (unique) values
+ */
+ KEEP_ALL
+ };
+ protected static final String METADATA_POLICY_CONFIG_KEY = "metadataPolicy";
+
+ /**
+ * Media type registry.
+ */
+ private MediaTypeRegistry registry;
+
+ /**
+ * How we should handle metadata clashes
+ */
+ private MetadataPolicy policy;
+
+ /**
+ * List of the multiple parsers to try.
+ */
+ private Collection<? extends Parser> parsers;
+
+ /**
+ * Computed list of Mime Types to offer, which is all
+ * those in common between the parsers.
+ * For explicit mimetypes only, use a {@link ParserDecorator}
+ */
+ private Set<MediaType> offeredTypes;
+
+ /**
+ * Returns the media type registry used to infer type relationships.
+ *
+ * @return media type registry
+ */
+ public MediaTypeRegistry getMediaTypeRegistry() {
+ return registry;
+ }
+
+ /**
+ * Sets the media type registry used to infer type relationships.
+ *
+ * @param registry media type registry
+ */
+ public void setMediaTypeRegistry(MediaTypeRegistry registry) {
+ this.registry = registry;
+ }
+
+ @SuppressWarnings("rawtypes")
+ protected static MetadataPolicy getMetadataPolicy(Map<String, Param> params) {
+ if (params.containsKey(METADATA_POLICY_CONFIG_KEY)) {
+ return (MetadataPolicy)params.get(METADATA_POLICY_CONFIG_KEY).getValue();
+ }
+ throw new IllegalArgumentException("Required parameter '"+METADATA_POLICY_CONFIG_KEY+"' not supplied");
+ }
+ @SuppressWarnings("rawtypes")
+ public AbstractMultipleParser(MediaTypeRegistry registry,
+ Collection<? extends Parser> parsers,
+ Map<String, Param> params) {
+ this(registry, getMetadataPolicy(params), parsers);
+ }
+ public AbstractMultipleParser(MediaTypeRegistry registry, MetadataPolicy policy,
+ Parser... parsers) {
+ this(registry, policy, Arrays.asList(parsers));
+ }
+ public AbstractMultipleParser(MediaTypeRegistry registry, MetadataPolicy policy,
+ Collection<? extends Parser> parsers) {
+ this.policy = policy;
+ this.parsers = parsers;
+ this.registry = registry;
+
+ // TODO Only offer those in common to several/all parsers
+ // TODO Some sort of specialisation / subtype support
+ this.offeredTypes = new HashSet<>();
+ for (Parser parser : parsers) {
+ offeredTypes.addAll(
+ parser.getSupportedTypes(new ParseContext())
+ );
+ }
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return offeredTypes;
+ }
+
+ public MetadataPolicy getMetadataPolicy() {
+ return policy;
+ }
+
+ /**
+ * Used to allow implementations to prepare or change things
+ * before parsing occurs
+ */
+ protected void parserPrepare(Parser parser, Metadata metadata,
+ ParseContext context) {}
+
+ /**
+ * Used to notify implementations that a parser has finished
+ * or failed, and to allow them to decide whether to continue
+ * or abort further parsing.
+ */
+ protected abstract boolean parserCompleted(
+ Parser parser, Metadata metadata, ContentHandler handler,
+ ParseContext context, Exception exception);
+
+ /**
+ * Processes the given Stream through one or more parsers,
+ * resetting things between parsers as requested by policy.
+ * The actual processing is delegated to one or more {@link Parser}s.
+ *
+ * Note that you'll get text from every parser this way, to have
+ * control of which content is from which parser you need to
+ * call the method with a {@link ContentHandlerFactory} instead.
+ */
+ @Override
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ parse(stream, handler, null, metadata, context);
+ }
+ /**
+ * Processes the given Stream through one or more parsers,
+ * resetting things between parsers as requested by policy.
+ * The actual processing is delegated to one or more {@link Parser}s.
+ * You will get one ContentHandler fetched for each Parser used.
+ * TODO Do we need to return all the ContentHandler instances we created?
+ * @deprecated The {@link ContentHandlerFactory} override is still experimental
+ * and the method signature is subject to change before Tika 2.0
+ */
+ public void parse(
+ InputStream stream, ContentHandlerFactory handlers,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ parse(stream, null, handlers, metadata, context);
+ }
+ private void parse(InputStream stream,
+ ContentHandler handler, ContentHandlerFactory handlerFactory,
+ Metadata originalMetadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Track the metadata between parsers, so we can apply our policy
+ Metadata lastMetadata = cloneMetadata(originalMetadata);
+ Metadata metadata = lastMetadata;
+
+ // Start tracking resources, so we can clean up when done
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ // Ensure we'll be able to re-read safely, buffering to disk if so,
+ // to permit Parsers 2+ to be able to read the same data
+ InputStream taggedStream = ParserUtils.ensureStreamReReadable(stream, tmp);
+
+ for (Parser p : parsers) {
+ // Get a new handler for this parser, if we can
+ // If not, the user will get text from every parser
+ // mushed together onto the one solitary handler...
+ if (handlerFactory != null) {
+ handler = handlerFactory.getNewContentHandler();
+ }
+
+ // Record that we used this parser
+ recordParserDetails(p, originalMetadata);
+
+ // Prepare a near-empty Metadata; will merge after parsing
+ metadata = cloneMetadata(originalMetadata);
+
+ // Notify the implementation of what we're about to do
+ parserPrepare(p, metadata, context);
+
+ // Process if possible
+ Exception failure = null;
+ try {
+ p.parse(taggedStream, handler, metadata, context);
+ } catch (Exception e) {
+ // Record the failure such that it can't get lost / overwritten
+ recordParserFailure(p, e, originalMetadata);
+ recordParserFailure(p, e, metadata);
+ failure = e;
+ }
+
+ // Notify the implementation how it went
+ boolean tryNext = parserCompleted(p, metadata, handler, context, failure);
+
+ // Handle metadata merging / clashes
+ metadata = mergeMetadata(metadata, lastMetadata, policy);
+
+ // Abort if requested, with the exception if there was one
+ if (!tryNext) {
+ if (failure != null) {
+ if (failure instanceof IOException) throw (IOException)failure;
+ if (failure instanceof SAXException) throw (SAXException)failure;
+ if (failure instanceof TikaException) throw (TikaException)failure;
+ throw new TikaException("Unexpected RuntimeException from " + p, failure);
+ }
+ // Abort processing, don't try any more parsers
+ break;
+ }
+
+ // Prepare for the next parser, if present
+ lastMetadata = cloneMetadata(metadata);
+ taggedStream = ParserUtils.streamResetForReRead(taggedStream, tmp);
+ }
+ } finally {
+ tmp.dispose();
+ }
+
+ // Finally, copy the latest metadata back onto their supplied object
+ for (String n : metadata.names()) {
+ originalMetadata.remove(n);
+ for (String val : metadata.getValues(n)) {
+ originalMetadata.add(n, val);
+ }
+ }
+ }
+
+ protected static Metadata mergeMetadata(Metadata newMetadata, Metadata lastMetadata, MetadataPolicy policy) {
+ if (policy == MetadataPolicy.DISCARD_ALL) {
+ return newMetadata;
+ }
+
+ for (String n : lastMetadata.names()) {
+ // If this is one of the metadata keys we're setting ourselves
+ // for tracking/errors, then always keep the latest one!
+ if (n.equals(ParserUtils.X_PARSED_BY)) continue;
+ if (n.equals(ParserUtils.EMBEDDED_PARSER.getName())) continue;
+ if (n.equals(ParserUtils.EMBEDDED_EXCEPTION.getName())) continue;
+
+ // Merge as per policy
+ String[] newVals = newMetadata.getValues(n);
+ String[] oldVals = lastMetadata.getValues(n);
+ if (newVals == null || newVals.length == 0) {
+ // Metadata only in previous run, keep old values
+ for (String val : oldVals) {
+ newMetadata.add(n, val);
+ }
+ } else if (Arrays.deepEquals(oldVals, newVals)) {
+ // Metadata is the same, nothing to do
+ continue;
+ } else {
+ switch (policy) {
+ case FIRST_WINS:
+ // Use the earlier value(s) in place of this/these one/s
+ newMetadata.remove(n);
+ for (String val : oldVals) {
+ newMetadata.add(n, val);
+ }
+ continue;
+ case LAST_WINS:
+ // Most recent (last) parser has already won
+ continue;
+ case KEEP_ALL:
+ // Start with old list, then add any new unique values
+ List<String> vals = new ArrayList<>(Arrays.asList(oldVals));
+ newMetadata.remove(n);
+ for (String oldVal : oldVals) {
+ newMetadata.add(n, oldVal);
+ }
+ for (String newVal : newVals) {
+ if (! vals.contains(newVal)) {
+ newMetadata.add(n, newVal);
+ vals.add(newVal);
+ }
+ }
+
+ continue;
+ }
+ }
+ }
+ return newMetadata;
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
new file mode 100644
index 0000000..0bb1a53
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.config.Param;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Tries multiple parsers in turn, until one succeeds.
+ *
+ * Can optionally keep Metadata from failed parsers when
+ * trying the next one, depending on the {@link MetadataPolicy}
+ * chosen.
+ *
+ * @since Apache Tika 1.18
+ */
+public class FallbackParser extends AbstractMultipleParser {
+ /**
+ * Serial version UID.
+ */
+ private static final long serialVersionUID = 5844409020977206167L;
+
+ /**
+ * The different Metadata Policies we support (all)
+ */
+ public static final List<MetadataPolicy> allowedPolicies =
+ Arrays.asList(MetadataPolicy.values());
+
+ @SuppressWarnings("rawtypes")
+ public FallbackParser(MediaTypeRegistry registry,
+ Collection<? extends Parser> parsers, Map<String, Param> params) {
+ super(registry, parsers, params);
+ }
+ public FallbackParser(MediaTypeRegistry registry, MetadataPolicy policy,
+ Collection<? extends Parser> parsers) {
+ super(registry, policy, parsers);
+ }
+ public FallbackParser(MediaTypeRegistry registry, MetadataPolicy policy,
+ Parser... parsers) {
+ super(registry, policy, parsers);
+ }
+
+ @Override
+ protected boolean parserCompleted(Parser parser, Metadata metadata,
+ ContentHandler handler, ParseContext context, Exception exception) {
+ // If there was no exception, abort further parsers
+ if (exception == null) return false;
+
+ // Otherwise, try the next parser in the chain
+ return true;
+ }
+}
+
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
new file mode 100644
index 0000000..f043a5a
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/PickBestTextEncodingParser.java
@@ -0,0 +1,193 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.detect.NonDetectingEncodingDetector;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Inspired by TIKA-1443 and https://wiki.apache.org/tika/CompositeParserDiscussion
+ * this tries several different text encodings, then does the real
+ * text parsing based on which is "best".
+ *
+ * The logic for "best" needs a lot of work!
+ *
+ * This is not recommended for actual production use... It is mostly to
+ * prove that the {@link AbstractMultipleParser} environment is
+ * sufficient to support this use-case
+ *
+ * TODO Move this to the parsers package so it can get {@link TXTParser}
+ *
+ * @deprecated Currently not suitable for real use, more a demo / prototype!
+ */
+public class PickBestTextEncodingParser extends AbstractMultipleParser {
+ /**
+ * Serial version UID.
+ */
+ private static final long serialVersionUID = 730345169223211807L;
+
+ /**
+ * Which charsets we should try
+ */
+ private String[] charsetsToTry;
+
+ public PickBestTextEncodingParser(MediaTypeRegistry registry, String[] charsets) {
+ // TODO Actually give 1 more TXTParser than we have charsets
+ super(registry, MetadataPolicy.DISCARD_ALL, makeParsers(charsets));
+ this.charsetsToTry = charsets;
+ }
+    private static List<Parser> makeParsers(String[] charsets) {
+        // One more TXTParser than we have charsets, for the real thing.
+        // Note: must use add(), not set() — the list starts empty and
+        // set(i, ...) on an empty ArrayList throws IndexOutOfBoundsException.
+        List<Parser> parsers = new ArrayList<>(charsets.length+1);
+        for (int i=0; i<charsets.length+1; i++) {
+            parsers.add(new EmptyParser());
+        }
+        return parsers;
+    }
+
+ @Override
+ protected void parserPrepare(Parser parser, Metadata metadata,
+ ParseContext context) {
+ super.parserPrepare(parser, metadata, context);
+
+ // Specify which charset to try
+ String charset = context.get(CharsetTester.class).getNextCharset();
+ Charset charsetCS = Charset.forName(charset);
+ context.set(EncodingDetector.class,
+ new NonDetectingEncodingDetector(charsetCS));
+ }
+
+ @Override
+ protected boolean parserCompleted(Parser parser, Metadata metadata,
+ ContentHandler handler, ParseContext context, Exception exception) {
+ // Get the current charset
+ CharsetTester charsetTester = context.get(CharsetTester.class);
+ String charset = charsetTester.getCurrentCharset();
+
+ // Record the text
+ if (charsetTester.stillTesting()) {
+ charsetTester.charsetText.put(charset, handler.toString());
+
+ // If this was the last real charset, see which one is best
+ if (! charsetTester.moreToTest()) {
+ // TODO Properly work out the best!
+ charsetTester.pickedCharset = charsetsToTry[0];
+ }
+ }
+
+ // Always have the next parser tried
+ return true;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata originalMetadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // Use a BodyContentHandler for each of the charset test,
+ // then their real ContentHandler for the last one
+ CharsetContentHandlerFactory handlerFactory = new CharsetContentHandlerFactory();
+ handlerFactory.handler = handler;
+
+ // Put something on the ParseContext to get the charset
+ context.set(CharsetTester.class, new CharsetTester());
+
+ // Have the parsing done
+ super.parse(stream, handlerFactory, originalMetadata, context);
+ }
+ @Override
+ public void parse(InputStream stream, ContentHandlerFactory handlers,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // We only work with one ContentHandler as far as the user is
+ // concerned, any others are purely internal!
+ parse(stream, handlers.getNewContentHandler(), metadata, context);
+ }
+
+ protected class CharsetContentHandlerFactory implements ContentHandlerFactory {
+ // Which one we're on
+ private int index = -1;
+ // The real one for at the end
+ private ContentHandler handler;
+
+ @Override
+ public ContentHandler getNewContentHandler() {
+ index++;
+ if (index < charsetsToTry.length)
+ return new BodyContentHandler();
+ return handler;
+ }
+ @Override
+ public ContentHandler getNewContentHandler(OutputStream os,
+ String encoding) throws UnsupportedEncodingException {
+ return getNewContentHandler();
+ }
+ }
+
+ protected class CharsetTester {
+ /**
+ * Our current charset's index
+ */
+ private int index = -1;
+
+ /**
+ * What charset we felt was best
+ */
+ private String pickedCharset;
+ /**
+ * What text we got for each charset, so we can test for the best
+ */
+ private Map<String,String> charsetText = new HashMap<>();
+
+ protected String getNextCharset() {
+ index++;
+ return getCurrentCharset();
+ }
+ protected String getCurrentCharset() {
+ if (index < charsetsToTry.length) {
+ return charsetsToTry[index];
+ }
+ return pickedCharset;
+ }
+ protected boolean stillTesting() {
+ return index < charsetsToTry.length;
+ }
+ protected boolean moreToTest() {
+ return index < charsetsToTry.length-1;
+ }
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
new file mode 100644
index 0000000..67766da
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
@@ -0,0 +1,86 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.tika.config.Param;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaTypeRegistry;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
+
+/**
+ * Runs the input stream through all available parsers,
+ * merging the metadata from them based on the
+ * {@link MetadataPolicy} chosen.
+ *
+ * Warning - currently only one Parser should output
+ * any Content to the {@link ContentHandler}, the rest
+ * should only output {@link Metadata}. A solution to
+ * multiple-content is still being worked on...
+ *
+ * @since Apache Tika 1.18
+ */
+public class SupplementingParser extends AbstractMultipleParser {
+ /**
+ * Serial version UID.
+ */
+ private static final long serialVersionUID = 313179254565350994L;
+
+ /**
+ * The different Metadata Policies we support (not discard)
+ */
+ public static final List<MetadataPolicy> allowedPolicies =
+ Arrays.asList(MetadataPolicy.FIRST_WINS,
+ MetadataPolicy.LAST_WINS,
+ MetadataPolicy.KEEP_ALL);
+
+ @SuppressWarnings("rawtypes")
+ public SupplementingParser(MediaTypeRegistry registry,
+ Collection<? extends Parser> parsers, Map<String, Param> params) {
+ super(registry, parsers, params);
+ }
+ public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy,
+ Parser... parsers) {
+ this(registry, policy, Arrays.asList(parsers));
+ }
+ public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy,
+ Collection<? extends Parser> parsers) {
+ super(registry, policy, parsers);
+
+ // Ensure it's a supported policy
+ if (!allowedPolicies.contains(policy)) {
+ throw new IllegalArgumentException("Unsupported policy for SupplementingParser: " + policy);
+ }
+ }
+
+ @Override
+ protected boolean parserCompleted(Parser parser, Metadata metadata,
+ ContentHandler handler, ParseContext context, Exception exception) {
+ // If there was no exception, just carry on to the next
+ if (exception == null) return true;
+
+ // Have the next parser tried
+ return true;
+ }
+}
+
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
new file mode 100644
index 0000000..02958c2
--- /dev/null
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.utils;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ParserDecorator;
+
+/**
+ * Helper util methods for Parsers themselves.
+ */
+public class ParserUtils {
+ public final static String X_PARSED_BY = "X-Parsed-By";
+ public final static Property EMBEDDED_PARSER =
+ Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser");
+ public final static Property EMBEDDED_EXCEPTION =
+ Property.internalText(TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+
+ /**
+ * Does a deep clone of a Metadata object.
+ */
+ public static Metadata cloneMetadata(Metadata m) {
+ Metadata clone = new Metadata();
+
+ for (String n : m.names()){
+ if (! m.isMultiValued(n)) {
+ clone.set(n, m.get(n));
+ } else {
+ String[] vals = m.getValues(n);
+ for (int i = 0; i < vals.length; i++) {
+ clone.add(n, vals[i]);
+ }
+ }
+ }
+ return clone;
+ }
+
+ /**
+ * Identifies the real class name of the {@link Parser}, unwrapping
+ * any {@link ParserDecorator} decorations on top of it.
+ */
+ public static String getParserClassname(Parser parser) {
+ if (parser instanceof ParserDecorator){
+ return ((ParserDecorator) parser).getWrappedParser().getClass().getName();
+ } else {
+ return parser.getClass().getName();
+ }
+ }
+
+ /**
+ * Records details of the {@link Parser} used to the {@link Metadata},
+ * typically wanted where multiple parsers could be picked between
+ * or used.
+ */
+ public static void recordParserDetails(Parser parser, Metadata metadata) {
+ metadata.add(X_PARSED_BY, getParserClassname(parser));
+ }
+
+ /**
+ * Records details of a {@link Parser}'s failure to the
+ * {@link Metadata}, so you can check what went wrong even if the
+ * {@link Exception} wasn't immediately thrown (eg when several different
+ * Parsers are used)
+ */
+ public static void recordParserFailure(Parser parser, Exception failure,
+ Metadata metadata) {
+ String trace = ExceptionUtils.getStackTrace(failure);
+ metadata.add(EMBEDDED_EXCEPTION, trace);
+ metadata.add(EMBEDDED_PARSER, getParserClassname(parser));
+ }
+
+ /**
+ * Ensures that the Stream will be able to be re-read, by buffering to
+ * a temporary file if required.
+ * Streams that are automatically OK include {@link TikaInputStream}s
+ * created from Files or InputStreamFactories, and {@link RereadableInputStream}.
+ */
+ public static InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp) throws IOException {
+ // If it's re-readable, we're done
+ if (stream instanceof RereadableInputStream) return stream;
+
+ // Make sure it's a TikaInputStream
+ TikaInputStream tstream = TikaInputStream.cast(stream);
+ if (tstream == null) {
+ tstream = TikaInputStream.get(stream, tmp);
+ }
+
+ // If it's factory based, it's ok
+ if (tstream.getInputStreamFactory() != null) return tstream;
+
+ // Ensure it's file based
+ tstream.getFile();
+ // Prepare for future re-reads
+ tstream.mark(-1);
+ return tstream;
+ }
+ /**
+ * Resets the given {@link TikaInputStream} (checked by
+ * {@link #ensureStreamReReadable(InputStream, TemporaryResources)})
+ * so that it can be re-read again.
+ */
+ public static InputStream streamResetForReRead(InputStream stream, TemporaryResources tmp) throws IOException {
+ // If re-readable, rewind to start
+ if (stream instanceof RereadableInputStream) {
+ ((RereadableInputStream)stream).rewind();
+ return stream;
+ }
+
+ // File or Factory based?
+ TikaInputStream tstream = (TikaInputStream)stream;
+ if (tstream.getInputStreamFactory() != null) {
+ // Just get a fresh one each time from the factory
+ return TikaInputStream.get(tstream.getInputStreamFactory(), tmp);
+ }
+
+ // File based, reset stream to beginning of File
+ tstream.reset();
+ tstream.mark(-1);
+ return tstream;
+ }
+}
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index aa10923..2c0f14c 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -37,6 +37,7 @@
import org.apache.tika.parser.ErrorParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
+import org.apache.tika.parser.multiple.FallbackParser;
import org.junit.Test;
import static org.junit.Assert.assertEquals;
@@ -285,11 +286,28 @@
TikaConfig config = getConfig("TIKA-2389-throw-default-overridden.xml");
}
-
@Test
public void testInitializerPerParserWarn() throws Exception {
//TODO: test that this was logged at WARN level
TikaConfig config = getConfig("TIKA-2389-warn-per-parser.xml");
}
+ @Test
+ public void testMultipleWithFallback() throws Exception {
+ TikaConfig config = getConfig("TIKA-1509-multiple-fallback.xml");
+ CompositeParser parser = (CompositeParser)config.getParser();
+ assertEquals(2, parser.getAllComponentParsers().size());
+ Parser p;
+
+ p = parser.getAllComponentParsers().get(0);
+ assertTrue(p.toString(), p instanceof ParserDecorator);
+ assertEquals(DefaultParser.class, ((ParserDecorator)p).getWrappedParser().getClass());
+
+ p = parser.getAllComponentParsers().get(1);
+ assertTrue(p.toString(), p instanceof ParserDecorator);
+ assertEquals(FallbackParser.class, ((ParserDecorator)p).getWrappedParser().getClass());
+
+ FallbackParser fbp = (FallbackParser)((ParserDecorator)p).getWrappedParser();
+ assertEquals("DISCARD_ALL", fbp.getMetadataPolicy().toString());
+ }
}
\ No newline at end of file
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
new file mode 100644
index 0000000..590c95d
--- /dev/null
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -0,0 +1,253 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.multiple;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.fail;
+
+import java.io.ByteArrayInputStream;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.tika.parser.DummyParser;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.EmptyParser;
+import org.apache.tika.parser.ErrorParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.ParserUtils;
+import org.junit.Test;
+
+public class MultipleParserTest {
+ /**
+ * Tests how {@link AbstractMultipleParser} works out which
+ * mime types to offer, based on the types of the parsers
+ */
+ @Test
+ public void testMimeTypeSupported() {
+ // TODO
+ // Some media types
+ Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+ Set<MediaType> octAndText = new HashSet<MediaType>(Arrays.asList(
+ MediaType.OCTET_STREAM, MediaType.TEXT_PLAIN));
+ // TODO One with a subtype
+ }
+
+ /**
+ * Test {@link FallbackParser}
+ */
+ @Test
+ public void testFallback() throws Exception {
+ ParseContext context = new ParseContext();
+ BodyContentHandler handler;
+ Metadata metadata;
+ Parser p;
+ String[] usedParsers;
+
+ // Some media types
+ Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+
+ // Some parsers
+ ErrorParser pFail = new ErrorParser();
+ DummyParser pContent = new DummyParser(onlyOct, new HashMap<String,String>(),
+ "Fell back!");
+ EmptyParser pNothing = new EmptyParser();
+
+
+ // With only one parser defined, works as normal
+ p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent);
+
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("Fell back!", handler.toString());
+
+ usedParsers = metadata.getValues("X-Parsed-By");
+ assertEquals(1, usedParsers.length);
+ assertEquals(DummyParser.class.getName(), usedParsers[0]);
+
+
+ // With a failing parser, will go to the working one
+ p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent);
+
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("Fell back!", handler.toString());
+
+ usedParsers = metadata.getValues("X-Parsed-By");
+ assertEquals(2, usedParsers.length);
+ assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+ assertEquals(DummyParser.class.getName(), usedParsers[1]);
+
+ // Check we got an exception
+ assertNotNull(metadata.get(ParserUtils.EMBEDDED_EXCEPTION));
+ assertNotNull(metadata.get(ParserUtils.EMBEDDED_PARSER));
+ assertEquals(ErrorParser.class.getName(), metadata.get(ParserUtils.EMBEDDED_PARSER));
+
+
+ // Won't go past a working parser to a second one, stops after one works
+ p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent, pNothing);
+
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("Fell back!", handler.toString());
+
+ usedParsers = metadata.getValues("X-Parsed-By");
+ assertEquals(2, usedParsers.length);
+ assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+ assertEquals(DummyParser.class.getName(), usedParsers[1]);
+
+
+ // TODO Check merge policies - First vs Discard
+ }
+
+ /**
+ * Test for {@link SupplementingParser}
+ */
+ @Test
+ public void testSupplemental() throws Exception {
+ ParseContext context = new ParseContext();
+ BodyContentHandler handler;
+ Metadata metadata;
+ Parser p;
+ String[] usedParsers;
+
+ // Some media types
+ Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
+
+ // Some test metadata
+ Map<String,String> m1 = new HashMap<>();
+ m1.put("T1","Test1");
+ m1.put("TBoth","Test1");
+ Map<String,String> m2 = new HashMap<>();
+ m2.put("T2","Test2");
+ m2.put("TBoth","Test2");
+
+ // Some parsers
+ ErrorParser pFail = new ErrorParser();
+ DummyParser pContent1 = new DummyParser(onlyOct, m1, "Fell back 1!");
+ DummyParser pContent2 = new DummyParser(onlyOct, m2, "Fell back 2!");
+ EmptyParser pNothing = new EmptyParser();
+
+
+ // Supplemental doesn't support DISCARD
+ try {
+ new SupplementingParser(null, MetadataPolicy.DISCARD_ALL, new Parser[0]);
+ fail("Discard shouldn't be supported");
+ } catch (IllegalArgumentException e) {}
+
+
+ // With only one parser defined, works as normal
+ p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pContent1);
+
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("Fell back 1!", handler.toString());
+
+ assertEquals("Test1", metadata.get("T1"));
+ assertEquals("Test1", metadata.get("TBoth"));
+
+ usedParsers = metadata.getValues("X-Parsed-By");
+ assertEquals(1, usedParsers.length);
+ assertEquals(DummyParser.class.getName(), usedParsers[0]);
+
+
+ // Check the First, Last and All policies:
+ // First Wins
+ p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pFail,
+ pContent1, pContent2, pNothing);
+
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("Fell back 1!Fell back 2!", handler.toString());
+
+ assertEquals("Test1", metadata.get("T1"));
+ assertEquals("Test2", metadata.get("T2"));
+ assertEquals("Test1", metadata.get("TBoth"));
+
+ usedParsers = metadata.getValues("X-Parsed-By");
+ assertEquals(4, usedParsers.length);
+ assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+ assertEquals(DummyParser.class.getName(), usedParsers[1]);
+ assertEquals(DummyParser.class.getName(), usedParsers[2]);
+ assertEquals(EmptyParser.class.getName(), usedParsers[3]);
+
+
+ // Last Wins
+ p = new SupplementingParser(null, MetadataPolicy.LAST_WINS, pFail,
+ pContent1, pContent2, pNothing);
+
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("Fell back 1!Fell back 2!", handler.toString());
+
+ assertEquals("Test1", metadata.get("T1"));
+ assertEquals("Test2", metadata.get("T2"));
+ assertEquals("Test2", metadata.get("TBoth"));
+
+ usedParsers = metadata.getValues("X-Parsed-By");
+ assertEquals(4, usedParsers.length);
+ assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+ assertEquals(DummyParser.class.getName(), usedParsers[1]);
+ assertEquals(DummyParser.class.getName(), usedParsers[2]);
+ assertEquals(EmptyParser.class.getName(), usedParsers[3]);
+
+
+ // Merge
+ p = new SupplementingParser(null, MetadataPolicy.KEEP_ALL, pFail,
+ pContent1, pContent2, pNothing);
+
+ metadata = new Metadata();
+ handler = new BodyContentHandler();
+ p.parse(new ByteArrayInputStream(new byte[] {0,1,2,3,4}), handler, metadata, context);
+ assertEquals("Fell back 1!Fell back 2!", handler.toString());
+
+ assertEquals("Test1", metadata.get("T1"));
+ assertEquals("Test2", metadata.get("T2"));
+ assertEquals(2, metadata.getValues("TBoth").length);
+ assertEquals("Test1", metadata.getValues("TBoth")[0]);
+ assertEquals("Test2", metadata.getValues("TBoth")[1]);
+
+ usedParsers = metadata.getValues("X-Parsed-By");
+ assertEquals(4, usedParsers.length);
+ assertEquals(ErrorParser.class.getName(), usedParsers[0]);
+ assertEquals(DummyParser.class.getName(), usedParsers[1]);
+ assertEquals(DummyParser.class.getName(), usedParsers[2]);
+ assertEquals(EmptyParser.class.getName(), usedParsers[3]);
+
+
+ // Check the error details always come through, no matter the policy
+ // TODO
+
+
+ // Check that each parser gets its own ContentHandler if a factory was given
+ // TODO
+ }
+}
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-fallback.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-fallback.xml
new file mode 100644
index 0000000..e0bc083
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-fallback.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+
+ <!-- Defaults except for PDF -->
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <mime-exclude>application/pdf</mime-exclude>
+ </parser>
+
+ <!-- For PDF, try one then another, discarding the first results -->
+ <!-- if the first parser fails (as it will!) -->
+ <parser class="org.apache.tika.parser.multiple.FallbackParser">
+ <params>
+ <param name="metadataPolicy" value="DISCARD_ALL" />
+ </params>
+ <parser class="org.apache.tika.parser.ErrorParser"/>
+ <parser class="org.apache.tika.parser.EmptyParser"/>
+ <mime>application/pdf</mime>
+ </parser>
+
+ </parsers>
+</properties>
diff --git a/tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-supplemental.xml b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-supplemental.xml
new file mode 100644
index 0000000..7652a18
--- /dev/null
+++ b/tika-core/src/test/resources/org/apache/tika/config/TIKA-1509-multiple-supplemental.xml
@@ -0,0 +1,46 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+
+ <!-- Defaults except for PDF -->
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <mime-exclude>application/pdf</mime-exclude>
+ </parser>
+
+ <!-- For PDF, run several parsers over the input PDF, -->
+ <!-- recording failures, and merging the metadata -->
+    <parser class="org.apache.tika.parser.multiple.SupplementingParser">
+ <params>
+ <!-- If several parsers output the same metadata key, first parser to do so wins -->
+ <param name="metadataPolicy" value="FIRST_WINS" />
+ <!-- If several parsers output the same metadata key, last parser to do so wins -->
+ <!--
+ <param name="metadataPolicy" value="LAST_WINS" />
+ -->
+ <!-- If several parsers output the same metadata key, store all their values -->
+ <!--
+ <param name="metadataPolicy" value="KEEP_ALL" />
+ -->
+ </params>
+ <parser class="org.apache.tika.parser.EmptyParser"/>
+ <parser class="org.apache.tika.parser.ErrorParser"/>
+ <mime>application/pdf</mime>
+ </parser>
+ </parsers>
+</properties>
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 1a853d9..b1b72ca 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -36,6 +36,7 @@
import org.apache.tika.parser.utils.CommonsDigester;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.ContentHandlerFactory;
+import org.apache.tika.utils.ParserUtils;
import org.junit.Test;
import org.xml.sax.helpers.DefaultHandler;
@@ -206,7 +207,7 @@
//is to catch the exception
assertEquals(13, list.size());
Metadata mockNPEMetadata = list.get(10);
- assertContains("java.lang.NullPointerException", mockNPEMetadata.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+ assertContains("java.lang.NullPointerException", mockNPEMetadata.get(ParserUtils.EMBEDDED_EXCEPTION));
metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test_recursive_embedded_npe.docx");
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
index d69c621..7f92163 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/crypto/TSDParserTest.java
@@ -26,8 +26,8 @@
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.ParserUtils;
import org.junit.Test;
import org.xml.sax.ContentHandler;
@@ -173,7 +173,7 @@
List<Metadata> list = getRecursiveMetadata("testTSD_broken_pdf.tsd");
assertEquals(2, list.size());
assertEquals("application/pdf", list.get(1).get(Metadata.CONTENT_TYPE));
- assertNotNull(list.get(1).get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
- assertContains("org.apache.pdfbox.pdmodel.PDDocument.load", list.get(1).get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+ assertNotNull(list.get(1).get(ParserUtils.EMBEDDED_EXCEPTION));
+ assertContains("org.apache.pdfbox.pdmodel.PDDocument.load", list.get(1).get(ParserUtils.EMBEDDED_EXCEPTION));
}
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index d95333d..148efec 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -27,7 +27,7 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.utils.ParserUtils;
import org.junit.Test;
/**
@@ -374,7 +374,7 @@
if (m.get(Metadata.CONTENT_TYPE).equals(POIFSContainerDetector.MS_GRAPH_CHART.toString())) {
found = true;
}
- assertNull(m.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+ assertNull(m.get(ParserUtils.EMBEDDED_EXCEPTION));
}
assertTrue("didn't find chart in "+suffix, found);
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
index 9cc300b..9c594ee 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java
@@ -26,9 +26,9 @@
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.parser.microsoft.AbstractPOIContainerExtractionTest;
import org.apache.tika.parser.microsoft.POIFSContainerDetector;
+import org.apache.tika.utils.ParserUtils;
import org.junit.Before;
import org.junit.Test;
@@ -317,7 +317,7 @@
if (m.get(Metadata.CONTENT_TYPE).equals(POIFSContainerDetector.MS_GRAPH_CHART.toString())) {
found = true;
}
- assertNull(m.get(RecursiveParserWrapper.EMBEDDED_EXCEPTION));
+ assertNull(m.get(ParserUtils.EMBEDDED_EXCEPTION));
}
assertTrue("didn't find chart in "+suffix, found);
}