// blob: 0a509149b20b2b193e51b3ef596d10206d896b1d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config;
import javax.imageio.spi.ServiceRegistry;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
import org.apache.tika.concurrent.SimpleThreadPoolExecutor;
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.CompositeEncodingDetector;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.fetcher.DefaultFetcher;
import org.apache.tika.fetcher.Fetcher;
import org.apache.tika.language.translate.DefaultTranslator;
import org.apache.tika.language.translate.Translator;
import org.apache.tika.metadata.filter.CompositeMetadataFilter;
import org.apache.tika.metadata.filter.DefaultMetadataFilter;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.mime.MimeTypeException;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.MimeTypesFactory;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.multiple.AbstractMultipleParser;
import org.apache.tika.utils.AnnotationUtils;
import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import static org.apache.tika.config.ServiceLoader.getContextClassLoader;
/**
* Parse xml config file.
*/
public class TikaConfig {
/**
* Returns whatever {@code MimeTypes.getDefaultMimeTypes} yields for the
* given class loader.
*
* @param loader class loader passed through unchanged; this class also
* calls the method with <code>null</code>
* @return the default media type rules
*/
private static MimeTypes getDefaultMimeTypes(ClassLoader loader) {
return MimeTypes.getDefaultMimeTypes(loader);
}
/**
* Creates the detector used when the configuration does not define any:
* a {@link DefaultDetector} built from the given types and loader.
*
* @param types media type rules handed to the detector
* @param loader service loader handed to the detector
* @return a new default detector
*/
protected static CompositeDetector getDefaultDetector(
MimeTypes types, ServiceLoader loader) {
return new DefaultDetector(types, loader);
}
/**
* Creates a {@link DefaultEncodingDetector} backed by the given loader.
*
* @param loader service loader handed to the encoding detector
* @return a new default encoding detector
*/
protected static CompositeEncodingDetector getDefaultEncodingDetector(
ServiceLoader loader) {
return new DefaultEncodingDetector(loader);
}
/**
* Creates a {@link DefaultParser} from the media type registry of the
* given types, the loader and the encoding detector.
*
* @param types media type rules whose registry is handed to the parser
* @param loader service loader handed to the parser
* @param encodingDetector encoding detector handed to the parser
* @return a new default parser
*/
private static CompositeParser getDefaultParser(
MimeTypes types, ServiceLoader loader, EncodingDetector encodingDetector) {
return new DefaultParser(types.getMediaTypeRegistry(), loader, encodingDetector);
}
/**
* Creates a {@link DefaultTranslator} backed by the given loader.
*
* @param loader service loader handed to the translator
* @return a new default translator
*/
private static Translator getDefaultTranslator(ServiceLoader loader) {
return new DefaultTranslator(loader);
}
/**
* Creates the executor service used when none is configured:
* a new {@link SimpleThreadPoolExecutor}.
*
* @return a new default executor service
*/
private static ConfigurableThreadPoolExecutor getDefaultExecutorService() {
return new SimpleThreadPoolExecutor();
}
/**
* Creates a {@link DefaultMetadataFilter} backed by the given loader.
*
* @param loader service loader handed to the metadata filter
* @return a new default metadata filter
*/
private static MetadataFilter getDefaultMetadataFilter(ServiceLoader loader) {
return new DefaultMetadataFilter(loader);
}
/**
* Creates a {@link DefaultFetcher} backed by the given loader.
*
* @param loader service loader handed to the fetcher
* @return a new default fetcher
*/
private static Fetcher getDefaultFetcher(ServiceLoader loader) {
return new DefaultFetcher(loader);
}
//use this to look for unneeded instantiations of TikaConfig;
//incremented as the last step of every constructor that fills in the fields itself
protected static AtomicInteger TIMES_INSTANTIATED = new AtomicInteger();
//loader the components were resolved through; returned by getServiceLoader()
private final ServiceLoader serviceLoader;
//returned by getParser(); also consulted by the deprecated getParser(MediaType)
private final CompositeParser parser;
//returned by getDetector()
private final CompositeDetector detector;
//returned by getTranslator()
private final Translator translator;
//media type rules; returned by getMimeRepository(), registry exposed via getMediaTypeRegistry()
private final MimeTypes mimeTypes;
//returned by getExecutorService()
private final ExecutorService executorService;
//returned by getEncodingDetector(); also handed to the parser loader
private final EncodingDetector encodingDetector;
//returned by getMetadataFilter()
private final MetadataFilter metadataFilter;
//returned by getFetcher()
private final Fetcher fetcher;
/**
* Creates a configuration from the file named by the given string.
* The string is converted with {@code Paths.get} and handed to
* {@link #TikaConfig(Path)}.
*
* @param file file system path of the XML configuration
*/
public TikaConfig(String file)
throws TikaException, IOException, SAXException {
this(Paths.get(file));
}
/**
* Creates a configuration from the given path. The DOM is built with
* {@code XMLReaderUtils.buildDOM} and handed to
* {@link #TikaConfig(Document)}.
*
* @param path location of the XML configuration
*/
public TikaConfig(Path path)
throws TikaException, IOException, SAXException {
this(XMLReaderUtils.buildDOM(path));
}
/**
* Creates a configuration from the given path, using the supplied
* service loader. The DOM is built with {@code XMLReaderUtils.buildDOM}
* and handed to {@link #TikaConfig(Document, ServiceLoader)}.
*
* @param path location of the XML configuration
* @param loader service loader passed on unchanged
*/
public TikaConfig(Path path, ServiceLoader loader)
throws TikaException, IOException, SAXException {
this(XMLReaderUtils.buildDOM(path), loader);
}
/**
* Creates a configuration from the given file. The file is converted
* with {@code File.toPath()}, the DOM is built with
* {@code XMLReaderUtils.buildDOM} and handed to
* {@link #TikaConfig(Document)}.
*
* @param file the XML configuration file
*/
public TikaConfig(File file)
throws TikaException, IOException, SAXException {
this(XMLReaderUtils.buildDOM(file.toPath()));
}
/**
* Creates a configuration from the given file, using the supplied
* service loader. The file is converted with {@code File.toPath()},
* the DOM is built with {@code XMLReaderUtils.buildDOM} and handed to
* {@link #TikaConfig(Document, ServiceLoader)}.
*
* @param file the XML configuration file
* @param loader service loader passed on unchanged
*/
public TikaConfig(File file, ServiceLoader loader)
throws TikaException, IOException, SAXException {
this(XMLReaderUtils.buildDOM(file.toPath()), loader);
}
/**
* Creates a configuration from the given URL, using the class loader
* returned by {@code ServiceLoader.getContextClassLoader()}. Delegates
* to {@link #TikaConfig(URL, ClassLoader)}.
*
* @param url location of the XML configuration
*/
public TikaConfig(URL url)
throws TikaException, IOException, SAXException {
this(url, ServiceLoader.getContextClassLoader());
}
/**
* Creates a configuration from the given URL and class loader. The
* string form of the URL is handed to {@code XMLReaderUtils.buildDOM}
* and the resulting document element is passed to
* {@link #TikaConfig(Element, ClassLoader)}.
*
* @param url location of the XML configuration
* @param loader class loader passed on unchanged
*/
public TikaConfig(URL url, ClassLoader loader)
throws TikaException, IOException, SAXException {
this(XMLReaderUtils.buildDOM(url.toString()).getDocumentElement(), loader);
}
/**
* Creates a configuration from the given URL and service loader. The
* string form of the URL is handed to {@code XMLReaderUtils.buildDOM}
* and the resulting document element is loaded with the supplied
* service loader as-is.
*
* @param url location of the XML configuration
* @param loader service loader passed on unchanged
*/
public TikaConfig(URL url, ServiceLoader loader)
throws TikaException, IOException, SAXException {
this(XMLReaderUtils.buildDOM(url.toString()).getDocumentElement(), loader);
}
/**
* Creates a configuration from the given stream. The DOM is built with
* {@code XMLReaderUtils.buildDOM} and handed to
* {@link #TikaConfig(Document)}. This constructor itself does not
* close the stream.
*
* @param stream stream containing the XML configuration
*/
public TikaConfig(InputStream stream)
throws TikaException, IOException, SAXException {
this(XMLReaderUtils.buildDOM(stream));
}
/**
* Creates a configuration from an already parsed document by handing
* its document element to {@link #TikaConfig(Element)}.
*
* @param document parsed XML configuration
*/
public TikaConfig(Document document) throws TikaException, IOException {
this(document.getDocumentElement());
}
/**
* Creates a configuration from an already parsed document, using the
* supplied service loader as-is for the document element.
*
* @param document parsed XML configuration
* @param loader service loader passed on unchanged
*/
public TikaConfig(Document document, ServiceLoader loader) throws TikaException, IOException {
this(document.getDocumentElement(), loader);
}
/**
* Creates a configuration from the given root element. The service
* loader is derived from the element by
* {@code serviceLoaderFromDomElement}, with no explicit class loader.
*
* @param element root element of the XML configuration
*/
public TikaConfig(Element element) throws TikaException, IOException {
this(element, serviceLoaderFromDomElement(element, null));
}
/**
* Creates a configuration from the given root element. The service
* loader is derived from the element and the given class loader by
* {@code serviceLoaderFromDomElement}.
*
* @param element root element of the XML configuration
* @param loader class loader for the derived service loader
*/
public TikaConfig(Element element, ClassLoader loader)
throws TikaException, IOException {
this(element, serviceLoaderFromDomElement(element, loader));
}
/**
 * Fills in every component of this configuration from an already parsed
 * configuration root, resolving classes through the supplied loader.
 * All the public XML-based constructors end up here.
 *
 * @param element root element of the XML configuration
 * @param loader service loader used for every section and kept as the
 *               loader of this configuration
 */
private TikaConfig(Element element, ServiceLoader loader)
        throws TikaException, IOException {
    // Section loaders are created before any of the XML is looked at
    DetectorXmlLoader detectors = new DetectorXmlLoader();
    TranslatorXmlLoader translators = new TranslatorXmlLoader();
    ExecutorServiceXmlLoader executors = new ExecutorServiceXmlLoader();
    EncodingDetectorXmlLoader encodingDetectors = new EncodingDetectorXmlLoader();
    MetadataFilterXmlLoader metadataFilters = new MetadataFilterXmlLoader();

    // XML reader settings are applied first, ahead of loading any section
    updateXMLReaderUtils(element);

    MimeTypes types = typesFromDomElement(element);
    this.mimeTypes = types;
    this.detector = detectors.loadOverall(element, types, loader);

    // The parser loader needs the encoding detector, so that one comes first
    EncodingDetector encodings = encodingDetectors.loadOverall(element, types, loader);
    this.encodingDetector = encodings;
    this.parser = new ParserXmlLoader(encodings).loadOverall(element, types, loader);

    this.translator = translators.loadOverall(element, types, loader);
    this.executorService = executors.loadOverall(element, types, loader);
    this.metadataFilter = metadataFilters.loadOverall(element, types, loader);
    this.fetcher = new FetcherXmlLoader().loadOverall(element, types, loader);
    this.serviceLoader = loader;

    TIMES_INSTANTIATED.incrementAndGet();
}
/**
* Creates a Tika configuration from the built-in media type rules
* and all the {@link Parser} implementations available through the
* {@link ServiceRegistry service provider mechanism} in the given
* class loader.
*
* @since Apache Tika 0.8
* @param loader the class loader through which parser implementations
* are loaded, or <code>null</code> for no parsers
* @throws MimeTypeException if the built-in media type rules are broken
* @throws IOException if the built-in media type rules can not be read
*/
public TikaConfig(ClassLoader loader)
throws MimeTypeException, IOException {
this.serviceLoader = new ServiceLoader(loader);
this.mimeTypes = getDefaultMimeTypes(loader);
this.detector = getDefaultDetector(mimeTypes, serviceLoader);
this.encodingDetector = getDefaultEncodingDetector(serviceLoader);
this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector);
this.translator = getDefaultTranslator(serviceLoader);
this.executorService = getDefaultExecutorService();
this.metadataFilter = getDefaultMetadataFilter(serviceLoader);
this.fetcher = getDefaultFetcher(serviceLoader);
TIMES_INSTANTIATED.incrementAndGet();
}
/**
 * Creates a default Tika configuration.
 * <p>The location of an XML config file is looked up in
 * <ol>
 * <li>System property "tika.config", and if that is unset or blank</li>
 * <li>Environment variable TIKA_CONFIG</li>
 * </ol>
 * <p>If a location is found it is resolved by
 * {@code getConfigInputStream} (URL, then classpath resource, then
 * regular file) and the configuration is loaded from that XML.</p>
 * <p>If no XML config is specified, the configuration is initialized from
 * the built-in media type rules and all the {@link Parser}
 * implementations available through the
 * {@link ServiceRegistry service provider mechanism} in the context
 * class loader of the current thread.</p>
 *
 * @throws IOException   if the configuration can not be read
 * @throws TikaException if problem with MimeTypes or parsing XML config
 */
public TikaConfig() throws TikaException, IOException {
    String config = System.getProperty("tika.config");
    if (config == null || config.trim().isEmpty()) {
        config = System.getenv("TIKA_CONFIG");
    }
    boolean xmlConfigured = config != null && !config.trim().isEmpty();

    if (xmlConfigured) {
        // A temporary loader locates the file; the real one comes from the XML
        ServiceLoader bootstrapLoader = new ServiceLoader();
        try (InputStream in = getConfigInputStream(config, bootstrapLoader)) {
            Element root = XMLReaderUtils.buildDOM(in).getDocumentElement();
            updateXMLReaderUtils(root);
            serviceLoader = serviceLoaderFromDomElement(root, bootstrapLoader.getLoader());

            DetectorXmlLoader detectors = new DetectorXmlLoader();
            EncodingDetectorXmlLoader encodingDetectors = new EncodingDetectorXmlLoader();
            TranslatorXmlLoader translators = new TranslatorXmlLoader();
            ExecutorServiceXmlLoader executors = new ExecutorServiceXmlLoader();
            MetadataFilterXmlLoader metadataFilters = new MetadataFilterXmlLoader();

            this.mimeTypes = typesFromDomElement(root);
            this.encodingDetector = encodingDetectors.loadOverall(root, mimeTypes, serviceLoader);
            // The parser loader depends on the encoding detector loaded just above
            ParserXmlLoader parsers = new ParserXmlLoader(encodingDetector);
            this.parser = parsers.loadOverall(root, mimeTypes, serviceLoader);
            this.detector = detectors.loadOverall(root, mimeTypes, serviceLoader);
            this.translator = translators.loadOverall(root, mimeTypes, serviceLoader);
            this.executorService = executors.loadOverall(root, mimeTypes, serviceLoader);
            this.metadataFilter = metadataFilters.loadOverall(root, mimeTypes, serviceLoader);
            this.fetcher = new FetcherXmlLoader().loadOverall(root, mimeTypes, serviceLoader);
        } catch (SAXException e) {
            throw new TikaException(
                    "Specified Tika configuration has syntax errors: "
                            + config, e);
        }
    } else {
        // Nothing configured: fall back to the service-loaded defaults
        this.serviceLoader = new ServiceLoader();
        this.mimeTypes = getDefaultMimeTypes(getContextClassLoader());
        this.encodingDetector = getDefaultEncodingDetector(serviceLoader);
        this.parser = getDefaultParser(mimeTypes, serviceLoader, encodingDetector);
        this.detector = getDefaultDetector(mimeTypes, serviceLoader);
        this.translator = getDefaultTranslator(serviceLoader);
        this.executorService = getDefaultExecutorService();
        this.metadataFilter = getDefaultMetadataFilter(serviceLoader);
        this.fetcher = getDefaultFetcher(serviceLoader);
    }
    TIMES_INSTANTIATED.incrementAndGet();
}
/**
 * Applies the optional <code>&lt;xml-reader-utils&gt;</code> settings
 * (<code>maxEntityExpansions</code> and <code>poolSize</code>) to
 * {@link XMLReaderUtils}. Each attribute is optional; an attribute that
 * is absent or blank leaves the corresponding setting untouched.
 *
 * @param element root element of the XML configuration
 * @throws TikaException if an attribute is present but not an integer
 */
private void updateXMLReaderUtils(Element element) throws TikaException {
    Element child = getChild(element, "xml-reader-utils");
    if (child == null) {
        return;
    }
    Integer maxEntityExpansions = intAttributeOrNull(child, "maxEntityExpansions");
    if (maxEntityExpansions != null) {
        XMLReaderUtils.setMaxEntityExpansions(maxEntityExpansions);
    }
    //make sure to call this after set entity expansions
    Integer poolSize = intAttributeOrNull(child, "poolSize");
    if (poolSize != null) {
        XMLReaderUtils.setPoolSize(poolSize);
    }
}

/**
 * Reads an optional integer attribute.
 *
 * @return the parsed value, or <code>null</code> if the attribute is
 *         missing or blank
 * @throws TikaConfigException if the value is not a valid integer
 */
private static Integer intAttributeOrNull(Element element, String name)
        throws TikaConfigException {
    // DOM returns "" rather than null for an attribute that is not set,
    // so a null check alone would send "" to Integer.parseInt
    String raw = element.getAttribute(name);
    if (raw == null || raw.trim().isEmpty()) {
        return null;
    }
    try {
        return Integer.parseInt(raw.trim());
    } catch (NumberFormatException e) {
        throw new TikaConfigException(
                "Attribute '" + name + "' of <" + element.getNodeName()
                        + "> must be an integer but was '" + raw + "'", e);
    }
}
/**
 * Opens the configuration named by the given string, trying in order:
 * the string as a URL, as a resource of the given service loader, and
 * as a regular file on the file system.
 *
 * @param config location of the configuration
 * @param serviceLoader loader asked for the resource in the second step
 * @return an open stream; closing it is up to the caller
 * @throws TikaException if none of the three lookups produced a stream
 * @throws IOException if opening the regular file fails
 */
private static InputStream getConfigInputStream(String config, ServiceLoader serviceLoader)
        throws TikaException, IOException {
    // 1. URL: failure here (including a malformed URL) just means "not a URL"
    try {
        InputStream fromUrl = new URL(config).openStream();
        if (fromUrl != null) {
            return fromUrl;
        }
    } catch (IOException ignored) {
        // deliberately ignored, the other lookups follow
    }

    // 2. resource visible to the service loader
    InputStream fromResource = serviceLoader.getResourceAsStream(config);
    if (fromResource != null) {
        return fromResource;
    }

    // 3. plain file
    Path candidate = Paths.get(config);
    if (Files.isRegularFile(candidate)) {
        InputStream fromFile = Files.newInputStream(candidate);
        if (fromFile != null) {
            return fromFile;
        }
    }

    throw new TikaException(
            "Specified Tika configuration not found: " + config);
}
/**
 * Collects the text of a node: the value of a text node, the
 * concatenated text of all children (recursively) for an element, and
 * the empty string for every other node type.
 *
 * @param node node to read
 * @return the collected text, never <code>null</code> for elements
 */
private static String getText(Node node) {
    switch (node.getNodeType()) {
        case Node.TEXT_NODE:
            return node.getNodeValue();
        case Node.ELEMENT_NODE:
            StringBuilder text = new StringBuilder();
            NodeList children = node.getChildNodes();
            for (int idx = 0; idx < children.getLength(); idx++) {
                text.append(getText(children.item(idx)));
            }
            return text.toString();
        default:
            return "";
    }
}
/**
* Looks up the given media type in the map returned by
* {@code getParsers()} of the configured composite parser.
*
* @param mimeType media type to look up
* @return the map entry for the type; presumably <code>null</code> when
* there is no entry (depends on the map returned by the parser)
* @deprecated Use the {@link #getParser()} method instead
*/
public Parser getParser(MediaType mimeType) {
return parser.getParsers().get(mimeType);
}
/**
* Returns the configured parser instance, i.e. the composite parser
* that was set up when this configuration was constructed.
*
* @return configured parser
*/
public Parser getParser() {
return parser;
}
/**
* Returns the configured detector instance, i.e. the composite detector
* that was set up when this configuration was constructed.
*
* @return configured detector
*/
public Detector getDetector() {
return detector;
}
/**
* Returns the configured encoding detector instance. This is the same
* instance that was handed to the parser setup during construction.
*
* @return configured encoding detector
*/
public EncodingDetector getEncodingDetector() {
return encodingDetector;
}
/**
* Returns the configured translator instance that was set up when this
* configuration was constructed.
*
* @return configured translator
*/
public Translator getTranslator() {
return translator;
}
/**
* Returns the configured executor service instance.
*
* @return configured executor service
*/
public ExecutorService getExecutorService() {
return executorService;
}
/**
* Returns the media type rules of this configuration.
*
* @return configured media type repository
*/
public MimeTypes getMimeRepository(){
return mimeTypes;
}
/**
* Returns the media type registry of the configured media type rules.
*
* @return registry obtained from {@link #getMimeRepository()}'s value
*/
public MediaTypeRegistry getMediaTypeRegistry() {
return mimeTypes.getMediaTypeRegistry();
}
/**
* Returns the service loader this configuration was built with.
*
* @return service loader of this configuration
*/
public ServiceLoader getServiceLoader() {
return serviceLoader;
}
/**
* Returns the configured metadata filter instance.
*
* @return configured metadata filter
*/
public MetadataFilter getMetadataFilter() {
return metadataFilter;
}
/**
 * Provides a default configuration (TikaConfig) by calling the no-arg
 * constructor. A new instance is created on every call; sharing one
 * instance may become possible once the class is completely immutable.
 * Checked failures are rethrown as {@link RuntimeException}s that carry
 * the original exception as their cause.
 *
 * @return default configuration
 */
public static TikaConfig getDefaultConfig() {
    final String failure;
    final Exception cause;
    try {
        return new TikaConfig();
    } catch (TikaException e) {
        failure = "Unable to access default configuration";
        cause = e;
    } catch (IOException e) {
        failure = "Unable to read default configuration";
        cause = e;
    }
    throw new RuntimeException(failure, cause);
}
/**
 * Finds the first direct child element with the given node name.
 *
 * @param element parent whose direct children are scanned
 * @param name node name to look for
 * @return the first matching child element, or <code>null</code> if
 *         there is none
 */
private static Element getChild(Element element, String name) {
    for (Node candidate = element.getFirstChild();
         candidate != null;
         candidate = candidate.getNextSibling()) {
        boolean isElement = candidate.getNodeType() == Node.ELEMENT_NODE;
        if (isElement && name.equals(candidate.getNodeName())) {
            return (Element) candidate;
        }
    }
    return null;
}
/**
 * Lists the configuration entries of one section.
 * <p>With a parent name, the single element of that name anywhere below
 * <code>element</code> is located and its direct children named
 * <code>childrenName</code> are returned. Without a parent name, the
 * direct children of <code>element</code> itself are used.</p>
 *
 * @param element root element of the XML configuration
 * @param parentName name of the wrapping tag (eg parsers), or
 *                   <code>null</code> if entries sit directly on the root
 * @param childrenName tag name of the entries (eg parser)
 * @return matching direct children in document order; empty if the
 *         parent tag is absent
 * @throws TikaException if more than one parent tag exists
 */
private static List<Element> getTopLevelElementChildren(Element element,
        String parentName, String childrenName) throws TikaException {
    final Node container;
    if (parentName == null) {
        // All children directly on the master element
        container = element;
    } else {
        // Should be only zero or one <parsers> / <detectors> etc tag
        NodeList parents = element.getElementsByTagName(parentName);
        int parentCount = parents.getLength();
        if (parentCount > 1) {
            throw new TikaException("Properties may not contain multiple "+parentName+" entries");
        }
        if (parentCount == 0) {
            // No elements of this type
            return Collections.emptyList();
        }
        container = parents.item(0);
    }

    // Find only the direct child parser/detector objects
    List<Element> matches = new ArrayList<Element>();
    NodeList children = container.getChildNodes();
    for (int idx = 0; idx < children.getLength(); idx++) {
        Node candidate = children.item(idx);
        if (!(candidate instanceof Element)) {
            continue;
        }
        Element childElement = (Element) candidate;
        if (childrenName.equals(childElement.getTagName())) {
            matches.add(childElement);
        }
    }
    return matches;
}
/**
 * Picks the media type rules for a configuration: those created by
 * {@code MimeTypesFactory} from the <code>resource</code> attribute of
 * a <code>&lt;mimeTypeRepository&gt;</code> child if present, otherwise
 * the defaults.
 *
 * @param element root element of the XML configuration
 * @return the media type rules to use
 */
private static MimeTypes typesFromDomElement(Element element)
        throws TikaException, IOException {
    Element repository = getChild(element, "mimeTypeRepository");
    if (repository == null || !repository.hasAttribute("resource")) {
        return getDefaultMimeTypes(null);
    }
    String resource = repository.getAttribute("resource");
    return MimeTypesFactory.create(resource);
}
/**
 * Parses the media types listed in the direct child elements of
 * <code>node</code> that carry the given tag name.
 *
 * @param node element whose direct children are scanned
 * @param tag tag name of the children holding a media type as text
 * @return the parsed types, or an empty set if no child matched
 * @throws TikaException if the text of a matching child can not be
 *                       parsed as a media type
 */
private static Set<MediaType> mediaTypesListFromDomElement(
        Element node, String tag)
        throws TikaException, IOException {
    Set<MediaType> collected = null;
    NodeList candidates = node.getChildNodes();
    for (int idx = 0; idx < candidates.getLength(); idx++) {
        Node candidate = candidates.item(idx);
        if (!(candidate instanceof Element)) {
            continue;
        }
        Element entry = (Element) candidate;
        if (!tag.equals(entry.getTagName())) {
            continue;
        }
        String mime = getText(entry);
        MediaType parsed = MediaType.parse(mime);
        if (parsed == null) {
            throw new TikaException(
                    "Invalid media type name: " + mime);
        }
        // The set is only allocated once a first type has been found
        if (collected == null) {
            collected = new HashSet<>();
        }
        collected.add(parsed);
    }
    if (collected == null) {
        return Collections.emptySet();
    }
    return collected;
}
/**
 * Creates the service loader for a configuration.
 * <p>Without a <code>&lt;service-loader&gt;</code> child, a loader for
 * the given class loader is created (or a no-arg one if the class
 * loader is <code>null</code>). With such a child, its
 * <code>dynamic</code>, <code>loadErrorHandler</code> and
 * <code>initializableProblemHandler</code> attributes are applied; an
 * unrecognised <code>loadErrorHandler</code> value falls back to
 * {@code LoadErrorHandler.IGNORE}.</p>
 *
 * @param element root element of the XML configuration
 * @param loader class loader to use, or <code>null</code>
 * @return the service loader described by the configuration
 * @throws TikaConfigException if the
 *         <code>initializableProblemHandler</code> value is not recognised
 */
private static ServiceLoader serviceLoaderFromDomElement(Element element, ClassLoader loader) throws TikaConfigException {
    Element settings = getChild(element, "service-loader");
    if (settings == null) {
        if (loader != null) {
            return new ServiceLoader(loader);
        }
        return new ServiceLoader();
    }

    boolean dynamic = Boolean.parseBoolean(settings.getAttribute("dynamic"));

    String requestedErrorHandler = settings.getAttribute("loadErrorHandler");
    LoadErrorHandler onLoadError;
    if (LoadErrorHandler.WARN.toString().equalsIgnoreCase(requestedErrorHandler)) {
        onLoadError = LoadErrorHandler.WARN;
    } else if (LoadErrorHandler.THROW.toString().equalsIgnoreCase(requestedErrorHandler)) {
        onLoadError = LoadErrorHandler.THROW;
    } else {
        onLoadError = LoadErrorHandler.IGNORE;
    }

    InitializableProblemHandler onInitProblem =
            getInitializableProblemHandler(settings.getAttribute("initializableProblemHandler"));

    ClassLoader effectiveLoader = loader;
    if (effectiveLoader == null) {
        effectiveLoader = ServiceLoader.getContextClassLoader();
    }
    return new ServiceLoader(effectiveLoader, onLoadError, onInitProblem, dynamic);
}
/**
 * Maps a configuration string onto one of the predefined
 * {@code InitializableProblemHandler} constants. The comparison
 * against each constant's <code>toString()</code> ignores case.
 *
 * @param initializableProblemHandler configured value; <code>null</code>
 *        or empty selects {@code InitializableProblemHandler.DEFAULT}
 * @return the matching handler
 * @throws TikaConfigException if the value matches none of the handlers
 */
private static InitializableProblemHandler getInitializableProblemHandler(String initializableProblemHandler)
        throws TikaConfigException {
    if (initializableProblemHandler == null || initializableProblemHandler.length() == 0) {
        return InitializableProblemHandler.DEFAULT;
    }
    // Checked in the same order as the options named in the error message
    InitializableProblemHandler[] candidates = {
            InitializableProblemHandler.IGNORE,
            InitializableProblemHandler.INFO,
            InitializableProblemHandler.WARN,
            InitializableProblemHandler.THROW
    };
    for (InitializableProblemHandler candidate : candidates) {
        if (candidate.toString().equalsIgnoreCase(initializableProblemHandler)) {
            return candidate;
        }
    }
    throw new TikaConfigException(
            String.format(Locale.US, "Couldn't parse non-null '%s'. Must be one of 'ignore', 'info', 'warn' or 'throw'",
                    initializableProblemHandler));
}
/**
* Returns the configured fetcher instance.
*
* @return configured fetcher
*/
public Fetcher getFetcher() {
return fetcher;
}
/**
 * Shared machinery for turning the XML entries of one configuration
 * section (parsers, detectors, ...) into instances. Subclasses supply
 * the tag names, the service type and the way composites are built.
 *
 * @param <CT> type returned for the section as a whole
 * @param <T> type of each individually configured entry
 */
private static abstract class XmlLoader<CT,T> {
    /** Name of the child tag that holds the parameters of an entry. */
    protected static final String PARAMS_TAG_NAME = "params";

    /** Whether several configured entries may be wrapped into one composite. */
    abstract boolean supportsComposite();
    abstract String getParentTagName(); // eg parsers
    abstract String getLoaderTagName(); // eg parser
    abstract Class<? extends T> getLoaderClass(); // Generics workaround
    abstract boolean isComposite(T loaded);
    abstract boolean isComposite(Class<? extends T> loadedClass);
    /**
     * Hook run before normal loading; a non-null result is used as the
     * loaded entry as-is, <code>null</code> continues with normal loading.
     */
    abstract T preLoadOne(Class<? extends T> loadedClass, String classname,
            MimeTypes mimeTypes) throws TikaException;
    /** Creates the value used when the section defines no entries. */
    abstract CT createDefault(MimeTypes mimeTypes, ServiceLoader loader);
    /** Wraps the loaded entries into the value for the whole section. */
    abstract CT createComposite(List<T> loaded, MimeTypes mimeTypes, ServiceLoader loader);
    /**
     * Instantiates a composite class from the XML; may return
     * <code>null</code>, in which case {@link #newInstance(Class)} is used.
     */
    abstract T createComposite(Class<? extends T> compositeClass,
            List<T> children, Set<Class<? extends T>> excludeChildren,
            Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
            throws InvocationTargetException, IllegalAccessException, InstantiationException;
    abstract T decorate(T created, Element element)
            throws IOException, TikaException; // eg explicit mime types

    /**
     * Loads the whole section below the given configuration root.
     *
     * @return the default if nothing is configured; a single configured
     *         composite as-is; the first entry if several are configured
     *         but composites are not supported; otherwise a composite
     *         wrapping everything that was loaded
     */
    @SuppressWarnings("unchecked")
    CT loadOverall(Element element, MimeTypes mimeTypes,
            ServiceLoader loader) throws TikaException, IOException {
        List<T> loaded = new ArrayList<>();

        // Find the children of the parent tag, if any
        for (Element le : getTopLevelElementChildren(element, getParentTagName(), getLoaderTagName())) {
            T loadedChild = loadOne(le, mimeTypes, loader);
            if (loadedChild != null) {
                loaded.add(loadedChild);
            }
        }

        // Build the classes, and wrap as needed
        if (loaded.isEmpty()) {
            // Nothing defined, create a Default
            return createDefault(mimeTypes, loader);
        } else if (loaded.size() == 1) {
            T single = loaded.get(0);
            if (isComposite(single)) {
                // Single Composite defined, use that
                return (CT) single;
            }
        } else if (!supportsComposite()) {
            // No composite support, just return the first one
            return (CT) loaded.get(0);
        }
        // Wrap the defined parsers/detectors up in a Composite
        return createComposite(loaded, mimeTypes, loader);
    }

    /**
     * Loads the single entry described by the given element.
     *
     * @return the loaded (and possibly decorated) instance, or
     *         <code>null</code> if the class could not be found and the
     *         load error handler did not ask for an exception
     * @throws TikaException if the class can not be found (with the
     *         THROW handler), accessed, constructed or initialized
     */
    T loadOne(Element element, MimeTypes mimeTypes, ServiceLoader loader)
            throws TikaException, IOException {
        String name = element.getAttribute("class");

        // A handler set on the entry itself wins over the loader-wide one
        String initProbHandler = element.getAttribute("initializableProblemHandler");
        InitializableProblemHandler initializableProblemHandler;
        if (initProbHandler == null || initProbHandler.length() == 0) {
            initializableProblemHandler = loader.getInitializableProblemHandler();
        } else {
            initializableProblemHandler =
                    getInitializableProblemHandler(initProbHandler);
        }

        T loaded = null;
        try {
            Class<? extends T> loadedClass =
                    loader.getServiceClass(getLoaderClass(), name);

            // Do pre-load checks and short-circuits
            //TODO : allow duplicate instances with different configurations
            loaded = preLoadOne(loadedClass, name, mimeTypes);
            if (loaded != null) {
                return loaded;
            }

            // Get any parameters / settings for the parser
            Map<String, Param> params = null;
            try {
                params = getParams(element);
            } catch (Exception e) {
                throw new TikaConfigException(e.getMessage(), e);
            }

            // Is this a composite or decorated class? If so, support recursion
            if (isComposite(loadedClass)) {
                // Get the child objects for it
                // NOTE(review): getElementsByTagName matches every
                // descendant, not only direct children, so entries nested
                // in an inner composite are also loaded here - confirm
                // whether that is intended
                List<T> children = new ArrayList<>();
                NodeList childNodes = element.getElementsByTagName(getLoaderTagName());
                for (int i = 0; i < childNodes.getLength(); i++) {
                    T loadedChild = loadOne((Element) childNodes.item(i),
                            mimeTypes, loader);
                    if (loadedChild != null) {
                        children.add(loadedChild);
                    }
                }

                // Get the list of children to exclude
                Set<Class<? extends T>> excludeChildren = new HashSet<>();
                NodeList excludeChildNodes = element.getElementsByTagName(getLoaderTagName()+"-exclude");
                for (int i = 0; i < excludeChildNodes.getLength(); i++) {
                    Element excl = (Element) excludeChildNodes.item(i);
                    String exclName = excl.getAttribute("class");
                    excludeChildren.add(loader.getServiceClass(getLoaderClass(), exclName));
                }

                // Create the Composite
                loaded = createComposite(loadedClass, children, excludeChildren, params, mimeTypes, loader);

                // Default constructor fallback
                if (loaded == null) {
                    loaded = newInstance(loadedClass);
                }
            } else {
                // Regular class, create as-is
                loaded = newInstance(loadedClass);
                // TODO Support arguments, needed for Translators etc
                // See the thread "Configuring parsers and translators" for details
            }

            //Assigning the params to bean fields/setters
            AnnotationUtils.assignFieldParams(loaded, params);
            if (loaded instanceof Initializable) {
                ((Initializable) loaded).initialize(params);
                ((Initializable) loaded).checkInitialization(initializableProblemHandler);
            }

            // Have any decoration performed, eg explicit mimetypes
            loaded = decorate(loaded, element);

            // All done with setup
            return loaded;
        } catch (ClassNotFoundException e) {
            if (loader.getLoadErrorHandler() == LoadErrorHandler.THROW) {
                // Use a different exception signature here
                throw new TikaException(
                        "Unable to find a "+getLoaderTagName()+" class: " + name, e);
            }
            // Report the problem
            loader.getLoadErrorHandler().handleLoadError(name, e);
            return null;
        } catch (IllegalAccessException e) {
            throw new TikaException(
                    "Unable to access a "+getLoaderTagName()+" class: " + name, e);
        } catch (InvocationTargetException e) {
            throw new TikaException(
                    "Unable to create a "+getLoaderTagName()+" class: " + name, e);
        } catch (InstantiationException e) {
            throw new TikaException(
                    "Unable to instantiate a "+getLoaderTagName()+" class: " + name, e);
        } catch (NoSuchMethodException e) {
            throw new TikaException(
                    "Unable to find the right constructor for "+getLoaderTagName()+" class: " + name, e);
        }
    }

    /**
     * Instantiates the class through its no-arg constructor.
     * <p>Goes through {@code getDeclaredConstructor().newInstance()}
     * rather than the deprecated {@code Class.newInstance()}, so that an
     * exception thrown by the constructor arrives wrapped in an
     * {@link InvocationTargetException} (which {@code loadOne} reports
     * as a {@link TikaException}) instead of escaping unwrapped.</p>
     */
    T newInstance(Class<? extends T> loadedClass) throws
            IllegalAccessException, InstantiationException,
            NoSuchMethodException, InvocationTargetException {
        return loadedClass.getDeclaredConstructor().newInstance();
    }

    /**
     * Gets the parameters configured for a given entry.
     *
     * @param el xml node which has {@link #PARAMS_TAG_NAME} child
     * @return Map of key values read from xml; empty if there is no such
     *         child. Only the first {@link #PARAMS_TAG_NAME} child is read.
     */
    Map<String, Param> getParams(Element el){
        Map<String, Param> params = new HashMap<>();
        for (Node child = el.getFirstChild(); child != null;
             child = child.getNextSibling()){
            if (PARAMS_TAG_NAME.equals(child.getNodeName())){ //found the node
                if (child.hasChildNodes()) {//it has children
                    NodeList childNodes = child.getChildNodes();
                    for (int i = 0; i < childNodes.getLength(); i++) {
                        Node item = childNodes.item(i);
                        if (item.getNodeType() == Node.ELEMENT_NODE){
                            Param<?> param = Param.load(item);
                            params.put(param.getName(), param);
                        }
                    }
                }
                break; //only the first one is used
            }
        }
        return params;
    }
}
private static class ParserXmlLoader extends XmlLoader<CompositeParser,Parser> {
private final EncodingDetector encodingDetector;
boolean supportsComposite() { return true; }
String getParentTagName() { return "parsers"; }
String getLoaderTagName() { return "parser"; }
private ParserXmlLoader(EncodingDetector encodingDetector) {
this.encodingDetector = encodingDetector;
}
@Override
Class<? extends Parser> getLoaderClass() {
return Parser.class;
}
@Override
Parser preLoadOne(Class<? extends Parser> loadedClass, String classname,
MimeTypes mimeTypes) throws TikaException {
// Check for classes which can't be set in config
if (AutoDetectParser.class.isAssignableFrom(loadedClass)) {
// https://issues.apache.org/jira/browse/TIKA-866
throw new TikaException(
"AutoDetectParser not supported in a <parser>"
+ " configuration element: " + classname);
}
// Continue with normal loading
return null;
}
@Override
boolean isComposite(Parser loaded) {
return loaded instanceof CompositeParser;
}
@Override
boolean isComposite(Class<? extends Parser> loadedClass) {
if (CompositeParser.class.isAssignableFrom(loadedClass) ||
AbstractMultipleParser.class.isAssignableFrom(loadedClass) ||
ParserDecorator.class.isAssignableFrom(loadedClass)) {
return true;
}
return false;
}
@Override
CompositeParser createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
return getDefaultParser(mimeTypes, loader, encodingDetector);
}
@Override
CompositeParser createComposite(List<Parser> parsers, MimeTypes mimeTypes, ServiceLoader loader) {
MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
return new CompositeParser(registry, parsers);
}
@Override
Parser createComposite(Class<? extends Parser> parserClass,
List<Parser> childParsers, Set<Class<? extends Parser>> excludeParsers,
Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException, InstantiationException {
Parser parser = null;
Constructor<? extends Parser> c = null;
MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
// Try the possible default and composite parser constructors
if (parser == null) {
try {
c = parserClass.getConstructor(MediaTypeRegistry.class,
ServiceLoader.class, Collection.class, EncodingDetector.class);
parser = c.newInstance(registry, loader, excludeParsers, encodingDetector);
}
catch (NoSuchMethodException me) {}
}
if (parser == null) {
try {
c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class, Collection.class);
parser = c.newInstance(registry, loader, excludeParsers);
}
catch (NoSuchMethodException me) {}
}
if (parser == null) {
try {
c = parserClass.getConstructor(MediaTypeRegistry.class, List.class, Collection.class);
parser = c.newInstance(registry, childParsers, excludeParsers);
} catch (NoSuchMethodException me) {}
}
if (parser == null) {
try {
c = parserClass.getConstructor(MediaTypeRegistry.class, Collection.class, Map.class);
parser = c.newInstance(registry, childParsers, params);
} catch (NoSuchMethodException me) {}
}
if (parser == null) {
try {
c = parserClass.getConstructor(MediaTypeRegistry.class, List.class);
parser = c.newInstance(registry, childParsers);
} catch (NoSuchMethodException me) {}
}
// Create as a Parser Decorator
if (parser == null && ParserDecorator.class.isAssignableFrom(parserClass)) {
try {
CompositeParser cp = null;
if (childParsers.size() == 1 && excludeParsers.size() == 0 &&
childParsers.get(0) instanceof CompositeParser) {
cp = (CompositeParser)childParsers.get(0);
} else {
cp = new CompositeParser(registry, childParsers, excludeParsers);
}
c = parserClass.getConstructor(Parser.class);
parser = c.newInstance(cp);
} catch (NoSuchMethodException me) {}
}
return parser;
}
@Override
Parser newInstance(Class<? extends Parser> loadedClass) throws IllegalAccessException, InstantiationException, NoSuchMethodException, InvocationTargetException {
if (AbstractEncodingDetectorParser.class.isAssignableFrom(loadedClass)) {
Constructor ctor = loadedClass.getConstructor(EncodingDetector.class);
return (Parser) ctor.newInstance(encodingDetector);
} else {
return loadedClass.newInstance();
}
}
@Override
Parser decorate(Parser created, Element element) throws IOException, TikaException {
Parser parser = created;
// Is there an explicit list of mime types for this to handle?
Set<MediaType> parserTypes = mediaTypesListFromDomElement(element, "mime");
if (! parserTypes.isEmpty()) {
parser = ParserDecorator.withTypes(parser, parserTypes);
}
// Is there an explicit list of mime types this shouldn't handle?
Set<MediaType> parserExclTypes = mediaTypesListFromDomElement(element, "mime-exclude");
if (! parserExclTypes.isEmpty()) {
parser = ParserDecorator.withoutTypes(parser, parserExclTypes);
}
// All done with decoration
return parser;
}
}
/**
 * Loads the <code>&lt;detectors&gt;</code> section.
 */
private static class DetectorXmlLoader extends XmlLoader<CompositeDetector,Detector> {
    @Override
    boolean supportsComposite() {
        return true;
    }

    @Override
    String getParentTagName() {
        return "detectors";
    }

    @Override
    String getLoaderTagName() {
        return "detector";
    }

    @Override
    Class<? extends Detector> getLoaderClass() {
        return Detector.class;
    }

    /**
     * Hands back the already created media type rules when exactly
     * {@link MimeTypes} is requested as a detector (TIKA-1708); any
     * other class continues with normal loading.
     */
    @Override
    Detector preLoadOne(Class<? extends Detector> loadedClass, String classname,
            MimeTypes mimeTypes) throws TikaException {
        boolean asksForMimeTypes = MimeTypes.class.equals(loadedClass);
        return asksForMimeTypes ? mimeTypes : null;
    }

    @Override
    boolean isComposite(Detector loaded) {
        return loaded instanceof CompositeDetector;
    }

    @Override
    boolean isComposite(Class<? extends Detector> loadedClass) {
        return CompositeDetector.class.isAssignableFrom(loadedClass);
    }

    @Override
    CompositeDetector createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
        return getDefaultDetector(mimeTypes, loader);
    }

    @Override
    CompositeDetector createComposite(List<Detector> detectors, MimeTypes mimeTypes, ServiceLoader loader) {
        return new CompositeDetector(mimeTypes.getMediaTypeRegistry(), detectors);
    }

    /**
     * Instantiates a composite detector class through the first of its
     * public constructors that exists, probing from the most to the
     * least specific signature.
     *
     * @return the new detector, or <code>null</code> if the class has
     *         none of the known constructors
     */
    @Override
    Detector createComposite(Class<? extends Detector> detectorClass,
            List<Detector> childDetectors,
            Set<Class<? extends Detector>> excludeDetectors,
            Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
            throws InvocationTargetException, IllegalAccessException,
            InstantiationException {
        MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();

        // (MimeTypes, ServiceLoader, excludes)
        try {
            return detectorClass
                    .getConstructor(MimeTypes.class, ServiceLoader.class, Collection.class)
                    .newInstance(mimeTypes, loader, excludeDetectors);
        } catch (NoSuchMethodException ignored) {
            // no such constructor, try the next signature
        }
        // (registry, children, excludes)
        try {
            return detectorClass
                    .getConstructor(MediaTypeRegistry.class, List.class, Collection.class)
                    .newInstance(registry, childDetectors, excludeDetectors);
        } catch (NoSuchMethodException ignored) {
            // no such constructor, try the next signature
        }
        // (registry, children)
        try {
            return detectorClass
                    .getConstructor(MediaTypeRegistry.class, List.class)
                    .newInstance(registry, childDetectors);
        } catch (NoSuchMethodException ignored) {
            // no such constructor, try the next signature
        }
        // (children)
        try {
            return detectorClass
                    .getConstructor(List.class)
                    .newInstance(childDetectors);
        } catch (NoSuchMethodException ignored) {
            // none of the known constructors exists
        }
        return null;
    }

    @Override
    Detector decorate(Detector created, Element element) {
        // Detectors are never decorated
        return created;
    }
}
/**
 * {@link XmlLoader} specialisation for {@link Translator}s. It reports
 * {@code "translator"} as its loader tag name, has no parent tag name, and
 * does not support composites.
 */
private static class TranslatorXmlLoader extends XmlLoader<Translator,Translator> {

    @Override
    boolean supportsComposite() {
        return false;
    }

    @Override
    String getParentTagName() {
        return null;
    }

    @Override
    String getLoaderTagName() {
        return "translator";
    }

    @Override
    Class<? extends Translator> getLoaderClass() {
        return Translator.class;
    }

    /** Never pre-loads anything; {@code null} lets normal loading continue. */
    @Override
    Translator preLoadOne(Class<? extends Translator> loadedClass, String classname,
                          MimeTypes mimeTypes) throws TikaException {
        return null;
    }

    @Override
    boolean isComposite(Translator loaded) {
        return false;
    }

    @Override
    boolean isComposite(Class<? extends Translator> loadedClass) {
        return false;
    }

    @Override
    Translator createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
        return getDefaultTranslator(loader);
    }

    /** Returns the first loaded translator; any further entries are not used. */
    @Override
    Translator createComposite(List<Translator> loaded,
                               MimeTypes mimeTypes, ServiceLoader loader) {
        return loaded.get(0);
    }

    /**
     * Always fails, since translators are not composable.
     *
     * @throws InstantiationException on every call
     */
    @Override
    Translator createComposite(Class<? extends Translator> compositeClass,
                               List<Translator> children,
                               Set<Class<? extends Translator>> excludeChildren,
                               Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
            throws InvocationTargetException, IllegalAccessException,
            InstantiationException {
        throw new InstantiationException("Only one translator supported");
    }

    @Override
    Translator decorate(Translator created, Element element) {
        // Translators are never decorated
        return created;
    }
}
/**
 * {@link XmlLoader} specialisation for the {@link ConfigurableThreadPoolExecutor}.
 * It reports {@code "executor-service"} as its loader tag name, has no parent
 * tag name, and does not support composites.
 */
private static class ExecutorServiceXmlLoader extends XmlLoader<ConfigurableThreadPoolExecutor,ConfigurableThreadPoolExecutor> {

    @Override
    boolean supportsComposite() {
        return false;
    }

    @Override
    String getParentTagName() {
        return null;
    }

    @Override
    String getLoaderTagName() {
        return "executor-service";
    }

    @Override
    Class<? extends ConfigurableThreadPoolExecutor> getLoaderClass() {
        return ConfigurableThreadPoolExecutor.class;
    }

    @Override
    boolean isComposite(ConfigurableThreadPoolExecutor loaded) {
        return false;
    }

    @Override
    boolean isComposite(Class<? extends ConfigurableThreadPoolExecutor> loadedClass) {
        return false;
    }

    /** Never pre-loads anything; always returns {@code null}. */
    @Override
    ConfigurableThreadPoolExecutor preLoadOne(
            Class<? extends ConfigurableThreadPoolExecutor> loadedClass, String classname,
            MimeTypes mimeTypes) throws TikaException {
        return null;
    }

    /** Delegates unchanged to the inherited implementation. */
    @Override
    ConfigurableThreadPoolExecutor loadOne(Element element, MimeTypes mimeTypes,
                                           ServiceLoader loader) throws TikaException, IOException {
        return super.loadOne(element, mimeTypes, loader);
    }

    @Override
    ConfigurableThreadPoolExecutor createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
        return getDefaultExecutorService();
    }

    /** Returns the first loaded executor; any further entries are not used. */
    @Override
    ConfigurableThreadPoolExecutor createComposite(List<ConfigurableThreadPoolExecutor> loaded,
                                                   MimeTypes mimeTypes, ServiceLoader loader) {
        return loaded.get(0);
    }

    /**
     * Always fails, since executor services are not composable.
     *
     * @throws InstantiationException on every call
     */
    @Override
    ConfigurableThreadPoolExecutor createComposite(
            Class<? extends ConfigurableThreadPoolExecutor> compositeClass,
            List<ConfigurableThreadPoolExecutor> children,
            Set<Class<? extends ConfigurableThreadPoolExecutor>> excludeChildren,
            Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
            throws InvocationTargetException, IllegalAccessException,
            InstantiationException {
        throw new InstantiationException("Only one executor service supported");
    }

    /**
     * Applies the optional {@code max-threads} and {@code core-threads} child
     * elements to the executor. The maximum pool size is set before the core
     * pool size. Each value is parsed with {@link Integer#parseInt(String)},
     * so a non-numeric value surfaces as a {@link NumberFormatException}.
     *
     * @return the same executor instance that was passed in
     */
    @Override
    ConfigurableThreadPoolExecutor decorate(ConfigurableThreadPoolExecutor created, Element element)
            throws IOException, TikaException {
        Element maxThreadsNode = getChild(element, "max-threads");
        if (maxThreadsNode != null) {
            int maximumPoolSize = Integer.parseInt(getText(maxThreadsNode));
            created.setMaximumPoolSize(maximumPoolSize);
        }

        Element coreThreadsNode = getChild(element, "core-threads");
        if (coreThreadsNode != null) {
            int corePoolSize = Integer.parseInt(getText(coreThreadsNode));
            created.setCorePoolSize(corePoolSize);
        }
        return created;
    }
}
/**
 * {@link XmlLoader} specialisation for {@link EncodingDetector}s. It reports
 * {@code "encodingDetectors"} as its parent tag name and
 * {@code "encodingDetector"} as its loader tag name, and supports composites.
 */
private static class EncodingDetectorXmlLoader extends
        XmlLoader<EncodingDetector, EncodingDetector> {

    @Override
    boolean supportsComposite() {
        return true;
    }

    @Override
    String getParentTagName() {
        return "encodingDetectors";
    }

    @Override
    String getLoaderTagName() {
        return "encodingDetector";
    }

    @Override
    Class<? extends EncodingDetector> getLoaderClass() {
        return EncodingDetector.class;
    }

    @Override
    boolean isComposite(EncodingDetector loaded) {
        return loaded instanceof CompositeEncodingDetector;
    }

    @Override
    boolean isComposite(Class<? extends EncodingDetector> loadedClass) {
        return CompositeEncodingDetector.class.isAssignableFrom(loadedClass);
    }

    /** Never pre-loads anything; {@code null} lets normal loading continue. */
    @Override
    EncodingDetector preLoadOne(Class<? extends EncodingDetector> loadedClass,
                                String classname, MimeTypes mimeTypes) throws TikaException {
        return null;
    }

    @Override
    EncodingDetector createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
        return getDefaultEncodingDetector(loader);
    }

    /** Builds a plain {@link CompositeEncodingDetector} over the given detectors. */
    @Override
    CompositeEncodingDetector createComposite(List<EncodingDetector> encodingDetectors,
                                              MimeTypes mimeTypes, ServiceLoader loader) {
        return new CompositeEncodingDetector(encodingDetectors);
    }

    /**
     * Instantiates {@code encodingDetectorClass} by probing its public
     * constructors in order and using the first one that exists:
     * <ol>
     *   <li>{@code (ServiceLoader, Collection)}</li>
     *   <li>{@code (List)}</li>
     * </ol>
     * A missing constructor is an expected outcome of the probing, so it is
     * skipped quietly rather than dumped to stderr.
     *
     * @param encodingDetectorClass the composite class to instantiate
     * @param childEncodingDetectors children passed to the {@code (List)} constructor
     * @param excludeDetectors classes passed to the {@code (ServiceLoader, Collection)} constructor
     * @param params not used by this implementation
     * @param mimeTypes not used by this implementation
     * @param loader service loader passed to the {@code (ServiceLoader, Collection)} constructor
     * @return the new encoding detector, or {@code null} if neither
     *         constructor is declared (NOTE(review): the caller is presumed
     *         to handle {@code null}; confirm in XmlLoader)
     */
    @Override
    EncodingDetector createComposite(Class<? extends EncodingDetector> encodingDetectorClass,
                                     List<EncodingDetector> childEncodingDetectors,
                                     Set<Class<? extends EncodingDetector>> excludeDetectors,
                                     Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
            throws InvocationTargetException, IllegalAccessException,
            InstantiationException {
        // Try the possible default and composite encoding detector constructors
        try {
            return encodingDetectorClass
                    .getConstructor(ServiceLoader.class, Collection.class)
                    .newInstance(loader, excludeDetectors);
        } catch (NoSuchMethodException ignored) {
            // Not a service-loader-backed composite; try the List constructor
        }
        try {
            return encodingDetectorClass
                    .getConstructor(List.class)
                    .newInstance(childEncodingDetectors);
        } catch (NoSuchMethodException ignored) {
            // Neither known constructor is declared
        }
        return null;
    }

    @Override
    EncodingDetector decorate(EncodingDetector created, Element element) {
        return created; // No decoration of EncodingDetectors
    }
}
/**
 * {@link XmlLoader} specialisation for {@link MetadataFilter}s. It reports
 * {@code "metadataFilters"} as its parent tag name and
 * {@code "metadataFilter"} as its loader tag name, and supports composites.
 */
private static class MetadataFilterXmlLoader extends
        XmlLoader<MetadataFilter, MetadataFilter> {

    @Override
    boolean supportsComposite() {
        return true;
    }

    @Override
    String getParentTagName() {
        return "metadataFilters";
    }

    @Override
    String getLoaderTagName() {
        return "metadataFilter";
    }

    @Override
    Class<? extends MetadataFilter> getLoaderClass() {
        return MetadataFilter.class;
    }

    @Override
    boolean isComposite(MetadataFilter loaded) {
        return loaded instanceof CompositeMetadataFilter;
    }

    @Override
    boolean isComposite(Class<? extends MetadataFilter> loadedClass) {
        return CompositeMetadataFilter.class.isAssignableFrom(loadedClass);
    }

    /** Never pre-loads anything; {@code null} lets normal loading continue. */
    @Override
    MetadataFilter preLoadOne(Class<? extends MetadataFilter> loadedClass,
                              String classname, MimeTypes mimeTypes) throws TikaException {
        return null;
    }

    @Override
    MetadataFilter createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
        return getDefaultMetadataFilter(loader);
    }

    /**
     * Wraps the loaded filters in a {@link DefaultMetadataFilter}.
     * This ignores the service loader.
     */
    @Override
    MetadataFilter createComposite(List<MetadataFilter> loaded, MimeTypes mimeTypes, ServiceLoader loader) {
        return new DefaultMetadataFilter(loaded);
    }

    /**
     * Instantiates {@code metadataFilterClass} by probing its public
     * constructors in order and using the first one that exists:
     * <ol>
     *   <li>{@code (ServiceLoader, Collection)}</li>
     *   <li>{@code (List)}</li>
     * </ol>
     * A missing constructor is an expected outcome of the probing, so it is
     * skipped quietly rather than dumped to stderr.
     *
     * @param metadataFilterClass the composite class to instantiate
     * @param childMetadataFilters children passed to the {@code (List)} constructor
     * @param excludeFilters classes passed to the {@code (ServiceLoader, Collection)} constructor
     * @param params not used by this implementation
     * @param mimeTypes not used by this implementation
     * @param loader service loader passed to the {@code (ServiceLoader, Collection)} constructor
     * @return the new metadata filter, or {@code null} if neither
     *         constructor is declared (NOTE(review): the caller is presumed
     *         to handle {@code null}; confirm in XmlLoader)
     */
    @Override
    MetadataFilter createComposite(Class<? extends MetadataFilter> metadataFilterClass,
                                   List<MetadataFilter> childMetadataFilters,
                                   Set<Class<? extends MetadataFilter>> excludeFilters,
                                   Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
            throws InvocationTargetException, IllegalAccessException,
            InstantiationException {
        // Try the possible default and composite metadata filter constructors
        try {
            return metadataFilterClass
                    .getConstructor(ServiceLoader.class, Collection.class)
                    .newInstance(loader, excludeFilters);
        } catch (NoSuchMethodException ignored) {
            // Not a service-loader-backed composite; try the List constructor
        }
        try {
            return metadataFilterClass
                    .getConstructor(List.class)
                    .newInstance(childMetadataFilters);
        } catch (NoSuchMethodException ignored) {
            // Neither known constructor is declared
        }
        return null;
    }

    @Override
    MetadataFilter decorate(MetadataFilter created, Element element) {
        return created; // No decoration of MetadataFilters
    }
}
/**
 * {@link XmlLoader} specialisation for {@link Fetcher}s. It reports
 * {@code "fetchers"} as its parent tag name and {@code "fetcher"} as its
 * loader tag name, and treats {@link DefaultFetcher} as the composite type.
 */
private static class FetcherXmlLoader extends
        XmlLoader<Fetcher, Fetcher> {

    @Override
    boolean supportsComposite() {
        return true;
    }

    @Override
    String getParentTagName() {
        return "fetchers";
    }

    @Override
    String getLoaderTagName() {
        return "fetcher";
    }

    @Override
    Class<? extends Fetcher> getLoaderClass() {
        return Fetcher.class;
    }

    @Override
    boolean isComposite(Fetcher loaded) {
        return loaded instanceof DefaultFetcher;
    }

    @Override
    boolean isComposite(Class<? extends Fetcher> loadedClass) {
        return DefaultFetcher.class.isAssignableFrom(loadedClass);
    }

    /** Never pre-loads anything; {@code null} lets normal loading continue. */
    @Override
    Fetcher preLoadOne(Class<? extends Fetcher> loadedClass,
                       String classname, MimeTypes mimeTypes) throws TikaException {
        return null;
    }

    @Override
    Fetcher createDefault(MimeTypes mimeTypes, ServiceLoader loader) {
        return getDefaultFetcher(loader);
    }

    /**
     * Wraps the loaded fetchers in a {@link DefaultFetcher}.
     * This ignores the service loader.
     */
    @Override
    Fetcher createComposite(List<Fetcher> loaded, MimeTypes mimeTypes, ServiceLoader loader) {
        return new DefaultFetcher(loaded);
    }

    /**
     * Instantiates {@code fetcherClass} by probing its public constructors
     * in order and using the first one that exists:
     * <ol>
     *   <li>{@code (ServiceLoader, Collection)}</li>
     *   <li>{@code (List)}</li>
     * </ol>
     * A missing constructor is an expected outcome of the probing, so it is
     * skipped quietly rather than dumped to stderr.
     *
     * @param fetcherClass the composite class to instantiate
     * @param childFetchers children passed to the {@code (List)} constructor
     * @param excludeFilters classes passed to the {@code (ServiceLoader, Collection)} constructor
     * @param params not used by this implementation
     * @param mimeTypes not used by this implementation
     * @param loader service loader passed to the {@code (ServiceLoader, Collection)} constructor
     * @return the new fetcher, or {@code null} if neither constructor is
     *         declared (NOTE(review): the caller is presumed to handle
     *         {@code null}; confirm in XmlLoader)
     */
    @Override
    Fetcher createComposite(Class<? extends Fetcher> fetcherClass,
                            List<Fetcher> childFetchers,
                            Set<Class<? extends Fetcher>> excludeFilters,
                            Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
            throws InvocationTargetException, IllegalAccessException,
            InstantiationException {
        // Try the possible default and composite fetcher constructors
        try {
            return fetcherClass
                    .getConstructor(ServiceLoader.class, Collection.class)
                    .newInstance(loader, excludeFilters);
        } catch (NoSuchMethodException ignored) {
            // Not a service-loader-backed composite; try the List constructor
        }
        try {
            return fetcherClass
                    .getConstructor(List.class)
                    .newInstance(childFetchers);
        } catch (NoSuchMethodException ignored) {
            // Neither known constructor is declared
        }
        return null;
    }

    @Override
    Fetcher decorate(Fetcher created, Element element) {
        return created; // No decoration of Fetchers
    }
}
}