trunk/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java - jackrabbit-oak - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.jackrabbit.oak.plugins.index.search.spi.binary;

 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
 import java.util.Collections;
 import java.util.List;
 import java.util.Set;
 import java.util.concurrent.Callable;
 import java.util.concurrent.TimeoutException;

 import com.google.common.collect.Lists;
 import com.google.common.io.CountingInputStream;
 import org.apache.commons.io.IOUtils;
 import org.apache.jackrabbit.JcrConstants;
 import org.apache.jackrabbit.oak.api.Blob;
 import org.apache.jackrabbit.oak.api.PropertyState;
 import org.apache.jackrabbit.oak.api.Type;
 import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
 import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
 import org.apache.jackrabbit.oak.plugins.index.search.ExtractedTextCache;
 import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
 import org.apache.jackrabbit.oak.plugins.index.search.spi.editor.FulltextIndexEditorContext;
 import org.apache.jackrabbit.oak.spi.state.NodeState;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.jetbrains.annotations.Nullable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.xml.sax.SAXException;

 import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
 import static org.apache.jackrabbit.oak.plugins.index.search.spi.editor.FulltextIndexEditor.TEXT_EXTRACTION_ERROR;

 /**
  *
  */
 public class FulltextBinaryTextExtractor {

   private static final Logger log = LoggerFactory.getLogger(FulltextBinaryTextExtractor.class);
   private static final Parser defaultParser = createDefaultParser();
   private static final long SMALL_BINARY = Long.getLong("oak.search.smallBinary", 16 * 1024);
   private final TextExtractionStats textExtractionStats = new TextExtractionStats();
   private final ExtractedTextCache extractedTextCache;
   private final IndexDefinition definition;
   private final boolean reindex;
   private Parser parser;
   private TikaConfigHolder tikaConfig;
   /**
    * The media types supported by the parser used.
    */
   private Set<MediaType> supportedMediaTypes;
   private Set<MediaType> nonIndexedMediaType;

   public FulltextBinaryTextExtractor(ExtractedTextCache extractedTextCache, IndexDefinition definition, boolean reindex) {
     this.extractedTextCache = extractedTextCache;
     this.definition = definition;
     this.reindex = reindex;
   }

   public void done(boolean reindex){
     textExtractionStats.log(reindex);
     textExtractionStats.collectStats(extractedTextCache);
   }

   public List<String> newBinary(
       PropertyState property, NodeState state, String path) {
     List<String> values = Lists.newArrayList();
     Metadata metadata = new Metadata();

     //jcr:mimeType is mandatory for a binary to be indexed
     String type = state.getString(JcrConstants.JCR_MIMETYPE);
     type = definition.getTikaMappedMimeType(type);

     if (type == null || !isSupportedMediaType(type)) {
       log.trace(
           "[{}] Ignoring binary content for node {} due to unsupported (or null) jcr:mimeType [{}]",
           getIndexName(), path, type);
       return values;
     }

     metadata.set(Metadata.CONTENT_TYPE, type);
     if (JCR_DATA.equals(property.getName())) {
       String encoding = state.getString(JcrConstants.JCR_ENCODING);
       if (encoding != null) { // not mandatory
         metadata.set(Metadata.CONTENT_ENCODING, encoding);
       }
     }

     for (Blob v : property.getValue(Type.BINARIES)) {
       String value = parseStringValue(v, metadata, path, property.getName());
       if (value == null){
         continue;
       }

       values.add(value);
     }
     return values;
   }

   private String parseStringValue(Blob v, Metadata metadata, String path, String propertyName) {
     String text = extractedTextCache.get(path, propertyName, v, reindex);
     if (text == null){
       text = parseStringValue0(v, metadata, path);
     }
     return text;
   }

   private String parseStringValue0(Blob v, Metadata metadata, String path) {
     WriteOutContentHandler handler = new WriteOutContentHandler(definition.getMaxExtractLength());
     long start = System.currentTimeMillis();
     long bytesRead = 0;
     long length = v.length();
     if (log.isDebugEnabled()) {
       log.debug("Extracting {}, {} bytes, id {}", path, length, v.getContentIdentity());
     }
     try {
       CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
       try {
         if (length > SMALL_BINARY) {
           String name = "Extracting " + path + ", " + length + " bytes";
           extractedTextCache.process(name, new Callable<Void>() {
             @Override
             public Void call() throws Exception {
               getParser().parse(stream, handler, metadata, new ParseContext());
               return null;
             }
           });
         } else {
           getParser().parse(stream, handler, metadata, new ParseContext());
         }
       } finally {
         bytesRead = stream.getCount();
         stream.close();
       }
     } catch (LinkageError e) {
       // Capture errors caused by extraction libraries
       // not being present. This is equivalent to disabling
       // selected media types in configuration, so we can simply
       // ignore these errors.
       log.debug(
           "[{}] Failed to extract text from a binary property: {}."
               + " This often happens when some media types are disabled by configuration."
               + " The stack trace is included to flag some 'unintended' failures",
           getIndexName(), path, e);
       extractedTextCache.put(v, ExtractedText.ERROR);
       return TEXT_EXTRACTION_ERROR;
     } catch (TimeoutException t) {
       log.warn(
           "[{}] Failed to extract text from a binary property due to timeout: {}.",
           getIndexName(), path);
       extractedTextCache.put(v, ExtractedText.ERROR);
       extractedTextCache.putTimeout(v, ExtractedText.ERROR);
       return TEXT_EXTRACTION_ERROR;
     } catch (Throwable t) {
       // Capture and report any other full text extraction problems.
       // The special STOP exception is used for normal termination.
       if (!handler.isWriteLimitReached(t)) {
         log.debug(
             "[{}] Failed to extract text from a binary property: {}."
                 + " This is a fairly common case, and nothing to"
                 + " worry about. The stack trace is included to"
                 + " help improve the text extraction feature.",
             getIndexName(), path, t);
         extractedTextCache.put(v, ExtractedText.ERROR);
         return TEXT_EXTRACTION_ERROR;
       } else {
         log.debug("Extracted text size exceeded configured limit({})", definition.getMaxExtractLength());
       }
     }
     String result = handler.toString();
     if (bytesRead > 0) {
       long time = System.currentTimeMillis() - start;
       int len = result.length();
       recordTextExtractionStats(time, bytesRead, len);
       if (log.isDebugEnabled()) {
         log.debug("Extracting {} took {} ms, {} bytes read, {} text size",
             path, time, bytesRead, len);
       }
     }
     extractedTextCache.put(v,  new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, result));
     return result;
   }

   private void recordTextExtractionStats(long timeInMillis, long bytesRead, int textLength) {
     textExtractionStats.addStats(timeInMillis, bytesRead, textLength);
   }

   private String getIndexName() {
     return definition.getIndexName();
   }

   //~-------------------------------------------< Tika >

   public TikaConfig getTikaConfig(){
     if (tikaConfig == null) {
       tikaConfig = initializeTikaConfig(definition);
     }
     return tikaConfig.config;
   }

   private Parser getParser() {
     if (parser == null){
       parser = initializeTikaParser(definition);
     }
     return parser;
   }

   private boolean isSupportedMediaType(String type) {
     if (supportedMediaTypes == null) {
       supportedMediaTypes = getParser().getSupportedTypes(new ParseContext());
       nonIndexedMediaType = getNonIndexedMediaTypes();
     }
     MediaType mediaType = MediaType.parse(type);
     return supportedMediaTypes.contains(mediaType) && !nonIndexedMediaType.contains(mediaType);
   }

   private Set<MediaType> getNonIndexedMediaTypes() {
     InputStream configStream = null;
     String configSource = null;
     try {
       if (definition.hasCustomTikaConfig()) {
         configSource = String.format("Custom config at %s", definition.getIndexPath());
         configStream = definition.getTikaConfig();
       } else {
         URL configUrl = FulltextIndexEditorContext.class.getResource("tika-config.xml");
         configSource = "Default : tika-config.xml";
         if (configUrl != null) {
           configStream = configUrl.openStream();
         }
       }

       if (configStream != null) {
         return TikaParserConfig.getNonIndexedMediaTypes(configStream);
       }
     } catch (TikaException | IOException | SAXException e) {
       log.warn("Tika configuration not available : " + configSource, e);
     } finally {
       IOUtils.closeQuietly(configStream);
     }
     return Collections.emptySet();
   }


   private static TikaConfigHolder initializeTikaConfig(@Nullable IndexDefinition definition) {
     ClassLoader current = Thread.currentThread().getContextClassLoader();
     InputStream configStream = null;
     String configSource = null;

     try {
       Thread.currentThread().setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
       if (definition != null && definition.hasCustomTikaConfig()) {
         log.debug("[{}] Using custom tika config", definition.getIndexName());
         configSource = "Custom config at " + definition.getIndexPath();
         configStream = definition.getTikaConfig();
       } else {
         URL configUrl = FulltextIndexEditorContext.class.getResource("tika-config.xml");
         if (configUrl != null) {
           configSource = configUrl.toString();
           configStream = configUrl.openStream();
         }
       }

       if (configStream != null) {
         return new TikaConfigHolder(new TikaConfig(configStream), configSource);
       }
     } catch (TikaException | IOException | SAXException e) {
       log.warn("Tika configuration not available : " + configSource, e);
     } finally {
       IOUtils.closeQuietly(configStream);
       Thread.currentThread().setContextClassLoader(current);
     }
     return new TikaConfigHolder(TikaConfig.getDefaultConfig(), "Default Config");
   }

   private Parser initializeTikaParser(IndexDefinition definition) {
     ClassLoader current = Thread.currentThread().getContextClassLoader();
     try {
       if (definition.hasCustomTikaConfig()) {
         Thread.currentThread().setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
         return new AutoDetectParser(getTikaConfig());
       }
     } finally {
       Thread.currentThread().setContextClassLoader(current);
     }
     return defaultParser;
   }

   private static AutoDetectParser createDefaultParser() {
     ClassLoader current = Thread.currentThread().getContextClassLoader();
     TikaConfigHolder configHolder = null;
     try {
       configHolder = initializeTikaConfig(null);
       Thread.currentThread().setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
       log.info("Loaded default Tika Config from classpath {}", configHolder);
       return new AutoDetectParser(configHolder.config);
     } catch (Exception e) {
       log.warn("Tika configuration not available : " + configHolder, e);
     } finally {
       Thread.currentThread().setContextClassLoader(current);
     }
     return new AutoDetectParser();
   }

   private static final class TikaConfigHolder{
     final TikaConfig config;
     final String sourceInfo;

     public TikaConfigHolder(TikaConfig config, String sourceInfo) {
       this.config = config;
       this.sourceInfo = sourceInfo;
     }

     @Override
     public String toString() {
       return sourceInfo;
     }
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.jackrabbit.oak.plugins.index.search.spi.binary;

	import java.io.IOException;
	import java.io.InputStream;
	import java.net.URL;
	import java.util.Collections;
	import java.util.List;
	import java.util.Set;
	import java.util.concurrent.Callable;
	import java.util.concurrent.TimeoutException;

	import com.google.common.collect.Lists;
	import com.google.common.io.CountingInputStream;
	import org.apache.commons.io.IOUtils;
	import org.apache.jackrabbit.JcrConstants;
	import org.apache.jackrabbit.oak.api.Blob;
	import org.apache.jackrabbit.oak.api.PropertyState;
	import org.apache.jackrabbit.oak.api.Type;
	import org.apache.jackrabbit.oak.commons.io.LazyInputStream;
	import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText;
	import org.apache.jackrabbit.oak.plugins.index.search.ExtractedTextCache;
	import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
	import org.apache.jackrabbit.oak.plugins.index.search.spi.editor.FulltextIndexEditorContext;
	import org.apache.jackrabbit.oak.spi.state.NodeState;
	import org.apache.tika.config.TikaConfig;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.mime.MediaType;
	import org.apache.tika.parser.AutoDetectParser;
	import org.apache.tika.parser.ParseContext;
	import org.apache.tika.parser.Parser;
	import org.apache.tika.sax.WriteOutContentHandler;
	import org.jetbrains.annotations.Nullable;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.xml.sax.SAXException;

	import static org.apache.jackrabbit.JcrConstants.JCR_DATA;
	import static org.apache.jackrabbit.oak.plugins.index.search.spi.editor.FulltextIndexEditor.TEXT_EXTRACTION_ERROR;

	/**
	*
	*/
	public class FulltextBinaryTextExtractor {

	private static final Logger log = LoggerFactory.getLogger(FulltextBinaryTextExtractor.class);
	private static final Parser defaultParser = createDefaultParser();
	private static final long SMALL_BINARY = Long.getLong("oak.search.smallBinary", 16 * 1024);
	private final TextExtractionStats textExtractionStats = new TextExtractionStats();
	private final ExtractedTextCache extractedTextCache;
	private final IndexDefinition definition;
	private final boolean reindex;
	private Parser parser;
	private TikaConfigHolder tikaConfig;
	/**
	* The media types supported by the parser used.
	*/
	private Set<MediaType> supportedMediaTypes;
	private Set<MediaType> nonIndexedMediaType;

	public FulltextBinaryTextExtractor(ExtractedTextCache extractedTextCache, IndexDefinition definition, boolean reindex) {
	this.extractedTextCache = extractedTextCache;
	this.definition = definition;
	this.reindex = reindex;
	}

	public void done(boolean reindex){
	textExtractionStats.log(reindex);
	textExtractionStats.collectStats(extractedTextCache);
	}

	public List<String> newBinary(
	PropertyState property, NodeState state, String path) {
	List<String> values = Lists.newArrayList();
	Metadata metadata = new Metadata();

	//jcr:mimeType is mandatory for a binary to be indexed
	String type = state.getString(JcrConstants.JCR_MIMETYPE);
	type = definition.getTikaMappedMimeType(type);

	if (type == null \|\| !isSupportedMediaType(type)) {
	log.trace(
	"[{}] Ignoring binary content for node {} due to unsupported (or null) jcr:mimeType [{}]",
	getIndexName(), path, type);
	return values;
	}

	metadata.set(Metadata.CONTENT_TYPE, type);
	if (JCR_DATA.equals(property.getName())) {
	String encoding = state.getString(JcrConstants.JCR_ENCODING);
	if (encoding != null) { // not mandatory
	metadata.set(Metadata.CONTENT_ENCODING, encoding);
	}
	}

	for (Blob v : property.getValue(Type.BINARIES)) {
	String value = parseStringValue(v, metadata, path, property.getName());
	if (value == null){
	continue;
	}

	values.add(value);
	}
	return values;
	}

	private String parseStringValue(Blob v, Metadata metadata, String path, String propertyName) {
	String text = extractedTextCache.get(path, propertyName, v, reindex);
	if (text == null){
	text = parseStringValue0(v, metadata, path);
	}
	return text;
	}

	private String parseStringValue0(Blob v, Metadata metadata, String path) {
	WriteOutContentHandler handler = new WriteOutContentHandler(definition.getMaxExtractLength());
	long start = System.currentTimeMillis();
	long bytesRead = 0;
	long length = v.length();
	if (log.isDebugEnabled()) {
	log.debug("Extracting {}, {} bytes, id {}", path, length, v.getContentIdentity());
	}
	try {
	CountingInputStream stream = new CountingInputStream(new LazyInputStream(new BlobByteSource(v)));
	try {
	if (length > SMALL_BINARY) {
	String name = "Extracting " + path + ", " + length + " bytes";
	extractedTextCache.process(name, new Callable<Void>() {
	@Override
	public Void call() throws Exception {
	getParser().parse(stream, handler, metadata, new ParseContext());
	return null;
	}
	});
	} else {
	getParser().parse(stream, handler, metadata, new ParseContext());
	}
	} finally {
	bytesRead = stream.getCount();
	stream.close();
	}
	} catch (LinkageError e) {
	// Capture errors caused by extraction libraries
	// not being present. This is equivalent to disabling
	// selected media types in configuration, so we can simply
	// ignore these errors.
	log.debug(
	"[{}] Failed to extract text from a binary property: {}."
	+ " This often happens when some media types are disabled by configuration."
	+ " The stack trace is included to flag some 'unintended' failures",
	getIndexName(), path, e);
	extractedTextCache.put(v, ExtractedText.ERROR);
	return TEXT_EXTRACTION_ERROR;
	} catch (TimeoutException t) {
	log.warn(
	"[{}] Failed to extract text from a binary property due to timeout: {}.",
	getIndexName(), path);
	extractedTextCache.put(v, ExtractedText.ERROR);
	extractedTextCache.putTimeout(v, ExtractedText.ERROR);
	return TEXT_EXTRACTION_ERROR;
	} catch (Throwable t) {
	// Capture and report any other full text extraction problems.
	// The special STOP exception is used for normal termination.
	if (!handler.isWriteLimitReached(t)) {
	log.debug(
	"[{}] Failed to extract text from a binary property: {}."
	+ " This is a fairly common case, and nothing to"
	+ " worry about. The stack trace is included to"
	+ " help improve the text extraction feature.",
	getIndexName(), path, t);
	extractedTextCache.put(v, ExtractedText.ERROR);
	return TEXT_EXTRACTION_ERROR;
	} else {
	log.debug("Extracted text size exceeded configured limit({})", definition.getMaxExtractLength());
	}
	}
	String result = handler.toString();
	if (bytesRead > 0) {
	long time = System.currentTimeMillis() - start;
	int len = result.length();
	recordTextExtractionStats(time, bytesRead, len);
	if (log.isDebugEnabled()) {
	log.debug("Extracting {} took {} ms, {} bytes read, {} text size",
	path, time, bytesRead, len);
	}
	}
	extractedTextCache.put(v, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, result));
	return result;
	}

	private void recordTextExtractionStats(long timeInMillis, long bytesRead, int textLength) {
	textExtractionStats.addStats(timeInMillis, bytesRead, textLength);
	}

	private String getIndexName() {
	return definition.getIndexName();
	}

	//~-------------------------------------------< Tika >

	public TikaConfig getTikaConfig(){
	if (tikaConfig == null) {
	tikaConfig = initializeTikaConfig(definition);
	}
	return tikaConfig.config;
	}

	private Parser getParser() {
	if (parser == null){
	parser = initializeTikaParser(definition);
	}
	return parser;
	}

	private boolean isSupportedMediaType(String type) {
	if (supportedMediaTypes == null) {
	supportedMediaTypes = getParser().getSupportedTypes(new ParseContext());
	nonIndexedMediaType = getNonIndexedMediaTypes();
	}
	MediaType mediaType = MediaType.parse(type);
	return supportedMediaTypes.contains(mediaType) && !nonIndexedMediaType.contains(mediaType);
	}

	private Set<MediaType> getNonIndexedMediaTypes() {
	InputStream configStream = null;
	String configSource = null;
	try {
	if (definition.hasCustomTikaConfig()) {
	configSource = String.format("Custom config at %s", definition.getIndexPath());
	configStream = definition.getTikaConfig();
	} else {
	URL configUrl = FulltextIndexEditorContext.class.getResource("tika-config.xml");
	configSource = "Default : tika-config.xml";
	if (configUrl != null) {
	configStream = configUrl.openStream();
	}
	}

	if (configStream != null) {
	return TikaParserConfig.getNonIndexedMediaTypes(configStream);
	}
	} catch (TikaException \| IOException \| SAXException e) {
	log.warn("Tika configuration not available : " + configSource, e);
	} finally {
	IOUtils.closeQuietly(configStream);
	}
	return Collections.emptySet();
	}


	private static TikaConfigHolder initializeTikaConfig(@Nullable IndexDefinition definition) {
	ClassLoader current = Thread.currentThread().getContextClassLoader();
	InputStream configStream = null;
	String configSource = null;

	try {
	Thread.currentThread().setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
	if (definition != null && definition.hasCustomTikaConfig()) {
	log.debug("[{}] Using custom tika config", definition.getIndexName());
	configSource = "Custom config at " + definition.getIndexPath();
	configStream = definition.getTikaConfig();
	} else {
	URL configUrl = FulltextIndexEditorContext.class.getResource("tika-config.xml");
	if (configUrl != null) {
	configSource = configUrl.toString();
	configStream = configUrl.openStream();
	}
	}

	if (configStream != null) {
	return new TikaConfigHolder(new TikaConfig(configStream), configSource);
	}
	} catch (TikaException \| IOException \| SAXException e) {
	log.warn("Tika configuration not available : " + configSource, e);
	} finally {
	IOUtils.closeQuietly(configStream);
	Thread.currentThread().setContextClassLoader(current);
	}
	return new TikaConfigHolder(TikaConfig.getDefaultConfig(), "Default Config");
	}

	private Parser initializeTikaParser(IndexDefinition definition) {
	ClassLoader current = Thread.currentThread().getContextClassLoader();
	try {
	if (definition.hasCustomTikaConfig()) {
	Thread.currentThread().setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
	return new AutoDetectParser(getTikaConfig());
	}
	} finally {
	Thread.currentThread().setContextClassLoader(current);
	}
	return defaultParser;
	}

	private static AutoDetectParser createDefaultParser() {
	ClassLoader current = Thread.currentThread().getContextClassLoader();
	TikaConfigHolder configHolder = null;
	try {
	configHolder = initializeTikaConfig(null);
	Thread.currentThread().setContextClassLoader(FulltextIndexEditorContext.class.getClassLoader());
	log.info("Loaded default Tika Config from classpath {}", configHolder);
	return new AutoDetectParser(configHolder.config);
	} catch (Exception e) {
	log.warn("Tika configuration not available : " + configHolder, e);
	} finally {
	Thread.currentThread().setContextClassLoader(current);
	}
	return new AutoDetectParser();
	}

	private static final class TikaConfigHolder{
	final TikaConfig config;
	final String sourceInfo;

	public TikaConfigHolder(TikaConfig config, String sourceInfo) {
	this.config = config;
	this.sourceInfo = sourceInfo;
	}

	@Override
	public String toString() {
	return sourceInfo;
	}
	}

	}