// Source: tika-core/src/main/java/org/apache/tika/Tika.java (tika, Git at Google)

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika;

 import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
 import java.net.URL;
 import java.nio.file.Path;
 import java.util.Properties;

 import org.xml.sax.SAXException;

 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.WriteLimitReachedException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.language.translate.Translator;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParsingReader;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.WriteOutContentHandler;

 /**
  * Facade class for accessing Tika functionality. This class hides much of
  * the underlying complexity of the lower level Tika classes and provides
  * simple methods for many common parsing and type detection operations.
  *
  * @see Parser
  * @see Detector
  * @since Apache Tika 0.5
  */
 public class Tika {

     /**
      * The detector instance used by this facade.
      * Assigned once by a constructor; the detect methods delegate to it and
      * it is exposed through {@link #getDetector()}.
      */
     private final Detector detector;

     /**
      * The parser instance used by this facade.
      * Assigned once by a constructor; the parse and parseToString methods
      * delegate to it and it is exposed through {@link #getParser()}.
      */
     private final Parser parser;

     /**
      * The Translator instance used by this facade.
      * Assigned once by a constructor; the translate methods delegate to it
      * and it is exposed through {@link #getTranslator()}.
      */
     private final Translator translator;

     /**
      * Maximum length of the strings returned by the parseToString methods.
      * Used to prevent out of memory problems with huge input documents.
      * The default setting is 100k characters (100 * 1000). The value can be
      * changed with {@link #setMaxStringLength(int)}, where -1 is documented
      * as disabling the limit.
      */
     private int maxStringLength = 100 * 1000;

     /**
      * Builds a facade around the supplied detector and parser. The translator
      * is taken from the default Tika configuration.
      *
      * @param detector type detector
      * @param parser   document parser
      * @since Apache Tika 0.8
      */
     public Tika(Detector detector, Parser parser) {
         // Same wiring as the three-argument constructor, with the default translator.
         this(detector, parser, TikaConfig.getDefaultConfig().getTranslator());
     }

     /**
      * Builds a facade from explicitly supplied collaborators. The arguments
      * are stored as given; no validation or copying is performed.
      *
      * @param detector   type detector
      * @param parser     document parser
      * @param translator text translator
      * @since Apache Tika 1.6
      */
     public Tika(Detector detector, Parser parser, Translator translator) {
         this.translator = translator;
         this.parser = parser;
         this.detector = detector;
     }

     /**
      * Builds a facade from a Tika configuration. The detector and translator
      * come straight from the configuration, and the parser is a new
      * {@link AutoDetectParser} created from that same configuration.
      *
      * @param config Tika configuration
      */
     public Tika(TikaConfig config) {
         this(
                 config.getDetector(),
                 new AutoDetectParser(config),
                 config.getTranslator());
     }

     /**
      * Builds a facade backed by {@link TikaConfig#getDefaultConfig()}.
      */
     public Tika() {
         this(TikaConfig.getDefaultConfig());
     }

     /**
      * Builds a facade around the supplied detector. The parser is a new
      * {@link AutoDetectParser} created with that detector, and the translator
      * is the default one.
      *
      * @param detector type detector
      * @since Apache Tika 0.8
      */
     public Tika(Detector detector) {
         this(
                 detector,
                 new AutoDetectParser(detector));
     }


     /**
      * Determines the media type of a document from its content stream and
      * any supplied metadata. A <code>null</code> stream is allowed; detection
      * then relies on the metadata alone.
      * <p>
      * When the stream supports the
      * {@link InputStream#markSupported() mark feature}, it is marked and
      * reset to its original position before this method returns, and only a
      * limited number of bytes are read.
      * <p>
      * When the stream does not support mark, it is wrapped in a
      * {@link BufferedInputStream} for the duration of the call. The wrapper
      * is discarded afterwards, so bytes it read from the given stream are
      * not pushed back.
      * <p>
      * The given document stream is <em>not</em> closed by this method.
      * <p>
      * Unlike in the {@link #parse(InputStream, Metadata)} method, the
      * given document metadata is <em>not</em> modified by this method.
      *
      * @param stream   the document stream, or <code>null</code>
      * @param metadata document metadata
      * @return detected media type
      * @throws IOException if the stream can not be read
      */
     public String detect(InputStream stream, Metadata metadata) throws IOException {
         InputStream source = stream;
         if (source != null && !source.markSupported()) {
             // Hand the detector a stream that supports mark/reset.
             source = new BufferedInputStream(source);
         }
         return detector.detect(source, metadata).toString();
     }

     /**
      * Determines the media type of a document from its content stream and
      * its name. The name is recorded under
      * {@link TikaCoreProperties#RESOURCE_NAME_KEY} in a fresh metadata
      * object, and detection is delegated to
      * {@link #detect(InputStream, Metadata)}.
      * <p>
      * If the document stream supports the
      * {@link InputStream#markSupported() mark feature}, then the stream is
      * marked and reset to the original position before this method returns.
      * Only a limited number of bytes are read from the stream.
      * <p>
      * The given document stream is <em>not</em> closed by this method.
      *
      * @param stream the document stream
      * @param name   document name
      * @return detected media type
      * @throws IOException if the stream can not be read
      * @since Apache Tika 0.9
      */
     public String detect(InputStream stream, String name) throws IOException {
         Metadata nameHint = new Metadata();
         nameHint.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);

         return detect(stream, nameHint);
     }

     /**
      * Determines the media type of a document from its content stream alone.
      * Delegates to {@link #detect(InputStream, Metadata)} with an empty
      * metadata object.
      * <p>
      * If the document stream supports the
      * {@link InputStream#markSupported() mark feature}, then the stream is
      * marked and reset to the original position before this method returns.
      * Only a limited number of bytes are read from the stream.
      * <p>
      * The given document stream is <em>not</em> closed by this method.
      *
      * @param stream the document stream
      * @return detected media type
      * @throws IOException if the stream can not be read
      */
     public String detect(InputStream stream) throws IOException {
         Metadata noHints = new Metadata();
         return detect(stream, noHints);
     }

     /**
      * Determines the media type of a document from its leading bytes and its
      * name. The bytes are wrapped in a {@link TikaInputStream} and passed to
      * {@link #detect(InputStream, String)}.
      * <p>
      * For best results at least a few kilobytes of the document data
      * are needed. See also the other detect() methods for better
      * alternatives when you have more than just the document prefix
      * available for type detection.
      *
      * @param prefix first few bytes of the document
      * @param name   document name
      * @return detected media type
      * @throws IllegalStateException if an IOException occurs while detecting
      *                               from the in-memory bytes
      * @since Apache Tika 0.9
      */
     public String detect(byte[] prefix, String name) {
         try (InputStream prefixStream = TikaInputStream.get(prefix)) {
             return detect(prefixStream, name);
         } catch (IOException e) {
             // This signature declares no checked exception, so rethrow unchecked.
             throw new IllegalStateException("Unexpected IOException", e);
         }
     }

     /**
      * Determines the media type of a document from its leading bytes. The
      * bytes are wrapped in a {@link TikaInputStream} and passed to
      * {@link #detect(InputStream)}.
      * <p>
      * For best results at least a few kilobytes of the document data
      * are needed. See also the other detect() methods for better
      * alternatives when you have more than just the document prefix
      * available for type detection.
      *
      * @param prefix first few bytes of the document
      * @return detected media type
      * @throws IllegalStateException if an IOException occurs while detecting
      *                               from the in-memory bytes
      * @since Apache Tika 0.9
      */
     public String detect(byte[] prefix) {
         try (InputStream prefixStream = TikaInputStream.get(prefix)) {
             return detect(prefixStream);
         } catch (IOException e) {
             // This signature declares no checked exception, so rethrow unchecked.
             throw new IllegalStateException("Unexpected IOException", e);
         }
     }

     /**
      * Determines the media type of the file at the given path, based on the
      * document content and a potential known file extension.
      * <p>
      * The file is opened through {@link TikaInputStream} with a fresh
      * metadata object, which is then used for detection. The stream is
      * closed before this method returns.
      * <p>
      * Use the {@link #detect(String)} method when you want to detect the
      * type of the document without actually accessing the file.
      *
      * @param path the path of the file
      * @return detected media type
      * @throws IOException if the file can not be read
      */
     public String detect(Path path) throws IOException {
         Metadata fileMetadata = new Metadata();
         try (InputStream content = TikaInputStream.get(path, fileMetadata)) {
             return detect(content, fileMetadata);
         }
     }

     /**
      * Determines the media type of the given file, based on the document
      * content and a potential known file extension.
      * <p>
      * The file is opened through {@link TikaInputStream} with a fresh
      * metadata object, which is then used for detection. The stream is
      * closed before this method returns.
      * <p>
      * Use the {@link #detect(String)} method when you want to detect the
      * type of the document without actually accessing the file.
      *
      * @param file the file
      * @return detected media type
      * @throws IOException if the file can not be read
      * @see #detect(Path)
      */
     public String detect(File file) throws IOException {
         Metadata fileMetadata = new Metadata();
         try (@SuppressWarnings("deprecation") InputStream content =
                      TikaInputStream.get(file, fileMetadata)) {
             return detect(content, fileMetadata);
         }
     }

     /**
      * Determines the media type of the resource at the given URL, based on
      * the document content and a potential known file extension included in
      * the URL.
      * <p>
      * The resource is opened through {@link TikaInputStream} with a fresh
      * metadata object, which is then used for detection. The stream is
      * closed before this method returns.
      * <p>
      * Use the {@link #detect(String)} method when you want to detect the
      * type of the document without actually accessing the URL.
      *
      * @param url the URL of the resource
      * @return detected media type
      * @throws IOException if the resource can not be read
      */
     public String detect(URL url) throws IOException {
         Metadata resourceMetadata = new Metadata();
         try (InputStream content = TikaInputStream.get(url, resourceMetadata)) {
             return detect(content, resourceMetadata);
         }
     }

     /**
      * Determines the media type of a document from its file name only, using
      * known file name extensions. No content is read: this delegates to
      * {@link #detect(InputStream, String)} with a <code>null</code> stream.
      * <p>
      * The given name can also be a URL or a full file path. In such cases
      * only the file name part of the string is used for type detection.
      *
      * @param name the file name of the document
      * @return detected media type
      * @throws IllegalStateException if the delegate unexpectedly throws an
      *                               IOException
      */
     public String detect(String name) {
         // Typed as InputStream so the (InputStream, String) overload is selected.
         final InputStream noContent = null;
         try {
             return detect(noContent, name);
         } catch (IOException e) {
             throw new IllegalStateException("Unexpected IOException", e);
         }
     }

     /**
      * Translates text from one language to another using this facade's
      * translator.
      *
      * @param text           The text to translate.
      * @param sourceLanguage The input text language (for example, "hi").
      * @param targetLanguage The desired output language (for example, "fr").
      * @return The translated text. If translation is unavailable (client keys not set), returns
      * the same text back.
      * @throws IllegalStateException if the translator throws any exception;
      *                               the original exception is kept as the cause
      * @see org.apache.tika.language.translate.Translator
      */
     public String translate(String text, String sourceLanguage, String targetLanguage) {
         final String translated;
         try {
             translated = translator.translate(text, sourceLanguage, targetLanguage);
         } catch (Exception e) {
             throw new IllegalStateException("Error translating data.", e);
         }
         return translated;
     }

     /**
      * Translates text into the given language using this facade's
      * translator, which attempts to auto-detect the source language.
      *
      * @param text           The text to translate.
      * @param targetLanguage The desired output language (for example, "en").
      * @return The translated text. If translation is unavailable (client keys not set), returns
      * the same text back.
      * @throws IllegalStateException if the translator throws any exception;
      *                               the original exception is kept as the cause
      * @see org.apache.tika.language.translate.Translator
      */
     public String translate(String text, String targetLanguage) {
         final String translated;
         try {
             translated = translator.translate(text, targetLanguage);
         } catch (Exception e) {
             throw new IllegalStateException("Error translating data.", e);
         }
         return translated;
     }


     /**
      * Parses a document and exposes the extracted text through a reader.
      * Input metadata like a file name or a content type hint can be passed
      * in the given metadata instance. Metadata information extracted from
      * the document is returned in that same metadata instance.
      * <p>
      * A new {@link ParseContext} is created for the call and this facade's
      * parser is registered in it under {@code Parser.class}.
      * <p>
      * The returned reader will be responsible for closing the given stream.
      * The stream and any associated resources will be closed at or before
      * the time when the {@link Reader#close()} method is called.
      *
      * @param stream   the document to be parsed
      * @param metadata where document's metadata will be populated
      * @return extracted text content
      * @throws IOException if the document can not be read or parsed
      */
     public Reader parse(InputStream stream, Metadata metadata) throws IOException {
         ParseContext parseContext = new ParseContext();
         parseContext.set(Parser.class, parser);

         Reader textReader = new ParsingReader(parser, stream, metadata, parseContext);
         return textReader;
     }

     /**
      * Parses a document and exposes the extracted text through a reader.
      * Delegates to {@link #parse(InputStream, Metadata)} with a fresh
      * metadata object that is not made available to the caller.
      * <p>
      * The returned reader will be responsible for closing the given stream.
      * The stream and any associated resources will be closed at or before
      * the time when the {@link Reader#close()} method is called.
      *
      * @param stream the document to be parsed
      * @return extracted text content
      * @throws IOException if the document can not be read or parsed
      */
     public Reader parse(InputStream stream) throws IOException {
         Metadata discarded = new Metadata();
         return parse(stream, discarded);
     }

     /**
      * Parses the file at the given path and returns the extracted text content.
      * <p>
      * Metadata information extracted from the document is returned in
      * the supplied metadata instance.
      * <p>
      * The file is opened by this method. On success the returned reader owns
      * the stream, as described for {@link #parse(InputStream, Metadata)}.
      * If the reader can not be created, the stream is closed here before the
      * exception is propagated, so the file handle does not leak.
      *
      * @param path     the path of the file to be parsed
      * @param metadata where document's metadata will be populated
      * @return extracted text content
      * @throws IOException if the file can not be read or parsed
      */
     public Reader parse(Path path, Metadata metadata) throws IOException {
         InputStream stream = TikaInputStream.get(path, metadata);
         try {
             return parse(stream, metadata);
         } catch (IOException | RuntimeException e) {
             // No reader was returned, so nobody else will close the stream.
             try {
                 stream.close();
             } catch (IOException closeFailure) {
                 // Keep the original failure primary; record the close problem on it.
                 e.addSuppressed(closeFailure);
             }
             throw e;
         }
     }

     /**
      * Parses the file at the given path and returns the extracted text content.
      * Delegates to {@link #parse(Path, Metadata)} with a fresh metadata
      * object that is not made available to the caller.
      *
      * @param path the path of the file to be parsed
      * @return extracted text content
      * @throws IOException if the file can not be read or parsed
      */
     public Reader parse(Path path) throws IOException {
         Metadata discarded = new Metadata();
         return parse(path, discarded);
     }

     /**
      * Parses the given file and returns the extracted text content.
      * <p>
      * Metadata information extracted from the document is returned in
      * the supplied metadata instance.
      * <p>
      * The file is opened by this method. On success the returned reader owns
      * the stream, as described for {@link #parse(InputStream, Metadata)}.
      * If the reader can not be created, the stream is closed here before the
      * exception is propagated, so the file handle does not leak.
      *
      * @param file     the file to be parsed
      * @param metadata where document's metadata will be populated
      * @return extracted text content
      * @throws IOException if the file can not be read or parsed
      * @see #parse(Path)
      */
     public Reader parse(File file, Metadata metadata) throws IOException {
         @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata);
         try {
             return parse(stream, metadata);
         } catch (IOException | RuntimeException e) {
             // No reader was returned, so nobody else will close the stream.
             try {
                 stream.close();
             } catch (IOException closeFailure) {
                 // Keep the original failure primary; record the close problem on it.
                 e.addSuppressed(closeFailure);
             }
             throw e;
         }
     }

     /**
      * Parses the given file and returns the extracted text content.
      * Delegates to {@link #parse(File, Metadata)} with a fresh metadata
      * object that is not made available to the caller.
      *
      * @param file the file to be parsed
      * @return extracted text content
      * @throws IOException if the file can not be read or parsed
      * @see #parse(Path)
      */
     public Reader parse(File file) throws IOException {
         Metadata discarded = new Metadata();
         return parse(file, discarded);
     }

     /**
      * Parses the resource at the given URL and returns the extracted
      * text content.
      * <p>
      * The resource is opened by this method with a fresh metadata object.
      * On success the returned reader owns the stream, as described for
      * {@link #parse(InputStream, Metadata)}. If the reader can not be
      * created, the stream is closed here before the exception is
      * propagated, so the connection does not leak.
      *
      * @param url the URL of the resource to be parsed
      * @return extracted text content
      * @throws IOException if the resource can not be read or parsed
      */
     public Reader parse(URL url) throws IOException {
         Metadata metadata = new Metadata();
         InputStream stream = TikaInputStream.get(url, metadata);
         try {
             return parse(stream, metadata);
         } catch (IOException | RuntimeException e) {
             // No reader was returned, so nobody else will close the stream.
             try {
                 stream.close();
             } catch (IOException closeFailure) {
                 // Keep the original failure primary; record the close problem on it.
                 e.addSuppressed(closeFailure);
             }
             throw e;
         }
     }

     /**
      * Parses a document and returns the extracted text as a string, using
      * this facade's current maximum string length. Delegates to
      * {@link #parseToString(InputStream, Metadata, int)}.
      * The given input stream is closed by this method.
      * <p>
      * To avoid unpredictable excess memory use, the returned string contains
      * only up to {@link #getMaxStringLength()} first characters extracted
      * from the input document. Use the {@link #setMaxStringLength(int)}
      * method to adjust this limitation.
      * <p>
      * <strong>NOTE:</strong> Unlike most other Tika methods that take an
      * {@link InputStream}, this method will close the given stream for
      * you as a convenience. With other methods you are still responsible
      * for closing the stream or a wrapper instance returned by Tika.
      *
      * @param stream   the document to be parsed
      * @param metadata document metadata
      * @return extracted text content
      * @throws IOException   if the document can not be read
      * @throws TikaException if the document can not be parsed
      */
     public String parseToString(InputStream stream, Metadata metadata)
             throws IOException, TikaException {
         final int limit = this.maxStringLength;
         return parseToString(stream, metadata, limit);
     }

     /**
      * Parses the given document and returns the extracted text content.
      * The given input stream is closed by this method. This method lets
      * you control the maxStringLength per call.
      * <p>
      * To avoid unpredictable excess memory use, the returned string contains
      * only up to maxLength (parameter) first characters extracted
      * from the input document. When that limit is reached, parsing stops
      * and the text collected so far is returned.
      * <p>
      * <strong>NOTE:</strong> Unlike most other Tika methods that take an
      * {@link InputStream}, this method will close the given stream for
      * you as a convenience. With other methods you are still responsible
      * for closing the stream or a wrapper instance returned by Tika.
      * <p>
      * If parsing throws and closing the stream then fails too, the parse
      * exception is the one reported; the close failure is attached to it as
      * a suppressed exception instead of replacing it. A close failure after
      * a parse that ended normally is still thrown.
      *
      * @param stream    the document to be parsed
      * @param metadata  document metadata
      * @param maxLength maximum length of the returned string
      * @return extracted text content
      * @throws IOException   if the document can not be read
      * @throws TikaException if the document can not be parsed
      */
     public String parseToString(InputStream stream, Metadata metadata, int maxLength)
             throws IOException, TikaException {
         WriteOutContentHandler handler = new WriteOutContentHandler(maxLength);
         // try-with-resources closes the stream on every path and keeps a
         // close-time IOException from masking an exception thrown by the parser.
         try (InputStream input = stream) {
             ParseContext context = new ParseContext();
             context.set(Parser.class, parser);
             parser.parse(input, new BodyContentHandler(handler), metadata, context);
         } catch (SAXException e) {
             // Reaching the write limit is the expected way to stop early;
             // anything else is a genuine failure.
             if (!WriteLimitReachedException.isWriteLimitReached(e)) {
                 // This should never happen with BodyContentHandler...
                 throw new TikaException("Unexpected SAX processing failure", e);
             }
         }
         return handler.toString();
     }

     /**
      * Parses a document and returns the extracted text as a string.
      * Delegates to {@link #parseToString(InputStream, Metadata)} with a fresh
      * metadata object that is not made available to the caller.
      * The given input stream is closed by this method.
      * <p>
      * To avoid unpredictable excess memory use, the returned string contains
      * only up to {@link #getMaxStringLength()} first characters extracted
      * from the input document. Use the {@link #setMaxStringLength(int)}
      * method to adjust this limitation.
      * <p>
      * <strong>NOTE:</strong> Unlike most other Tika methods that take an
      * {@link InputStream}, this method will close the given stream for
      * you as a convenience. With other methods you are still responsible
      * for closing the stream or a wrapper instance returned by Tika.
      *
      * @param stream the document to be parsed
      * @return extracted text content
      * @throws IOException   if the document can not be read
      * @throws TikaException if the document can not be parsed
      */
     public String parseToString(InputStream stream) throws IOException, TikaException {
         Metadata discarded = new Metadata();
         return parseToString(stream, discarded);
     }

     /**
      * Parses the file at the given path and returns the extracted text as a
      * string. The file is opened through {@link TikaInputStream} with a
      * fresh metadata object and handed to
      * {@link #parseToString(InputStream, Metadata)}, which closes the stream.
      * <p>
      * To avoid unpredictable excess memory use, the returned string contains
      * only up to {@link #getMaxStringLength()} first characters extracted
      * from the input document. Use the {@link #setMaxStringLength(int)}
      * method to adjust this limitation.
      *
      * @param path the path of the file to be parsed
      * @return extracted text content
      * @throws IOException   if the file can not be read
      * @throws TikaException if the file can not be parsed
      */
     public String parseToString(Path path) throws IOException, TikaException {
         Metadata fileMetadata = new Metadata();
         return parseToString(TikaInputStream.get(path, fileMetadata), fileMetadata);
     }

     /**
      * Parses the given file and returns the extracted text as a string.
      * The file is opened through {@link TikaInputStream} with a fresh
      * metadata object and handed to
      * {@link #parseToString(InputStream, Metadata)}, which closes the stream.
      * <p>
      * To avoid unpredictable excess memory use, the returned string contains
      * only up to {@link #getMaxStringLength()} first characters extracted
      * from the input document. Use the {@link #setMaxStringLength(int)}
      * method to adjust this limitation.
      *
      * @param file the file to be parsed
      * @return extracted text content
      * @throws IOException   if the file can not be read
      * @throws TikaException if the file can not be parsed
      * @see #parseToString(Path)
      */
     public String parseToString(File file) throws IOException, TikaException {
         Metadata fileMetadata = new Metadata();
         @SuppressWarnings("deprecation") InputStream content =
                 TikaInputStream.get(file, fileMetadata);
         return parseToString(content, fileMetadata);
     }

     /**
      * Parses the resource at the given URL and returns the extracted text as
      * a string. The resource is opened through {@link TikaInputStream} with
      * a fresh metadata object and handed to
      * {@link #parseToString(InputStream, Metadata)}, which closes the stream.
      * <p>
      * To avoid unpredictable excess memory use, the returned string contains
      * only up to {@link #getMaxStringLength()} first characters extracted
      * from the input document. Use the {@link #setMaxStringLength(int)}
      * method to adjust this limitation.
      *
      * @param url the URL of the resource to be parsed
      * @return extracted text content
      * @throws IOException   if the resource can not be read
      * @throws TikaException if the resource can not be parsed
      */
     public String parseToString(URL url) throws IOException, TikaException {
         Metadata resourceMetadata = new Metadata();
         return parseToString(TikaInputStream.get(url, resourceMetadata), resourceMetadata);
     }

     /**
      * Reports the current cap on the length of strings produced by the
      * parseToString methods that do not take an explicit limit.
      *
      * @return maximum string length, or -1 if the limit has been disabled
      * @since Apache Tika 0.7
      */
     public int getMaxStringLength() {
         return this.maxStringLength;
     }

     /**
      * Changes the cap on the length of strings produced by the parseToString
      * methods that do not take an explicit limit. The value is stored as
      * given, without validation.
      *
      * @param maxStringLength maximum string length,
      *                        or -1 to disable this limit
      * @since Apache Tika 0.7
      */
     public void setMaxStringLength(int maxStringLength) {
         this.maxStringLength = maxStringLength;
     }

     /**
      * Gives access to the parser this facade delegates to.
      *
      * @return parser instance
      * @since Apache Tika 0.10
      */
     public Parser getParser() {
         return this.parser;
     }

     /**
      * Gives access to the detector this facade delegates to.
      *
      * @return detector instance
      * @since Apache Tika 0.10
      */
     public Detector getDetector() {
         return this.detector;
     }

     /**
      * Gives access to the translator this facade delegates to.
      *
      * @return translator instance
      * @since Tika 1.6
      */
     public Translator getTranslator() {
         return this.translator;
     }

     //--------------------------------------------------------------< Object >

     /**
      * Returns a short description of this library, including its version
      * when that can be determined.
      * <p>
      * The version is looked up in the {@code pom.properties} class path
      * resource of the tika-core artifact each time this method is called.
      * If the resource is missing, unreadable, or has no {@code version}
      * entry, the plain name is returned.
      *
      * @return "Apache Tika " followed by the version, or just "Apache Tika"
      */
     @Override
     public String toString() {
         String version = null;

         try (InputStream stream = Tika.class
                 .getResourceAsStream("/META-INF/maven/org.apache.tika/tika-core/pom.properties")) {
             if (stream != null) {
                 Properties properties = new Properties();
                 properties.load(stream);
                 version = properties.getProperty("version");
             }
         } catch (Exception ignored) {
             // Deliberately best effort: toString() should not fail because the
             // version metadata is unavailable, so fall back to the plain name.
         }

         return version != null ? "Apache Tika " + version : "Apache Tika";
     }

 }