/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.tika; |
| |
| import java.io.BufferedInputStream; |
| import java.io.File; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.Reader; |
| import java.net.URL; |
| import java.nio.file.Path; |
| import java.util.Properties; |
| |
| import org.xml.sax.SAXException; |
| |
| import org.apache.tika.config.TikaConfig; |
| import org.apache.tika.detect.Detector; |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.exception.WriteLimitReachedException; |
| import org.apache.tika.io.TikaInputStream; |
| import org.apache.tika.language.translate.Translator; |
| import org.apache.tika.metadata.Metadata; |
| import org.apache.tika.metadata.TikaCoreProperties; |
| import org.apache.tika.parser.AutoDetectParser; |
| import org.apache.tika.parser.ParseContext; |
| import org.apache.tika.parser.Parser; |
| import org.apache.tika.parser.ParsingReader; |
| import org.apache.tika.sax.BodyContentHandler; |
| import org.apache.tika.sax.WriteOutContentHandler; |
| |
| /** |
| * Facade class for accessing Tika functionality. This class hides much of |
| * the underlying complexity of the lower level Tika classes and provides |
| * simple methods for many common parsing and type detection operations. |
| * |
| * @see Parser |
| * @see Detector |
| * @since Apache Tika 0.5 |
| */ |
| public class Tika { |
| |
| /** |
| * The detector instance used by this facade. |
| */ |
| private final Detector detector; |
| |
| /** |
| * The parser instance used by this facade. |
| */ |
| private final Parser parser; |
| |
| /** |
| * The Translator instance used by this facade. |
| */ |
| private final Translator translator; |
| |
| /** |
| * Maximum length of the strings returned by the parseToString methods. |
| * Used to prevent out of memory problems with huge input documents. |
| * The default setting is 100k characters. |
| */ |
| private int maxStringLength = 100 * 1000; |
| |
| /** |
| * Creates a Tika facade using the given detector and parser instances, but the default |
| * Translator. |
| * |
| * @param detector type detector |
| * @param parser document parser |
| * @since Apache Tika 0.8 |
| */ |
| public Tika(Detector detector, Parser parser) { |
| this.detector = detector; |
| this.parser = parser; |
| this.translator = TikaConfig.getDefaultConfig().getTranslator(); |
| } |
| |
| /** |
| * Creates a Tika facade using the given detector, parser, and translator instances. |
| * |
| * @param detector type detector |
| * @param parser document parser |
| * @param translator text translator |
| * @since Apache Tika 1.6 |
| */ |
| public Tika(Detector detector, Parser parser, Translator translator) { |
| this.detector = detector; |
| this.parser = parser; |
| this.translator = translator; |
| } |
| |
| /** |
| * Creates a Tika facade using the given configuration. |
| * |
| * @param config Tika configuration |
| */ |
| public Tika(TikaConfig config) { |
| this(config.getDetector(), new AutoDetectParser(config), config.getTranslator()); |
| } |
| |
| /** |
| * Creates a Tika facade using the default configuration. |
| */ |
| public Tika() { |
| this(TikaConfig.getDefaultConfig()); |
| } |
| |
| /** |
| * Creates a Tika facade using the given detector instance, the |
| * default parser configuration, and the default Translator. |
| * |
| * @param detector type detector |
| * @since Apache Tika 0.8 |
| */ |
| public Tika(Detector detector) { |
| this(detector, new AutoDetectParser(detector)); |
| } |
| |
| |
| /** |
| * Detects the media type of the given document. The type detection is |
| * based on the content of the given document stream and any given |
| * document metadata. The document stream can be <code>null</code>, |
| * in which case only the given document metadata is used for type |
| * detection. |
| * <p> |
| * If the document stream supports the |
| * {@link InputStream#markSupported() mark feature}, then the stream is |
| * marked and reset to the original position before this method returns. |
| * Only a limited number of bytes are read from the stream. |
| * <p> |
| * The given document stream is <em>not</em> closed by this method. |
| * <p> |
| * Unlike in the {@link #parse(InputStream, Metadata)} method, the |
| * given document metadata is <em>not</em> modified by this method. |
| * |
| * @param stream the document stream, or <code>null</code> |
| * @param metadata document metadata |
| * @return detected media type |
| * @throws IOException if the stream can not be read |
| */ |
| public String detect(InputStream stream, Metadata metadata) throws IOException { |
| if (stream == null || stream.markSupported()) { |
| return detector.detect(stream, metadata).toString(); |
| } else { |
| return detector.detect(new BufferedInputStream(stream), metadata).toString(); |
| } |
| } |
| |
| /** |
| * Detects the media type of the given document. The type detection is |
| * based on the content of the given document stream and the name of the |
| * document. |
| * <p> |
| * If the document stream supports the |
| * {@link InputStream#markSupported() mark feature}, then the stream is |
| * marked and reset to the original position before this method returns. |
| * Only a limited number of bytes are read from the stream. |
| * <p> |
| * The given document stream is <em>not</em> closed by this method. |
| * |
| * @param stream the document stream |
| * @param name document name |
| * @return detected media type |
| * @throws IOException if the stream can not be read |
| * @since Apache Tika 0.9 |
| */ |
| public String detect(InputStream stream, String name) throws IOException { |
| Metadata metadata = new Metadata(); |
| metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); |
| return detect(stream, metadata); |
| } |
| |
| /** |
| * Detects the media type of the given document. The type detection is |
| * based on the content of the given document stream. |
| * <p> |
| * If the document stream supports the |
| * {@link InputStream#markSupported() mark feature}, then the stream is |
| * marked and reset to the original position before this method returns. |
| * Only a limited number of bytes are read from the stream. |
| * <p> |
| * The given document stream is <em>not</em> closed by this method. |
| * |
| * @param stream the document stream |
| * @return detected media type |
| * @throws IOException if the stream can not be read |
| */ |
| public String detect(InputStream stream) throws IOException { |
| return detect(stream, new Metadata()); |
| } |
| |
| /** |
| * Detects the media type of the given document. The type detection is |
| * based on the first few bytes of a document and the document name. |
| * <p> |
| * For best results at least a few kilobytes of the document data |
| * are needed. See also the other detect() methods for better |
| * alternatives when you have more than just the document prefix |
| * available for type detection. |
| * |
| * @param prefix first few bytes of the document |
| * @param name document name |
| * @return detected media type |
| * @since Apache Tika 0.9 |
| */ |
| public String detect(byte[] prefix, String name) { |
| try { |
| try (InputStream stream = TikaInputStream.get(prefix)) { |
| return detect(stream, name); |
| } |
| } catch (IOException e) { |
| throw new IllegalStateException("Unexpected IOException", e); |
| } |
| } |
| |
| /** |
| * Detects the media type of the given document. The type detection is |
| * based on the first few bytes of a document. |
| * <p> |
| * For best results at least a few kilobytes of the document data |
| * are needed. See also the other detect() methods for better |
| * alternatives when you have more than just the document prefix |
| * available for type detection. |
| * |
| * @param prefix first few bytes of the document |
| * @return detected media type |
| * @since Apache Tika 0.9 |
| */ |
| public String detect(byte[] prefix) { |
| try { |
| try (InputStream stream = TikaInputStream.get(prefix)) { |
| return detect(stream); |
| } |
| } catch (IOException e) { |
| throw new IllegalStateException("Unexpected IOException", e); |
| } |
| } |
| |
| /** |
| * Detects the media type of the file at the given path. The type |
| * detection is based on the document content and a potential known |
| * file extension. |
| * <p> |
| * Use the {@link #detect(String)} method when you want to detect the |
| * type of the document without actually accessing the file. |
| * |
| * @param path the path of the file |
| * @return detected media type |
| * @throws IOException if the file can not be read |
| */ |
| public String detect(Path path) throws IOException { |
| Metadata metadata = new Metadata(); |
| try (InputStream stream = TikaInputStream.get(path, metadata)) { |
| return detect(stream, metadata); |
| } |
| } |
| |
| /** |
| * Detects the media type of the given file. The type detection is |
| * based on the document content and a potential known file extension. |
| * <p> |
| * Use the {@link #detect(String)} method when you want to detect the |
| * type of the document without actually accessing the file. |
| * |
| * @param file the file |
| * @return detected media type |
| * @throws IOException if the file can not be read |
| * @see #detect(Path) |
| */ |
| public String detect(File file) throws IOException { |
| Metadata metadata = new Metadata(); |
| try (@SuppressWarnings("deprecation") InputStream stream = TikaInputStream |
| .get(file, metadata)) { |
| return detect(stream, metadata); |
| } |
| } |
| |
| /** |
| * Detects the media type of the resource at the given URL. The type |
| * detection is based on the document content and a potential known |
| * file extension included in the URL. |
| * <p> |
| * Use the {@link #detect(String)} method when you want to detect the |
| * type of the document without actually accessing the URL. |
| * |
| * @param url the URL of the resource |
| * @return detected media type |
| * @throws IOException if the resource can not be read |
| */ |
| public String detect(URL url) throws IOException { |
| Metadata metadata = new Metadata(); |
| try (InputStream stream = TikaInputStream.get(url, metadata)) { |
| return detect(stream, metadata); |
| } |
| } |
| |
| /** |
| * Detects the media type of a document with the given file name. |
| * The type detection is based on known file name extensions. |
| * <p> |
| * The given name can also be a URL or a full file path. In such cases |
| * only the file name part of the string is used for type detection. |
| * |
| * @param name the file name of the document |
| * @return detected media type |
| */ |
| public String detect(String name) { |
| try { |
| return detect((InputStream) null, name); |
| } catch (IOException e) { |
| throw new IllegalStateException("Unexpected IOException", e); |
| } |
| } |
| |
| /** |
| * Translate the given text String to and from the given languages. |
| * |
| * @param text The text to translate. |
| * @param sourceLanguage The input text language (for example, "hi"). |
| * @param targetLanguage The desired output language (for example, "fr"). |
| * @return The translated text. If translation is unavailable (client keys not set), returns |
| * the same text back. |
| * @see org.apache.tika.language.translate.Translator |
| */ |
| public String translate(String text, String sourceLanguage, String targetLanguage) { |
| try { |
| return translator.translate(text, sourceLanguage, targetLanguage); |
| } catch (Exception e) { |
| throw new IllegalStateException("Error translating data.", e); |
| } |
| } |
| |
| /** |
| * Translate the given text String to the given language, attempting to auto-detect the |
| * source language. |
| * |
| * @param text The text to translate. |
| * @param targetLanguage The desired output language (for example, "en"). |
| * @return The translated text. If translation is unavailable (client keys not set), returns |
| * the same text back. |
| * @see org.apache.tika.language.translate.Translator |
| */ |
| public String translate(String text, String targetLanguage) { |
| try { |
| return translator.translate(text, targetLanguage); |
| } catch (Exception e) { |
| throw new IllegalStateException("Error translating data.", e); |
| } |
| } |
| |
| |
| /** |
| * Parses the given document and returns the extracted text content. |
| * Input metadata like a file name or a content type hint can be passed |
| * in the given metadata instance. Metadata information extracted from |
| * the document is returned in that same metadata instance. |
| * <p> |
| * The returned reader will be responsible for closing the given stream. |
| * The stream and any associated resources will be closed at or before |
| * the time when the {@link Reader#close()} method is called. |
| * |
| * @param stream the document to be parsed |
| * @param metadata where document's metadata will be populated |
| * @return extracted text content |
| * @throws IOException if the document can not be read or parsed |
| */ |
| public Reader parse(InputStream stream, Metadata metadata) throws IOException { |
| ParseContext context = new ParseContext(); |
| context.set(Parser.class, parser); |
| return new ParsingReader(parser, stream, metadata, context); |
| } |
| |
| /** |
| * Parses the given document and returns the extracted text content. |
| * <p> |
| * The returned reader will be responsible for closing the given stream. |
| * The stream and any associated resources will be closed at or before |
| * the time when the {@link Reader#close()} method is called. |
| * |
| * @param stream the document to be parsed |
| * @return extracted text content |
| * @throws IOException if the document can not be read or parsed |
| */ |
| public Reader parse(InputStream stream) throws IOException { |
| return parse(stream, new Metadata()); |
| } |
| |
| /** |
| * Parses the file at the given path and returns the extracted text content. |
| * <p> |
| * Metadata information extracted from the document is returned in |
| * the supplied metadata instance. |
| * |
| * @param path the path of the file to be parsed |
| * @param metadata where document's metadata will be populated |
| * @return extracted text content |
| * @throws IOException if the file can not be read or parsed |
| */ |
| public Reader parse(Path path, Metadata metadata) throws IOException { |
| InputStream stream = TikaInputStream.get(path, metadata); |
| return parse(stream, metadata); |
| } |
| |
| /** |
| * Parses the file at the given path and returns the extracted text content. |
| * |
| * @param path the path of the file to be parsed |
| * @return extracted text content |
| * @throws IOException if the file can not be read or parsed |
| */ |
| public Reader parse(Path path) throws IOException { |
| return parse(path, new Metadata()); |
| } |
| |
| /** |
| * Parses the given file and returns the extracted text content. |
| * <p> |
| * Metadata information extracted from the document is returned in |
| * the supplied metadata instance. |
| * |
| * @param file the file to be parsed |
| * @param metadata where document's metadata will be populated |
| * @return extracted text content |
| * @throws IOException if the file can not be read or parsed |
| * @see #parse(Path) |
| */ |
| public Reader parse(File file, Metadata metadata) throws IOException { |
| @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata); |
| return parse(stream, metadata); |
| } |
| |
| /** |
| * Parses the given file and returns the extracted text content. |
| * |
| * @param file the file to be parsed |
| * @return extracted text content |
| * @throws IOException if the file can not be read or parsed |
| * @see #parse(Path) |
| */ |
| public Reader parse(File file) throws IOException { |
| return parse(file, new Metadata()); |
| } |
| |
| /** |
| * Parses the resource at the given URL and returns the extracted |
| * text content. |
| * |
| * @param url the URL of the resource to be parsed |
| * @return extracted text content |
| * @throws IOException if the resource can not be read or parsed |
| */ |
| public Reader parse(URL url) throws IOException { |
| Metadata metadata = new Metadata(); |
| InputStream stream = TikaInputStream.get(url, metadata); |
| return parse(stream, metadata); |
| } |
| |
| /** |
| * Parses the given document and returns the extracted text content. |
| * The given input stream is closed by this method. |
| * <p> |
| * To avoid unpredictable excess memory use, the returned string contains |
| * only up to {@link #getMaxStringLength()} first characters extracted |
| * from the input document. Use the {@link #setMaxStringLength(int)} |
| * method to adjust this limitation. |
| * <p> |
| * <strong>NOTE:</strong> Unlike most other Tika methods that take an |
| * {@link InputStream}, this method will close the given stream for |
| * you as a convenience. With other methods you are still responsible |
| * for closing the stream or a wrapper instance returned by Tika. |
| * |
| * @param stream the document to be parsed |
| * @param metadata document metadata |
| * @return extracted text content |
| * @throws IOException if the document can not be read |
| * @throws TikaException if the document can not be parsed |
| */ |
| public String parseToString(InputStream stream, Metadata metadata) |
| throws IOException, TikaException { |
| return parseToString(stream, metadata, maxStringLength); |
| } |
| |
| /** |
| * Parses the given document and returns the extracted text content. |
| * The given input stream is closed by this method. This method lets |
| * you control the maxStringLength per call. |
| * <p> |
| * To avoid unpredictable excess memory use, the returned string contains |
| * only up to maxLength (parameter) first characters extracted |
| * from the input document. |
| * <p> |
| * <strong>NOTE:</strong> Unlike most other Tika methods that take an |
| * {@link InputStream}, this method will close the given stream for |
| * you as a convenience. With other methods you are still responsible |
| * for closing the stream or a wrapper instance returned by Tika. |
| * |
| * @param stream the document to be parsed |
| * @param metadata document metadata |
| * @param maxLength maximum length of the returned string |
| * @return extracted text content |
| * @throws IOException if the document can not be read |
| * @throws TikaException if the document can not be parsed |
| */ |
| public String parseToString(InputStream stream, Metadata metadata, int maxLength) |
| throws IOException, TikaException { |
| WriteOutContentHandler handler = new WriteOutContentHandler(maxLength); |
| try { |
| ParseContext context = new ParseContext(); |
| context.set(Parser.class, parser); |
| parser.parse(stream, new BodyContentHandler(handler), metadata, context); |
| } catch (SAXException e) { |
| if (!WriteLimitReachedException.isWriteLimitReached(e)) { |
| // This should never happen with BodyContentHandler... |
| throw new TikaException("Unexpected SAX processing failure", e); |
| } |
| } finally { |
| stream.close(); |
| } |
| return handler.toString(); |
| } |
| |
| /** |
| * Parses the given document and returns the extracted text content. |
| * The given input stream is closed by this method. |
| * <p> |
| * To avoid unpredictable excess memory use, the returned string contains |
| * only up to {@link #getMaxStringLength()} first characters extracted |
| * from the input document. Use the {@link #setMaxStringLength(int)} |
| * method to adjust this limitation. |
| * <p> |
| * <strong>NOTE:</strong> Unlike most other Tika methods that take an |
| * {@link InputStream}, this method will close the given stream for |
| * you as a convenience. With other methods you are still responsible |
| * for closing the stream or a wrapper instance returned by Tika. |
| * |
| * @param stream the document to be parsed |
| * @return extracted text content |
| * @throws IOException if the document can not be read |
| * @throws TikaException if the document can not be parsed |
| */ |
| public String parseToString(InputStream stream) throws IOException, TikaException { |
| return parseToString(stream, new Metadata()); |
| } |
| |
| /** |
| * Parses the file at the given path and returns the extracted text content. |
| * <p> |
| * To avoid unpredictable excess memory use, the returned string contains |
| * only up to {@link #getMaxStringLength()} first characters extracted |
| * from the input document. Use the {@link #setMaxStringLength(int)} |
| * method to adjust this limitation. |
| * |
| * @param path the path of the file to be parsed |
| * @return extracted text content |
| * @throws IOException if the file can not be read |
| * @throws TikaException if the file can not be parsed |
| */ |
| public String parseToString(Path path) throws IOException, TikaException { |
| Metadata metadata = new Metadata(); |
| InputStream stream = TikaInputStream.get(path, metadata); |
| return parseToString(stream, metadata); |
| } |
| |
| /** |
| * Parses the given file and returns the extracted text content. |
| * <p> |
| * To avoid unpredictable excess memory use, the returned string contains |
| * only up to {@link #getMaxStringLength()} first characters extracted |
| * from the input document. Use the {@link #setMaxStringLength(int)} |
| * method to adjust this limitation. |
| * |
| * @param file the file to be parsed |
| * @return extracted text content |
| * @throws IOException if the file can not be read |
| * @throws TikaException if the file can not be parsed |
| * @see #parseToString(Path) |
| */ |
| public String parseToString(File file) throws IOException, TikaException { |
| Metadata metadata = new Metadata(); |
| @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata); |
| return parseToString(stream, metadata); |
| } |
| |
| /** |
| * Parses the resource at the given URL and returns the extracted |
| * text content. |
| * <p> |
| * To avoid unpredictable excess memory use, the returned string contains |
| * only up to {@link #getMaxStringLength()} first characters extracted |
| * from the input document. Use the {@link #setMaxStringLength(int)} |
| * method to adjust this limitation. |
| * |
| * @param url the URL of the resource to be parsed |
| * @return extracted text content |
| * @throws IOException if the resource can not be read |
| * @throws TikaException if the resource can not be parsed |
| */ |
| public String parseToString(URL url) throws IOException, TikaException { |
| Metadata metadata = new Metadata(); |
| InputStream stream = TikaInputStream.get(url, metadata); |
| return parseToString(stream, metadata); |
| } |
| |
| /** |
| * Returns the maximum length of strings returned by the |
| * parseToString methods. |
| * |
| * @return maximum string length, or -1 if the limit has been disabled |
| * @since Apache Tika 0.7 |
| */ |
| public int getMaxStringLength() { |
| return maxStringLength; |
| } |
| |
| /** |
| * Sets the maximum length of strings returned by the parseToString |
| * methods. |
| * |
| * @param maxStringLength maximum string length, |
| * or -1 to disable this limit |
| * @since Apache Tika 0.7 |
| */ |
| public void setMaxStringLength(int maxStringLength) { |
| this.maxStringLength = maxStringLength; |
| } |
| |
| /** |
| * Returns the parser instance used by this facade. |
| * |
| * @return parser instance |
| * @since Apache Tika 0.10 |
| */ |
| public Parser getParser() { |
| return parser; |
| } |
| |
| /** |
| * Returns the detector instance used by this facade. |
| * |
| * @return detector instance |
| * @since Apache Tika 0.10 |
| */ |
| public Detector getDetector() { |
| return detector; |
| } |
| |
| /** |
| * Returns the translator instance used by this facade. |
| * |
| * @return translator instance |
| * @since Tika 1.6 |
| */ |
| public Translator getTranslator() { |
| return translator; |
| } |
| |
| //--------------------------------------------------------------< Object > |
| |
| public String toString() { |
| String version = null; |
| |
| try (InputStream stream = Tika.class |
| .getResourceAsStream("/META-INF/maven/org.apache.tika/tika-core/pom.properties")) { |
| if (stream != null) { |
| Properties properties = new Properties(); |
| properties.load(stream); |
| version = properties.getProperty("version"); |
| } |
| } catch (Exception ignore) { |
| } |
| |
| if (version != null) { |
| return "Apache Tika " + version; |
| } else { |
| return "Apache Tika"; |
| } |
| } |
| |
| } |