tika-core: reformat with google-java-format (AOSP style) and add git-code-format-maven-plugin pre-commit/verify hooks
diff --git a/.mvn/jvm.config b/.mvn/jvm.config
new file mode 100644
index 0000000..e2a50e0
--- /dev/null
+++ b/.mvn/jvm.config
@@ -0,0 +1 @@
+--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED --add-exports jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED --add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED --add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED --add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED
\ No newline at end of file
diff --git a/tika-core/pom.xml b/tika-core/pom.xml
index 8426ac5..1fbef62 100644
--- a/tika-core/pom.xml
+++ b/tika-core/pom.xml
@@ -34,6 +34,10 @@
<name>Apache Tika core</name>
<url>https://tika.apache.org/</url>
+ <properties>
+ <git-code-format-maven-plugin.version>5.3</git-code-format-maven-plugin.version>
+ </properties>
+
<dependencies>
<!-- See https://issues.apache.org/jira/browse/TIKA-2566 for more info -->
<dependency>
@@ -131,6 +135,46 @@
</executions>
</plugin>
<plugin>
+ <groupId>com.cosium.code</groupId>
+ <artifactId>git-code-format-maven-plugin</artifactId>
+ <version>${git-code-format-maven-plugin.version}</version>
+ <executions>
+ <!-- On commit, format the modified java files -->
+ <execution>
+ <id>install-formatter-hook</id>
+ <goals>
+ <goal>install-hooks</goal>
+ </goals>
+ </execution>
+ <!-- On Maven verify phase, fail if any file (including
+ unmodified)
+ is badly formatted -->
+ <execution>
+ <id>validate-code-format</id>
+ <goals>
+ <goal>validate-code-format</goal>
+ </goals>
+ </execution>
+ </executions>
+ <dependencies>
+ <!-- Enable https://github.com/google/google-java-format -->
+ <dependency>
+ <groupId>com.cosium.code</groupId>
+ <artifactId>google-java-format</artifactId>
+ <version>${git-code-format-maven-plugin.version}</version>
+ </dependency>
+ </dependencies>
+ <configuration>
+ <formatterOptions>
+ <googleJavaFormat.aosp>true</googleJavaFormat.aosp>
+ <googleJavaFormat.fixImportsOnly>false</googleJavaFormat.fixImportsOnly>
+ <googleJavaFormat.skipSortingImports>false</googleJavaFormat.skipSortingImports>
+ <googleJavaFormat.skipRemovingUnusedImports>false</googleJavaFormat.skipRemovingUnusedImports>
+ </formatterOptions>
+ </configuration>
+ </plugin>
+
+ <plugin>
<groupId>org.apache.felix</groupId>
<artifactId>maven-bundle-plugin</artifactId>
<version>${maven.bundle.version}</version>
diff --git a/tika-core/src/main/java/org/apache/tika/Tika.java b/tika-core/src/main/java/org/apache/tika/Tika.java
index 22811f9..23f4769 100644
--- a/tika-core/src/main/java/org/apache/tika/Tika.java
+++ b/tika-core/src/main/java/org/apache/tika/Tika.java
@@ -24,9 +24,6 @@
import java.net.URL;
import java.nio.file.Path;
import java.util.Properties;
-
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
@@ -41,11 +38,12 @@
import org.apache.tika.parser.ParsingReader;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
+import org.xml.sax.SAXException;
/**
- * Facade class for accessing Tika functionality. This class hides much of
- * the underlying complexity of the lower level Tika classes and provides
- * simple methods for many common parsing and type detection operations.
+ * Facade class for accessing Tika functionality. This class hides much of the underlying complexity
+ * of the lower level Tika classes and provides simple methods for many common parsing and type
+ * detection operations.
*
* @see Parser
* @see Detector
@@ -53,25 +51,18 @@
*/
public class Tika {
- /**
- * The detector instance used by this facade.
- */
+ /** The detector instance used by this facade. */
private final Detector detector;
- /**
- * The parser instance used by this facade.
- */
+ /** The parser instance used by this facade. */
private final Parser parser;
- /**
- * The Translator instance used by this facade.
- */
+ /** The Translator instance used by this facade. */
private final Translator translator;
/**
- * Maximum length of the strings returned by the parseToString methods.
- * Used to prevent out of memory problems with huge input documents.
- * The default setting is 100k characters.
+ * Maximum length of the strings returned by the parseToString methods. Used to prevent out of
+ * memory problems with huge input documents. The default setting is 100k characters.
*/
private int maxStringLength = 100 * 1000;
@@ -80,7 +71,7 @@
* Translator.
*
* @param detector type detector
- * @param parser document parser
+ * @param parser document parser
* @since Apache Tika 0.8
*/
public Tika(Detector detector, Parser parser) {
@@ -92,8 +83,8 @@
/**
* Creates a Tika facade using the given detector, parser, and translator instances.
*
- * @param detector type detector
- * @param parser document parser
+ * @param detector type detector
+ * @param parser document parser
* @param translator text translator
* @since Apache Tika 1.6
*/
@@ -112,16 +103,14 @@
this(config.getDetector(), new AutoDetectParser(config), config.getTranslator());
}
- /**
- * Creates a Tika facade using the default configuration.
- */
+ /** Creates a Tika facade using the default configuration. */
public Tika() {
this(TikaConfig.getDefaultConfig());
}
/**
- * Creates a Tika facade using the given detector instance, the
- * default parser configuration, and the default Translator.
+ * Creates a Tika facade using the given detector instance, the default parser configuration,
+ * and the default Translator.
*
* @param detector type detector
* @since Apache Tika 0.8
@@ -130,25 +119,21 @@
this(detector, new AutoDetectParser(detector));
}
-
/**
- * Detects the media type of the given document. The type detection is
- * based on the content of the given document stream and any given
- * document metadata. The document stream can be <code>null</code>,
- * in which case only the given document metadata is used for type
- * detection.
- * <p>
- * If the document stream supports the
- * {@link InputStream#markSupported() mark feature}, then the stream is
- * marked and reset to the original position before this method returns.
- * Only a limited number of bytes are read from the stream.
- * <p>
- * The given document stream is <em>not</em> closed by this method.
- * <p>
- * Unlike in the {@link #parse(InputStream, Metadata)} method, the
- * given document metadata is <em>not</em> modified by this method.
+ * Detects the media type of the given document. The type detection is based on the content of
+ * the given document stream and any given document metadata. The document stream can be <code>
+ * null</code>, in which case only the given document metadata is used for type detection.
*
- * @param stream the document stream, or <code>null</code>
+ * <p>If the document stream supports the {@link InputStream#markSupported() mark feature}, then
+ * the stream is marked and reset to the original position before this method returns. Only a
+ * limited number of bytes are read from the stream.
+ *
+ * <p>The given document stream is <em>not</em> closed by this method.
+ *
+ * <p>Unlike in the {@link #parse(InputStream, Metadata)} method, the given document metadata is
+ * <em>not</em> modified by this method.
+ *
+ * @param stream the document stream, or <code>null</code>
* @param metadata document metadata
* @return detected media type
* @throws IOException if the stream can not be read
@@ -162,19 +147,17 @@
}
/**
- * Detects the media type of the given document. The type detection is
- * based on the content of the given document stream and the name of the
- * document.
- * <p>
- * If the document stream supports the
- * {@link InputStream#markSupported() mark feature}, then the stream is
- * marked and reset to the original position before this method returns.
- * Only a limited number of bytes are read from the stream.
- * <p>
- * The given document stream is <em>not</em> closed by this method.
+ * Detects the media type of the given document. The type detection is based on the content of
+ * the given document stream and the name of the document.
+ *
+ * <p>If the document stream supports the {@link InputStream#markSupported() mark feature}, then
+ * the stream is marked and reset to the original position before this method returns. Only a
+ * limited number of bytes are read from the stream.
+ *
+ * <p>The given document stream is <em>not</em> closed by this method.
*
* @param stream the document stream
- * @param name document name
+ * @param name document name
* @return detected media type
* @throws IOException if the stream can not be read
* @since Apache Tika 0.9
@@ -186,15 +169,14 @@
}
/**
- * Detects the media type of the given document. The type detection is
- * based on the content of the given document stream.
- * <p>
- * If the document stream supports the
- * {@link InputStream#markSupported() mark feature}, then the stream is
- * marked and reset to the original position before this method returns.
- * Only a limited number of bytes are read from the stream.
- * <p>
- * The given document stream is <em>not</em> closed by this method.
+ * Detects the media type of the given document. The type detection is based on the content of
+ * the given document stream.
+ *
+ * <p>If the document stream supports the {@link InputStream#markSupported() mark feature}, then
+ * the stream is marked and reset to the original position before this method returns. Only a
+ * limited number of bytes are read from the stream.
+ *
+ * <p>The given document stream is <em>not</em> closed by this method.
*
* @param stream the document stream
* @return detected media type
@@ -205,16 +187,15 @@
}
/**
- * Detects the media type of the given document. The type detection is
- * based on the first few bytes of a document and the document name.
- * <p>
- * For best results at least a few kilobytes of the document data
- * are needed. See also the other detect() methods for better
- * alternatives when you have more than just the document prefix
- * available for type detection.
+ * Detects the media type of the given document. The type detection is based on the first few
+ * bytes of a document and the document name.
+ *
+ * <p>For best results at least a few kilobytes of the document data are needed. See also the
+ * other detect() methods for better alternatives when you have more than just the document
+ * prefix available for type detection.
*
* @param prefix first few bytes of the document
- * @param name document name
+ * @param name document name
* @return detected media type
* @since Apache Tika 0.9
*/
@@ -229,13 +210,12 @@
}
/**
- * Detects the media type of the given document. The type detection is
- * based on the first few bytes of a document.
- * <p>
- * For best results at least a few kilobytes of the document data
- * are needed. See also the other detect() methods for better
- * alternatives when you have more than just the document prefix
- * available for type detection.
+ * Detects the media type of the given document. The type detection is based on the first few
+ * bytes of a document.
+ *
+ * <p>For best results at least a few kilobytes of the document data are needed. See also the
+ * other detect() methods for better alternatives when you have more than just the document
+ * prefix available for type detection.
*
* @param prefix first few bytes of the document
* @return detected media type
@@ -252,12 +232,11 @@
}
/**
- * Detects the media type of the file at the given path. The type
- * detection is based on the document content and a potential known
- * file extension.
- * <p>
- * Use the {@link #detect(String)} method when you want to detect the
- * type of the document without actually accessing the file.
+ * Detects the media type of the file at the given path. The type detection is based on the
+ * document content and a potential known file extension.
+ *
+ * <p>Use the {@link #detect(String)} method when you want to detect the type of the document
+ * without actually accessing the file.
*
* @param path the path of the file
* @return detected media type
@@ -271,11 +250,11 @@
}
/**
- * Detects the media type of the given file. The type detection is
- * based on the document content and a potential known file extension.
- * <p>
- * Use the {@link #detect(String)} method when you want to detect the
- * type of the document without actually accessing the file.
+ * Detects the media type of the given file. The type detection is based on the document content
+ * and a potential known file extension.
+ *
+ * <p>Use the {@link #detect(String)} method when you want to detect the type of the document
+ * without actually accessing the file.
*
* @param file the file
* @return detected media type
@@ -284,19 +263,18 @@
*/
public String detect(File file) throws IOException {
Metadata metadata = new Metadata();
- try (@SuppressWarnings("deprecation") InputStream stream = TikaInputStream
- .get(file, metadata)) {
+ try (@SuppressWarnings("deprecation")
+ InputStream stream = TikaInputStream.get(file, metadata)) {
return detect(stream, metadata);
}
}
/**
- * Detects the media type of the resource at the given URL. The type
- * detection is based on the document content and a potential known
- * file extension included in the URL.
- * <p>
- * Use the {@link #detect(String)} method when you want to detect the
- * type of the document without actually accessing the URL.
+ * Detects the media type of the resource at the given URL. The type detection is based on the
+ * document content and a potential known file extension included in the URL.
+ *
+ * <p>Use the {@link #detect(String)} method when you want to detect the type of the document
+ * without actually accessing the URL.
*
* @param url the URL of the resource
* @return detected media type
@@ -310,11 +288,11 @@
}
/**
- * Detects the media type of a document with the given file name.
- * The type detection is based on known file name extensions.
- * <p>
- * The given name can also be a URL or a full file path. In such cases
- * only the file name part of the string is used for type detection.
+ * Detects the media type of a document with the given file name. The type detection is based on
+ * known file name extensions.
+ *
+ * <p>The given name can also be a URL or a full file path. In such cases only the file name
+ * part of the string is used for type detection.
*
* @param name the file name of the document
* @return detected media type
@@ -330,11 +308,11 @@
/**
* Translate the given text String to and from the given languages.
*
- * @param text The text to translate.
+ * @param text The text to translate.
* @param sourceLanguage The input text language (for example, "hi").
* @param targetLanguage The desired output language (for example, "fr").
- * @return The translated text. If translation is unavailable (client keys not set), returns
- * the same text back.
+ * @return The translated text. If translation is unavailable (client keys not set), returns the
+ * same text back.
* @see org.apache.tika.language.translate.Translator
*/
public String translate(String text, String sourceLanguage, String targetLanguage) {
@@ -346,13 +324,13 @@
}
/**
- * Translate the given text String to the given language, attempting to auto-detect the
- * source language.
+ * Translate the given text String to the given language, attempting to auto-detect the source
+ * language.
*
- * @param text The text to translate.
+ * @param text The text to translate.
* @param targetLanguage The desired output language (for example, "en").
- * @return The translated text. If translation is unavailable (client keys not set), returns
- * the same text back.
+ * @return The translated text. If translation is unavailable (client keys not set), returns the
+ * same text back.
* @see org.apache.tika.language.translate.Translator
*/
public String translate(String text, String targetLanguage) {
@@ -363,18 +341,16 @@
}
}
-
/**
- * Parses the given document and returns the extracted text content.
- * Input metadata like a file name or a content type hint can be passed
- * in the given metadata instance. Metadata information extracted from
- * the document is returned in that same metadata instance.
- * <p>
- * The returned reader will be responsible for closing the given stream.
- * The stream and any associated resources will be closed at or before
- * the time when the {@link Reader#close()} method is called.
+ * Parses the given document and returns the extracted text content. Input metadata like a file
+ * name or a content type hint can be passed in the given metadata instance. Metadata
+ * information extracted from the document is returned in that same metadata instance.
*
- * @param stream the document to be parsed
+ * <p>The returned reader will be responsible for closing the given stream. The stream and any
+ * associated resources will be closed at or before the time when the {@link Reader#close()}
+ * method is called.
+ *
+ * @param stream the document to be parsed
* @param metadata where document's metadata will be populated
* @return extracted text content
* @throws IOException if the document can not be read or parsed
@@ -387,10 +363,10 @@
/**
* Parses the given document and returns the extracted text content.
- * <p>
- * The returned reader will be responsible for closing the given stream.
- * The stream and any associated resources will be closed at or before
- * the time when the {@link Reader#close()} method is called.
+ *
+ * <p>The returned reader will be responsible for closing the given stream. The stream and any
+ * associated resources will be closed at or before the time when the {@link Reader#close()}
+ * method is called.
*
* @param stream the document to be parsed
* @return extracted text content
@@ -402,11 +378,11 @@
/**
* Parses the file at the given path and returns the extracted text content.
- * <p>
- * Metadata information extracted from the document is returned in
- * the supplied metadata instance.
*
- * @param path the path of the file to be parsed
+ * <p>Metadata information extracted from the document is returned in the supplied metadata
+ * instance.
+ *
+ * @param path the path of the file to be parsed
* @param metadata where document's metadata will be populated
* @return extracted text content
* @throws IOException if the file can not be read or parsed
@@ -429,18 +405,19 @@
/**
* Parses the given file and returns the extracted text content.
- * <p>
- * Metadata information extracted from the document is returned in
- * the supplied metadata instance.
*
- * @param file the file to be parsed
+ * <p>Metadata information extracted from the document is returned in the supplied metadata
+ * instance.
+ *
+ * @param file the file to be parsed
* @param metadata where document's metadata will be populated
* @return extracted text content
* @throws IOException if the file can not be read or parsed
* @see #parse(Path)
*/
public Reader parse(File file, Metadata metadata) throws IOException {
- @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata);
+ @SuppressWarnings("deprecation")
+ InputStream stream = TikaInputStream.get(file, metadata);
return parse(stream, metadata);
}
@@ -457,8 +434,7 @@
}
/**
- * Parses the resource at the given URL and returns the extracted
- * text content.
+ * Parses the resource at the given URL and returns the extracted text content.
*
* @param url the URL of the resource to be parsed
* @return extracted text content
@@ -471,23 +447,21 @@
}
/**
- * Parses the given document and returns the extracted text content.
- * The given input stream is closed by this method.
- * <p>
- * To avoid unpredictable excess memory use, the returned string contains
- * only up to {@link #getMaxStringLength()} first characters extracted
- * from the input document. Use the {@link #setMaxStringLength(int)}
- * method to adjust this limitation.
- * <p>
- * <strong>NOTE:</strong> Unlike most other Tika methods that take an
- * {@link InputStream}, this method will close the given stream for
- * you as a convenience. With other methods you are still responsible
- * for closing the stream or a wrapper instance returned by Tika.
+ * Parses the given document and returns the extracted text content. The given input stream is
+ * closed by this method.
*
- * @param stream the document to be parsed
+ * <p>To avoid unpredictable excess memory use, the returned string contains only up to {@link
+ * #getMaxStringLength()} first characters extracted from the input document. Use the {@link
+ * #setMaxStringLength(int)} method to adjust this limitation.
+ *
+ * <p><strong>NOTE:</strong> Unlike most other Tika methods that take an {@link InputStream},
+ * this method will close the given stream for you as a convenience. With other methods you are
+ * still responsible for closing the stream or a wrapper instance returned by Tika.
+ *
+ * @param stream the document to be parsed
* @param metadata document metadata
* @return extracted text content
- * @throws IOException if the document can not be read
+ * @throws IOException if the document can not be read
* @throws TikaException if the document can not be parsed
*/
public String parseToString(InputStream stream, Metadata metadata)
@@ -496,24 +470,21 @@
}
/**
- * Parses the given document and returns the extracted text content.
- * The given input stream is closed by this method. This method lets
- * you control the maxStringLength per call.
- * <p>
- * To avoid unpredictable excess memory use, the returned string contains
- * only up to maxLength (parameter) first characters extracted
- * from the input document.
- * <p>
- * <strong>NOTE:</strong> Unlike most other Tika methods that take an
- * {@link InputStream}, this method will close the given stream for
- * you as a convenience. With other methods you are still responsible
- * for closing the stream or a wrapper instance returned by Tika.
+ * Parses the given document and returns the extracted text content. The given input stream is
+ * closed by this method. This method lets you control the maxStringLength per call.
*
- * @param stream the document to be parsed
- * @param metadata document metadata
+ * <p>To avoid unpredictable excess memory use, the returned string contains only up to
+ * maxLength (parameter) first characters extracted from the input document.
+ *
+ * <p><strong>NOTE:</strong> Unlike most other Tika methods that take an {@link InputStream},
+ * this method will close the given stream for you as a convenience. With other methods you are
+ * still responsible for closing the stream or a wrapper instance returned by Tika.
+ *
+ * @param stream the document to be parsed
+ * @param metadata document metadata
* @param maxLength maximum length of the returned string
* @return extracted text content
- * @throws IOException if the document can not be read
+ * @throws IOException if the document can not be read
* @throws TikaException if the document can not be parsed
*/
public String parseToString(InputStream stream, Metadata metadata, int maxLength)
@@ -535,22 +506,20 @@
}
/**
- * Parses the given document and returns the extracted text content.
- * The given input stream is closed by this method.
- * <p>
- * To avoid unpredictable excess memory use, the returned string contains
- * only up to {@link #getMaxStringLength()} first characters extracted
- * from the input document. Use the {@link #setMaxStringLength(int)}
- * method to adjust this limitation.
- * <p>
- * <strong>NOTE:</strong> Unlike most other Tika methods that take an
- * {@link InputStream}, this method will close the given stream for
- * you as a convenience. With other methods you are still responsible
- * for closing the stream or a wrapper instance returned by Tika.
+ * Parses the given document and returns the extracted text content. The given input stream is
+ * closed by this method.
+ *
+ * <p>To avoid unpredictable excess memory use, the returned string contains only up to {@link
+ * #getMaxStringLength()} first characters extracted from the input document. Use the {@link
+ * #setMaxStringLength(int)} method to adjust this limitation.
+ *
+ * <p><strong>NOTE:</strong> Unlike most other Tika methods that take an {@link InputStream},
+ * this method will close the given stream for you as a convenience. With other methods you are
+ * still responsible for closing the stream or a wrapper instance returned by Tika.
*
* @param stream the document to be parsed
* @return extracted text content
- * @throws IOException if the document can not be read
+ * @throws IOException if the document can not be read
* @throws TikaException if the document can not be parsed
*/
public String parseToString(InputStream stream) throws IOException, TikaException {
@@ -559,15 +528,14 @@
/**
* Parses the file at the given path and returns the extracted text content.
- * <p>
- * To avoid unpredictable excess memory use, the returned string contains
- * only up to {@link #getMaxStringLength()} first characters extracted
- * from the input document. Use the {@link #setMaxStringLength(int)}
- * method to adjust this limitation.
+ *
+ * <p>To avoid unpredictable excess memory use, the returned string contains only up to {@link
+ * #getMaxStringLength()} first characters extracted from the input document. Use the {@link
+ * #setMaxStringLength(int)} method to adjust this limitation.
*
* @param path the path of the file to be parsed
* @return extracted text content
- * @throws IOException if the file can not be read
+ * @throws IOException if the file can not be read
* @throws TikaException if the file can not be parsed
*/
public String parseToString(Path path) throws IOException, TikaException {
@@ -578,36 +546,34 @@
/**
* Parses the given file and returns the extracted text content.
- * <p>
- * To avoid unpredictable excess memory use, the returned string contains
- * only up to {@link #getMaxStringLength()} first characters extracted
- * from the input document. Use the {@link #setMaxStringLength(int)}
- * method to adjust this limitation.
+ *
+ * <p>To avoid unpredictable excess memory use, the returned string contains only up to {@link
+ * #getMaxStringLength()} first characters extracted from the input document. Use the {@link
+ * #setMaxStringLength(int)} method to adjust this limitation.
*
* @param file the file to be parsed
* @return extracted text content
- * @throws IOException if the file can not be read
+ * @throws IOException if the file can not be read
* @throws TikaException if the file can not be parsed
* @see #parseToString(Path)
*/
public String parseToString(File file) throws IOException, TikaException {
Metadata metadata = new Metadata();
- @SuppressWarnings("deprecation") InputStream stream = TikaInputStream.get(file, metadata);
+ @SuppressWarnings("deprecation")
+ InputStream stream = TikaInputStream.get(file, metadata);
return parseToString(stream, metadata);
}
/**
- * Parses the resource at the given URL and returns the extracted
- * text content.
- * <p>
- * To avoid unpredictable excess memory use, the returned string contains
- * only up to {@link #getMaxStringLength()} first characters extracted
- * from the input document. Use the {@link #setMaxStringLength(int)}
- * method to adjust this limitation.
+ * Parses the resource at the given URL and returns the extracted text content.
+ *
+ * <p>To avoid unpredictable excess memory use, the returned string contains only up to {@link
+ * #getMaxStringLength()} first characters extracted from the input document. Use the {@link
+ * #setMaxStringLength(int)} method to adjust this limitation.
*
* @param url the URL of the resource to be parsed
* @return extracted text content
- * @throws IOException if the resource can not be read
+ * @throws IOException if the resource can not be read
* @throws TikaException if the resource can not be parsed
*/
public String parseToString(URL url) throws IOException, TikaException {
@@ -617,8 +583,7 @@
}
/**
- * Returns the maximum length of strings returned by the
- * parseToString methods.
+ * Returns the maximum length of strings returned by the parseToString methods.
*
* @return maximum string length, or -1 if the limit has been disabled
* @since Apache Tika 0.7
@@ -628,11 +593,9 @@
}
/**
- * Sets the maximum length of strings returned by the parseToString
- * methods.
+ * Sets the maximum length of strings returned by the parseToString methods.
*
- * @param maxStringLength maximum string length,
- * or -1 to disable this limit
+ * @param maxStringLength maximum string length, or -1 to disable this limit
* @since Apache Tika 0.7
*/
public void setMaxStringLength(int maxStringLength) {
@@ -669,7 +632,7 @@
return translator;
}
- //--------------------------------------------------------------< Object >
+ // --------------------------------------------------------------< Object >
public String toString() {
return getString();
@@ -678,8 +641,9 @@
public static String getString() {
String version = null;
- try (InputStream stream = Tika.class
- .getResourceAsStream("/META-INF/maven/org.apache.tika/tika-core/pom.properties")) {
+ try (InputStream stream =
+ Tika.class.getResourceAsStream(
+ "/META-INF/maven/org.apache.tika/tika-core/pom.properties")) {
if (stream != null) {
Properties properties = new Properties();
properties.load(stream);
@@ -694,5 +658,4 @@
return "Apache Tika";
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java b/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
index 1f7c4a0..f3a5ffa 100644
--- a/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
+++ b/tika-core/src/main/java/org/apache/tika/concurrent/ConfigurableThreadPoolExecutor.java
@@ -24,9 +24,8 @@
* @since Apache Tika 1.11
*/
public interface ConfigurableThreadPoolExecutor extends ExecutorService {
-
- public void setMaximumPoolSize(int threads);
-
- public void setCorePoolSize(int threads);
+ public void setMaximumPoolSize(int threads);
+
+ public void setCorePoolSize(int threads);
}
diff --git a/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java b/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
index a4385e2..6a7ee32 100644
--- a/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
+++ b/tika-core/src/main/java/org/apache/tika/concurrent/SimpleThreadPoolExecutor.java
@@ -25,11 +25,16 @@
*
* @since Apache Tika 1.11
*/
-public class SimpleThreadPoolExecutor extends ThreadPoolExecutor implements
- ConfigurableThreadPoolExecutor {
+public class SimpleThreadPoolExecutor extends ThreadPoolExecutor
+ implements ConfigurableThreadPoolExecutor {
public SimpleThreadPoolExecutor() {
- super(1, 2, 0L, TimeUnit.SECONDS, new LinkedBlockingQueue<>(),
+ super(
+ 1,
+ 2,
+ 0L,
+ TimeUnit.SECONDS,
+ new LinkedBlockingQueue<>(),
r -> new Thread(r, "Tika Executor Thread"));
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java b/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java
index 405294f..448f882 100644
--- a/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java
+++ b/tika-core/src/main/java/org/apache/tika/config/ConfigBase.java
@@ -29,23 +29,21 @@
import java.util.Locale;
import java.util.Map;
import java.util.Set;
-
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
-import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.utils.XMLReaderUtils;
-
-
public abstract class ConfigBase {
private static Class[] SUPPORTED_PRIMITIVES =
- new Class[]{String.class, boolean.class, long.class, int.class, double.class,
- float.class};
+ new Class[] {
+ String.class, boolean.class, long.class, int.class, double.class, float.class
+ };
/**
* Use this to build a single class, where the user specifies the instance class, e.g.
@@ -81,8 +79,9 @@
* @throws TikaConfigException
* @throws IOException
*/
- protected static <T> T buildSingle(String itemName, Class<T> itemClass, Element properties,
- T defaultValue) throws TikaConfigException, IOException {
+ protected static <T> T buildSingle(
+ String itemName, Class<T> itemClass, Element properties, T defaultValue)
+ throws TikaConfigException, IOException {
NodeList children = properties.getChildNodes();
T toConfigure = null;
@@ -110,10 +109,9 @@
return toConfigure;
}
-
/**
- * Use this to build a list of components for a composite item (e.g.
- * CompositeMetadataFilter, FetcherManager), each with their own configurations
+ * Use this to build a list of components for a composite item (e.g. CompositeMetadataFilter,
+ * FetcherManager), each with their own configurations
*
* @param compositeElementName
* @param itemName
@@ -121,8 +119,12 @@
* @throws TikaConfigException
* @throws IOException
*/
- protected static <P, T> P buildComposite(String compositeElementName, Class<P> compositeClass,
- String itemName, Class<T> itemClass, InputStream is)
+ protected static <P, T> P buildComposite(
+ String compositeElementName,
+ Class<P> compositeClass,
+ String itemName,
+ Class<T> itemClass,
+ InputStream is)
throws TikaConfigException, IOException {
Element properties = null;
try {
@@ -132,13 +134,16 @@
} catch (TikaException e) {
throw new TikaConfigException("problem loading xml to dom", e);
}
- return buildComposite(compositeElementName, compositeClass, itemName, itemClass,
- properties);
+ return buildComposite(
+ compositeElementName, compositeClass, itemName, itemClass, properties);
}
- protected static <P, T> P buildComposite(String compositeElementName, Class<P> compositeClass,
- String itemName, Class<T> itemClass,
- Element properties)
+ protected static <P, T> P buildComposite(
+ String compositeElementName,
+ Class<P> compositeClass,
+ String itemName,
+ Class<T> itemClass,
+ Element properties)
throws TikaConfigException, IOException {
if (!properties.getLocalName().equals("properties")) {
@@ -159,8 +164,10 @@
P composite = (P) constructor.newInstance(components);
setParams(composite, child, new HashSet<>(), itemName);
return composite;
- } catch (NoSuchMethodException | InvocationTargetException |
- InstantiationException | IllegalAccessException e) {
+ } catch (NoSuchMethodException
+ | InvocationTargetException
+ | InstantiationException
+ | IllegalAccessException e) {
throw new TikaConfigException("can't build composite class", e);
}
}
@@ -168,8 +175,8 @@
throw new TikaConfigException("could not find " + compositeElementName);
}
- private static <T> List<T> loadComposite(Node composite, String itemName,
- Class<? extends T> itemClass)
+ private static <T> List<T> loadComposite(
+ Node composite, String itemName, Class<? extends T> itemClass)
throws TikaConfigException {
NodeList children = composite.getChildNodes();
List<T> items = new ArrayList<>();
@@ -199,14 +206,21 @@
Class clazz = Class.forName(className);
if (!itemClass.isAssignableFrom(clazz)) {
throw new TikaConfigException(
- elementName + " with class name " + className + " must be of type '" +
- itemClass.getName() + "'");
+ elementName
+ + " with class name "
+ + className
+ + " must be of type '"
+ + itemClass.getName()
+ + "'");
}
return (T) clazz.getDeclaredConstructor().newInstance();
- } catch (InstantiationException | IllegalAccessException | ClassNotFoundException |
- NoSuchMethodException | InvocationTargetException e) {
- throw new TikaConfigException("problem loading " + elementName +
- " with class " + itemClass.getName(), e);
+ } catch (InstantiationException
+ | IllegalAccessException
+ | ClassNotFoundException
+ | NoSuchMethodException
+ | InvocationTargetException e) {
+ throw new TikaConfigException(
+ "problem loading " + elementName + " with class " + itemClass.getName(), e);
}
}
@@ -215,8 +229,9 @@
setParams(object, targetNode, settings, null);
}
- private static void setParams(Object object, Node targetNode, Set<String> settings,
- String exceptNodeName) throws TikaConfigException {
+ private static void setParams(
+ Object object, Node targetNode, Set<String> settings, String exceptNodeName)
+ throws TikaConfigException {
NodeList children = targetNode.getChildNodes();
List<Node> params = new ArrayList<>();
for (int i = 0; i < children.getLength(); i++) {
@@ -257,7 +272,7 @@
if (isPrimitive(setterClassPair.itemClass)) {
tryToSetPrimitive(object, setterClassPair, param.getTextContent());
} else {
- //tryToSetPrimitive(object, localName, txt);
+ // tryToSetPrimitive(object, localName, txt);
Object item = buildClass(param, itemName, setterClassPair.itemClass);
setParams(setterClassPair.itemClass.cast(item), param, new HashSet<>());
try {
@@ -298,8 +313,8 @@
private static SetterClassPair findSetterClassPair(Object object, String itemName)
throws TikaConfigException {
- //TODO -- we could do more with info from the node -- is it complex, does it have
- //a text value, does it have a class, etc... This works for now.
+ // TODO -- we could do more with info from the node -- is it complex, does it have
+ // a text value, does it have a class, etc... This works for now.
String setter =
"set" + itemName.substring(0, 1).toUpperCase(Locale.US) + itemName.substring(1);
Class itemClass = null;
@@ -308,7 +323,7 @@
if (setter.equals(method.getName())) {
Class<?>[] classes = method.getParameterTypes();
if (classes.length == 1) {
- //if both setX(String) and setX(Object), prefer setX(String)
+ // if both setX(String) and setX(Object), prefer setX(String)
if (itemClass == null || classes[0].equals(String.class)) {
itemClass = classes[0];
setterMethod = method;
@@ -319,14 +334,14 @@
if (setterMethod != null && itemClass != null) {
return new SetterClassPair(setterMethod, itemClass);
}
- //now try adders
+ // now try adders
String adder =
"add" + itemName.substring(0, 1).toUpperCase(Locale.US) + itemName.substring(1);
for (Method method : object.getClass().getMethods()) {
if (adder.equals(method.getName())) {
Class<?>[] classes = method.getParameterTypes();
if (classes.length == 1) {
- //if both setX(String) and setX(Object), prefer setX(String)
+ // if both setX(String) and setX(Object), prefer setX(String)
if (itemClass == null || classes[0].equals(String.class)) {
itemClass = classes[0];
setterMethod = method;
@@ -336,8 +351,14 @@
}
if (setterMethod == null && itemClass == null) {
throw new TikaConfigException(
- "Couldn't find setter '" + setter + "' or adder '" + adder + "' for " + itemName +
- " of class: " + object.getClass());
+ "Couldn't find setter '"
+ + setter
+ + "' or adder '"
+ + adder
+ + "' for "
+ + itemName
+ + " of class: "
+ + object.getClass());
}
return new SetterClassPair(setterMethod, itemClass);
}
@@ -385,8 +406,10 @@
Method m = object.getClass().getMethod(setter, List.class);
m.invoke(object, items);
- } catch (ClassNotFoundException | InvocationTargetException | NoSuchMethodException |
- IllegalAccessException e) {
+ } catch (ClassNotFoundException
+ | InvocationTargetException
+ | NoSuchMethodException
+ | IllegalAccessException e) {
throw new TikaConfigException("couldn't build class for " + name, e);
}
}
@@ -415,8 +438,8 @@
private static void tryToSetMap(Object object, Node param) throws TikaConfigException {
String name = param.getLocalName();
- //only supports string, string at this point
- //use LinkedHashMap to keep insertion order!
+ // only supports string, string at this point
+ // use LinkedHashMap to keep insertion order!
Map<String, String> map = new LinkedHashMap<>();
NodeList nodeList = param.getChildNodes();
for (int i = 0; i < nodeList.getLength(); i++) {
@@ -450,7 +473,6 @@
}
map.put(key, value);
}
-
}
String setter = "set" + name.substring(0, 1).toUpperCase(Locale.US) + name.substring(1);
try {
@@ -467,11 +489,11 @@
Node n = nodeList.item(i);
if (n.getNodeType() == 1) {
if (n.hasAttributes()) {
- if (n.getAttributes().getNamedItem("from") != null &&
- n.getAttributes().getNamedItem("to") != null) {
+ if (n.getAttributes().getNamedItem("from") != null
+ && n.getAttributes().getNamedItem("to") != null) {
return true;
- } else if (n.getAttributes().getNamedItem("k") != null &&
- n.getAttributes().getNamedItem("v") != null) {
+ } else if (n.getAttributes().getNamedItem("k") != null
+ && n.getAttributes().getNamedItem("v") != null) {
return true;
}
}
@@ -480,8 +502,9 @@
return false;
}
- private static void tryToSetPrimitive(Object object, SetterClassPair setterClassPair,
- String value) throws TikaConfigException {
+ private static void tryToSetPrimitive(
+ Object object, SetterClassPair setterClassPair, String value)
+ throws TikaConfigException {
try {
if (setterClassPair.itemClass == int.class) {
setterClassPair.setterMethod.invoke(object, Integer.parseInt(value));
@@ -501,15 +524,13 @@
}
}
-
/**
- * This should be overridden to do something with the settings
- * after loading the object.
+ * This should be overridden to do something with the settings after loading the object.
*
* @param settings
*/
protected void handleSettings(Set<String> settings) {
- //no-op
+ // no-op
}
/**
@@ -559,8 +580,12 @@
@Override
public String toString() {
- return "SetterClassPair{" + "setterMethod=" + setterMethod + ", itemClass=" +
- itemClass + '}';
+ return "SetterClassPair{"
+ + "setterMethod="
+ + setterMethod
+ + ", itemClass="
+ + itemClass
+ + '}';
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/config/Field.java b/tika-core/src/main/java/org/apache/tika/config/Field.java
index 403ad6d..bd52e5b 100644
--- a/tika-core/src/main/java/org/apache/tika/config/Field.java
+++ b/tika-core/src/main/java/org/apache/tika/config/Field.java
@@ -23,9 +23,8 @@
import java.lang.annotation.Target;
/**
- * Field annotation is a contract for binding {@link Param} value from
- * Tika Configuration to an object.
- * services
+ * Field annotation is a contract for binding a {@link Param} value from Tika Configuration to an
+ * object.
*
* @since Apache Tika 1.14
*/
diff --git a/tika-core/src/main/java/org/apache/tika/config/Initializable.java b/tika-core/src/main/java/org/apache/tika/config/Initializable.java
index f37bdd9..df7a91d 100644
--- a/tika-core/src/main/java/org/apache/tika/config/Initializable.java
+++ b/tika-core/src/main/java/org/apache/tika/config/Initializable.java
@@ -17,18 +17,16 @@
package org.apache.tika.config;
import java.util.Map;
-
import org.apache.tika.exception.TikaConfigException;
/**
- * Components that must do special processing across multiple fields
- * at initialization time should implement this interface.
- * <p>
- * TikaConfig will call initialize on Initializable classes after
- * setting the parameters for non-statically service loaded classes.
- * <p>
- * TikaConfig will call checkInitialization on all Initializables,
- * whether loaded statically
+ * Components that must do special processing across multiple fields at initialization time should
+ * implement this interface.
+ *
+ * <p>TikaConfig will call initialize on Initializable classes after setting the parameters for
+ * non-statically service loaded classes.
+ *
+ * <p>TikaConfig will call checkInitialization on all Initializables, whether loaded statically
*/
public interface Initializable {
@@ -38,15 +36,10 @@
*/
void initialize(Map<String, Param> params) throws TikaConfigException;
-
/**
- * @param problemHandler if there is a problem and no
- * custom initializableProblemHandler has been configured
- * via Initializable parameters,
- * this is called to respond.
+ * @param problemHandler if there is a problem and no custom initializableProblemHandler has
+ * been configured via Initializable parameters, this is called to respond.
* @throws TikaConfigException
*/
void checkInitialization(InitializableProblemHandler problemHandler) throws TikaConfigException;
-
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java b/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java
index fdca690..8f933f3 100644
--- a/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/config/InitializableProblemHandler.java
@@ -16,72 +16,71 @@
*/
package org.apache.tika.config;
-
+import org.apache.tika.exception.TikaConfigException;
import org.slf4j.LoggerFactory;
-import org.apache.tika.exception.TikaConfigException;
-
-
/**
- * This is to be used to handle potential recoverable problems that
- * might arise during initialization.
+ * This is to be used to handle potential recoverable problems that might arise during
+ * initialization.
*/
public interface InitializableProblemHandler {
+ /** Strategy that simply ignores all problems. */
+ InitializableProblemHandler IGNORE =
+ new InitializableProblemHandler() {
+ public void handleInitializableProblem(String className, String message) {}
+
+ @Override
+ public String toString() {
+ return "IGNORE";
+ }
+ };
/**
- * Strategy that simply ignores all problems.
+ * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} created using
+ * the given class name.
*/
- InitializableProblemHandler IGNORE = new InitializableProblemHandler() {
- public void handleInitializableProblem(String className, String message) {
- }
+ InitializableProblemHandler INFO =
+ new InitializableProblemHandler() {
+ public void handleInitializableProblem(String classname, String message) {
+ LoggerFactory.getLogger(classname).info(message);
+ }
- @Override
- public String toString() {
- return "IGNORE";
- }
- };
+ @Override
+ public String toString() {
+ return "INFO";
+ }
+ };
+
/**
- * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger}
- * created using the given class name.
+ * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} created using
+ * the given class name.
*/
- InitializableProblemHandler INFO = new InitializableProblemHandler() {
- public void handleInitializableProblem(String classname, String message) {
- LoggerFactory.getLogger(classname).info(message);
- }
+ InitializableProblemHandler WARN =
+ new InitializableProblemHandler() {
+ public void handleInitializableProblem(String classname, String message) {
+ LoggerFactory.getLogger(classname).warn(message);
+ }
- @Override
- public String toString() {
- return "INFO";
- }
- };
- /**
- * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger}
- * created using the given class name.
- */
- InitializableProblemHandler WARN = new InitializableProblemHandler() {
- public void handleInitializableProblem(String classname, String message) {
- LoggerFactory.getLogger(classname).warn(message);
- }
+ @Override
+ public String toString() {
+ return "WARN";
+ }
+ };
- @Override
- public String toString() {
- return "WARN";
- }
- };
- InitializableProblemHandler THROW = new InitializableProblemHandler() {
- public void handleInitializableProblem(String classname, String message)
- throws TikaConfigException {
- throw new TikaConfigException(message);
- }
+ InitializableProblemHandler THROW =
+ new InitializableProblemHandler() {
+ public void handleInitializableProblem(String classname, String message)
+ throws TikaConfigException {
+ throw new TikaConfigException(message);
+ }
- @Override
- public String toString() {
- return "THROW";
- }
- };
+ @Override
+ public String toString() {
+ return "THROW";
+ }
+ };
InitializableProblemHandler DEFAULT = WARN;
void handleInitializableProblem(String className, String message) throws TikaConfigException;
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java b/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java
index 666c20d..c134d06 100644
--- a/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/config/LoadErrorHandler.java
@@ -16,67 +16,65 @@
*/
package org.apache.tika.config;
-
import org.slf4j.LoggerFactory;
-
/**
- * Interface for error handling strategies in service class loading.
- * You can implement this interface for a custom error handling mechanism,
- * or use one of the predefined strategies.
+ * Interface for error handling strategies in service class loading. You can implement this
+ * interface for a custom error handling mechanism, or use one of the predefined strategies.
*
* @since Apache Tika 0.9
*/
public interface LoadErrorHandler {
+ /** Strategy that simply ignores all problems. */
+ LoadErrorHandler IGNORE =
+ new LoadErrorHandler() {
+ public void handleLoadError(String classname, Throwable throwable) {}
+
+ @Override
+ public String toString() {
+ return "IGNORE";
+ }
+ };
+
/**
- * Strategy that simply ignores all problems.
+ * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger} created using
+ * the given class name.
*/
- LoadErrorHandler IGNORE = new LoadErrorHandler() {
- public void handleLoadError(String classname, Throwable throwable) {
- }
+ LoadErrorHandler WARN =
+ new LoadErrorHandler() {
+ public void handleLoadError(String classname, Throwable throwable) {
+ LoggerFactory.getLogger(classname)
+ .warn("Unable to load {}", classname, throwable);
+ }
- @Override
- public String toString() {
- return "IGNORE";
- }
- };
+ @Override
+ public String toString() {
+ return "WARN";
+ }
+ };
+
/**
- * Strategy that logs warnings of all problems using a {@link org.slf4j.Logger}
- * created using the given class name.
+ * Strategy that throws a {@link RuntimeException} with the given throwable as the root cause,
+ * thus interrupting the entire service loading operation.
*/
- LoadErrorHandler WARN = new LoadErrorHandler() {
- public void handleLoadError(String classname, Throwable throwable) {
- LoggerFactory.getLogger(classname).warn("Unable to load {}", classname, throwable);
- }
+ LoadErrorHandler THROW =
+ new LoadErrorHandler() {
+ public void handleLoadError(String classname, Throwable throwable) {
+ throw new RuntimeException("Unable to load " + classname, throwable);
+ }
- @Override
- public String toString() {
- return "WARN";
- }
- };
- /**
- * Strategy that throws a {@link RuntimeException} with the given
- * throwable as the root cause, thus interrupting the entire service
- * loading operation.
- */
- LoadErrorHandler THROW = new LoadErrorHandler() {
- public void handleLoadError(String classname, Throwable throwable) {
- throw new RuntimeException("Unable to load " + classname, throwable);
- }
-
- @Override
- public String toString() {
- return "THROW";
- }
- };
+ @Override
+ public String toString() {
+ return "THROW";
+ }
+ };
/**
- * Handles a problem encountered when trying to load the specified
- * service class. The implementation can log or otherwise process
- * the given error information. If the method returns normally, then
- * the service loader simply skips this class and continues with the
- * next one.
+ * Handles a problem encountered when trying to load the specified service class. The
+ * implementation can log or otherwise process the given error information. If the method
+ * returns normally, then the service loader simply skips this class and continues with the next
+ * one.
*
* @param classname name of the service class
* @param throwable the encountered problem
diff --git a/tika-core/src/main/java/org/apache/tika/config/Param.java b/tika-core/src/main/java/org/apache/tika/config/Param.java
index 25d367f..0ebbc4a 100644
--- a/tika-core/src/main/java/org/apache/tika/config/Param.java
+++ b/tika-core/src/main/java/org/apache/tika/config/Param.java
@@ -38,24 +38,20 @@
import javax.xml.transform.TransformerException;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
-
+import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.multiple.AbstractMultipleParser;
+import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
-import org.apache.tika.exception.TikaConfigException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.multiple.AbstractMultipleParser;
-import org.apache.tika.utils.XMLReaderUtils;
-
-
/**
* This is a serializable model class for parameters from configuration file.
*
- * @param <T> value type. Should be serializable to string and have a constructor
- * with string param
+ * @param <T> value type. Should be serializable to string and have a constructor with string param
* @since Apache Tika 1.14
*/
public class Param<T> implements Serializable {
@@ -88,7 +84,7 @@
wellKnownMap.put("metadataPolicy", AbstractMultipleParser.MetadataPolicy.class);
}
- //one of these two is used for serialization
+ // one of these two is used for serialization
private final List<String> valueStrings = new ArrayList<>();
private final Map<String, String> valueMap = new LinkedHashMap<>();
@@ -96,8 +92,7 @@
private String name;
private T actualValue;
- public Param() {
- }
+ public Param() {}
public Param(String name, Class<T> type, T value) {
this.name = name;
@@ -106,7 +101,7 @@
if (List.class.isAssignableFrom(value.getClass())) {
this.valueStrings.addAll((List) value);
} else if (Map.class.isAssignableFrom(value.getClass())) {
- valueMap.putAll((Map)value);
+ valueMap.putAll((Map) value);
} else {
this.valueStrings.add(value.toString());
}
@@ -156,8 +151,8 @@
String type = typeAttr.getTextContent();
if ("class".equals(type)) {
if (classAttr == null) {
- throw new TikaConfigException("must specify a class attribute if " +
- "type=\"class\"");
+ throw new TikaConfigException(
+ "must specify a class attribute if " + "type=\"class\"");
}
ret.setType(clazz);
} else {
@@ -180,7 +175,7 @@
} else if (Map.class.isAssignableFrom(ret.type)) {
loadMap(ret, node);
} else {
- //allow the empty string
+ // allow the empty string
String textContent = "";
if (value != null) {
textContent = value.getTextContent();
@@ -190,12 +185,16 @@
}
return ret;
}
- private static <T> void loadObject(Param<T> ret, Node root, Class clazz) throws TikaConfigException {
+
+ private static <T> void loadObject(Param<T> ret, Node root, Class clazz)
+ throws TikaConfigException {
try {
- ret.actualValue = (T)clazz.getDeclaredConstructor().newInstance();
- } catch (InstantiationException | IllegalAccessException | NoSuchMethodException |
- InvocationTargetException e) {
+ ret.actualValue = (T) clazz.getDeclaredConstructor().newInstance();
+ } catch (InstantiationException
+ | IllegalAccessException
+ | NoSuchMethodException
+ | InvocationTargetException e) {
throw new TikaConfigException("can't build class: " + clazz, e);
}
@@ -209,19 +208,23 @@
Param param = load(params.item(j));
Method method = null;
- String methodName = "set" +
- param.getName().substring(0,1).toUpperCase(Locale.US) +
- param.getName().substring(1);
+ String methodName =
+ "set"
+ + param.getName().substring(0, 1).toUpperCase(Locale.US)
+ + param.getName().substring(1);
try {
- method = ret.actualValue.getClass().getMethod(methodName,
- param.getType());
+ method =
+ ret.actualValue
+ .getClass()
+ .getMethod(methodName, param.getType());
} catch (NoSuchMethodException e) {
throw new TikaConfigException("can't find method: " + methodName, e);
}
try {
method.invoke(ret.actualValue, param.getValue());
} catch (IllegalAccessException | InvocationTargetException e) {
- throw new TikaConfigException("can't set param value: " + param.getName(), e);
+ throw new TikaConfigException(
+ "can't set param value: " + param.getName(), e);
}
}
}
@@ -247,10 +250,10 @@
key = child.getLocalName();
value = child.getTextContent();
}
- if (((Map)ret.actualValue).containsKey(key)) {
+ if (((Map) ret.actualValue).containsKey(key)) {
throw new TikaConfigException("Duplicate keys are not allowed: " + key);
}
- ((Map)ret.actualValue).put(key, value);
+ ((Map) ret.actualValue).put(key, value);
ret.valueMap.put(key, value);
}
child = child.getNextSibling();
@@ -293,8 +296,8 @@
constructor.setAccessible(true);
return constructor.newInstance(value);
} catch (NoSuchMethodException e) {
- throw new RuntimeException(type + " doesnt have a constructor that takes String arg",
- e);
+ throw new RuntimeException(
+ type + " doesnt have a constructor that takes String arg", e);
} catch (IllegalAccessException | InstantiationException | InvocationTargetException e) {
throw new RuntimeException(e);
}
@@ -344,13 +347,20 @@
@Override
public String toString() {
- return "Param{" + "name='" + name + '\'' + ", valueStrings='" + valueStrings + '\'' +
- ", actualValue=" + actualValue + '}';
+ return "Param{"
+ + "name='"
+ + name
+ + '\''
+ + ", valueStrings='"
+ + valueStrings
+ + '\''
+ + ", actualValue="
+ + actualValue
+ + '}';
}
public void save(OutputStream stream) throws TransformerException, TikaException {
-
DocumentBuilder builder = XMLReaderUtils.getDocumentBuilder();
Document doc = builder.newDocument();
Element paramEl = doc.createElement("param");
@@ -381,9 +391,9 @@
el.appendChild(item);
}
} else if (Map.class.isAssignableFrom(actualValue.getClass())) {
- for (Object key : ((Map)actualValue).keySet()) {
+ for (Object key : ((Map) actualValue).keySet()) {
String keyString = (String) key;
- String valueString = (String)((Map)actualValue).get(keyString);
+ String valueString = (String) ((Map) actualValue).get(keyString);
Node item = doc.createElement(keyString);
item.setTextContent(valueString);
el.appendChild(item);
@@ -392,5 +402,4 @@
el.setTextContent(valueStrings.get(0));
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/config/ParamField.java b/tika-core/src/main/java/org/apache/tika/config/ParamField.java
index 15e977a..8a9707b 100644
--- a/tika-core/src/main/java/org/apache/tika/config/ParamField.java
+++ b/tika-core/src/main/java/org/apache/tika/config/ParamField.java
@@ -22,12 +22,11 @@
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
-
import org.apache.tika.exception.TikaConfigException;
/**
- * This class stores metdata for {@link Field} annotation are used to map them
- * to {@link Param} at runtime
+ * This class stores metadata for {@link Field} annotations, used to map them to {@link Param} at
+ * runtime
*
* @since Apache Tika 1.14
*/
@@ -35,18 +34,20 @@
public static final String DEFAULT = "#default";
- //NOTE: since (primitive type) is NOT AssignableFrom (BoxedType),
+ // NOTE: since (primitive type) is NOT AssignableFrom (BoxedType),
// we just use boxed type for everything!
// Example : short.class.isAssignableFrom(Short.class) ? false
private static final Map<Class<?>, Class<?>> PRIMITIVE_MAP =
- new HashMap<Class<?>, Class<?>>() {{
+ new HashMap<Class<?>, Class<?>>() {
+ {
put(int.class, Integer.class);
put(short.class, Short.class);
put(boolean.class, Boolean.class);
put(long.class, Long.class);
put(float.class, Float.class);
put(double.class, Double.class);
- }};
+ }
+ };
private final String name;
private final Class<?> type;
private final boolean required;
@@ -94,9 +95,9 @@
/**
* Sets given value to the annotated field of bean
*
- * @param bean bean with annotation for field
+ * @param bean bean with annotation for field
* @param value value of field
- * @throws IllegalAccessException when it occurs
+ * @throws IllegalAccessException when it occurs
* @throws InvocationTargetException when it occurs
*/
public void assignValue(Object bean, Object value)
@@ -117,15 +118,17 @@
if (params.length != 1) {
String msg = "Invalid setter method. Must have one and only one parameter. ";
if (setter.getName().startsWith("get")) {
- msg += "Perhaps the annotation is misplaced on " + setter.getName() +
- " while a set'X' is expected?";
+ msg +=
+ "Perhaps the annotation is misplaced on "
+ + setter.getName()
+ + " while a set'X' is expected?";
}
throw new TikaConfigException(msg);
}
type = params[0];
}
if (type.isPrimitive() && PRIMITIVE_MAP.containsKey(type)) {
- type = PRIMITIVE_MAP.get(type); //primitive types have hard time
+ type = PRIMITIVE_MAP.get(type); // primitive types have hard time
}
return type;
}
@@ -138,8 +141,9 @@
} else {
String setterName = setter.getName();
if (setterName.startsWith("set") && setterName.length() > 3) {
- name = setterName.substring(3, 4).toLowerCase(Locale.ROOT) +
- setterName.substring(4);
+ name =
+ setterName.substring(3, 4).toLowerCase(Locale.ROOT)
+ + setterName.substring(4);
} else {
name = setter.getName();
}
@@ -152,7 +156,14 @@
@Override
public String toString() {
- return "ParamField{" + "name='" + name + '\'' + ", type=" + type + ", required=" +
- required + '}';
+ return "ParamField{"
+ + "name='"
+ + name
+ + '\''
+ + ", type="
+ + type
+ + ", required="
+ + required
+ + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
index acc53ca..32e624b 100644
--- a/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
+++ b/tika-core/src/main/java/org/apache/tika/config/ServiceLoader.java
@@ -33,7 +33,6 @@
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
-
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.utils.ServiceLoaderUtils;
@@ -45,32 +44,37 @@
public class ServiceLoader {
/**
- * The dynamic set of services available in an OSGi environment.
- * Managed by the {@link TikaActivator} class and used as an additional
- * source of service instances in the {@link #loadServiceProviders(Class)}
- * method.
+ * The dynamic set of services available in an OSGi environment. Managed by the {@link
+ * TikaActivator} class and used as an additional source of service instances in the {@link
+ * #loadServiceProviders(Class)} method.
*/
private static final Map<Object, RankedService> SERVICES = new HashMap<>();
+
private static final Pattern COMMENT = Pattern.compile("#.*");
private static final Pattern WHITESPACE = Pattern.compile("\\s+");
+
/**
- * The default context class loader to use for all threads, or
- * <code>null</code> to automatically select the context class loader.
+ * The default context class loader to use for all threads, or <code>null</code> to
+ * automatically select the context class loader.
*/
private static volatile ClassLoader CONTEXT_CLASS_LOADER = null;
+
private final ClassLoader loader;
private final LoadErrorHandler handler;
private final InitializableProblemHandler initializableProblemHandler;
private final boolean dynamic;
- public ServiceLoader(ClassLoader loader, LoadErrorHandler handler,
- InitializableProblemHandler initializableProblemHandler, boolean dynamic) {
+ public ServiceLoader(
+ ClassLoader loader,
+ LoadErrorHandler handler,
+ InitializableProblemHandler initializableProblemHandler,
+ boolean dynamic) {
this.loader = loader;
this.handler = handler;
this.initializableProblemHandler = initializableProblemHandler;
this.dynamic = dynamic;
-
}
+
public ServiceLoader(ClassLoader loader, LoadErrorHandler handler, boolean dynamic) {
this(loader, handler, InitializableProblemHandler.WARN, dynamic);
}
@@ -80,24 +84,27 @@
}
public ServiceLoader(ClassLoader loader) {
- this(loader,
- Boolean.getBoolean("org.apache.tika.service.error.warn") ? LoadErrorHandler.WARN :
- LoadErrorHandler.IGNORE);
+ this(
+ loader,
+ Boolean.getBoolean("org.apache.tika.service.error.warn")
+ ? LoadErrorHandler.WARN
+ : LoadErrorHandler.IGNORE);
}
public ServiceLoader() {
- this(getContextClassLoader(),
- Boolean.getBoolean("org.apache.tika.service.error.warn") ? LoadErrorHandler.WARN :
- LoadErrorHandler.IGNORE, true);
+ this(
+ getContextClassLoader(),
+ Boolean.getBoolean("org.apache.tika.service.error.warn")
+ ? LoadErrorHandler.WARN
+ : LoadErrorHandler.IGNORE,
+ true);
}
/**
- * Returns the context class loader of the current thread. If such
- * a class loader is not available, then the loader of this class or
- * finally the system class loader is returned.
+ * Returns the context class loader of the current thread. If such a class loader is not
+ * available, then the loader of this class or finally the system class loader is returned.
*
- * @return context class loader, or <code>null</code> if no loader
- * is available
+ * @return context class loader, or <code>null</code> if no loader is available
* @see <a href="https://issues.apache.org/jira/browse/TIKA-441">TIKA-441</a>
*/
static ClassLoader getContextClassLoader() {
@@ -112,12 +119,11 @@
}
/**
- * Sets the context class loader to use for all threads that access
- * this class. Used for example in an OSGi environment to avoid problems
- * with the default context class loader.
+ * Sets the context class loader to use for all threads that access this class. Used for example
+ * in an OSGi environment to avoid problems with the default context class loader.
*
- * @param loader default context class loader,
- * or <code>null</code> to automatically pick the loader
+ * @param loader default context class loader, or <code>null</code> to automatically pick the
+ * loader
*/
public static void setContextClassLoader(ClassLoader loader) {
CONTEXT_CLASS_LOADER = loader;
@@ -166,8 +172,7 @@
}
/**
- * Returns an input stream for reading the specified resource from the
- * configured class loader.
+ * Returns an input stream for reading the specified resource from the configured class loader.
*
* @param name resource name
* @return input stream, or <code>null</code> if the resource was not found
@@ -192,18 +197,16 @@
}
/**
- * Loads and returns the named service class that's expected to implement
- * the given interface.
- * <p>
- * Note that this class does not use the {@link LoadErrorHandler}, a
- * {@link ClassNotFoundException} is always returned for unknown
- * classes or classes of the wrong type
+ * Loads and returns the named service class that's expected to implement the given interface.
+ *
+ * <p>Note that this class does not use the {@link LoadErrorHandler}, a {@link
+ * ClassNotFoundException} is always thrown for unknown classes or classes of the wrong type
*
* @param iface service interface
- * @param name service class name
+ * @param name service class name
* @return service class
- * @throws ClassNotFoundException if the service class can not be found
- * or does not implement the given interface
+ * @throws ClassNotFoundException if the service class can not be found or does not implement
+ * the given interface
* @see Class#forName(String, boolean, ClassLoader)
* @since Apache Tika 1.1
*/
@@ -225,10 +228,8 @@
}
/**
- * Returns all the available service resources matching the
- * given pattern, such as all instances of tika-mimetypes.xml
- * on the classpath, or all org.apache.tika.parser.Parser
- * service files.
+ * Returns all the available service resources matching the given pattern, such as all instances
+ * of tika-mimetypes.xml on the classpath, or all org.apache.tika.parser.Parser service files.
*/
public Enumeration<URL> findServiceResources(String filePattern) {
try {
@@ -243,7 +244,7 @@
/**
* Returns all the available service providers of the given type.
*
- * As of versions after 2.4.1, this removes duplicate classes
+ * <p>As of versions after 2.4.1, this removes duplicate classes
*
* @param iface service provider interface
* @return available service providers
@@ -256,7 +257,7 @@
List<T> providers = new ArrayList<>();
Set<String> seen = new HashSet<>();
for (T provider : tmp) {
- if (! seen.contains(provider.getClass().getCanonicalName())) {
+ if (!seen.contains(provider.getClass().getCanonicalName())) {
providers.add(provider);
seen.add(provider.getClass().getCanonicalName());
}
@@ -265,9 +266,8 @@
}
/**
- * Returns the available dynamic service providers of the given type.
- * The returned list is newly allocated and may be freely modified
- * by the caller.
+ * Returns the available dynamic service providers of the given type. The returned list is newly
+ * allocated and may be freely modified by the caller.
*
* @param iface service provider interface
* @return dynamic service providers
@@ -294,10 +294,9 @@
}
/**
- * Returns the defined static service providers of the given type, without
- * attempting to load them.
- * The providers are loaded using the service provider mechanism using
- * the configured class loader (if any).
+ * Returns the defined static service providers of the given type, without attempting to load
+ * them. The providers are loaded using the service provider mechanism using the configured
+ * class loader (if any).
*
* @param iface service provider interface
* @return static list of uninitialised service providers
@@ -326,19 +325,18 @@
}
/**
- * Returns the available static service providers of the given type.
- * The providers are loaded using the service provider mechanism using
- * the configured class loader (if any). The returned list is newly
- * allocated and may be freely modified by the caller.
+ * Returns the available static service providers of the given type. The providers are loaded
+ * using the service provider mechanism using the configured class loader (if any). The returned
+ * list is newly allocated and may be freely modified by the caller.
*
- * @param iface service provider interface
+ * @param iface service provider interface
* @param excludes -- do not load these classes
* @return static service providers
* @since Apache Tika 1.2
*/
@SuppressWarnings("unchecked")
- public <T> List<T> loadStaticServiceProviders(Class<T> iface,
- Collection<Class<? extends T>> excludes) {
+ public <T> List<T> loadStaticServiceProviders(
+ Class<T> iface, Collection<Class<? extends T>> excludes) {
List<T> providers = new ArrayList<>();
if (loader != null) {
@@ -407,7 +405,5 @@
public int compareTo(RankedService that) {
return that.rank - rank; // highest number first
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java b/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java
index e076f1c..54e05ce 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaActivator.java
@@ -16,6 +16,8 @@
*/
package org.apache.tika.config;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.parser.Parser;
import org.osgi.framework.BundleActivator;
import org.osgi.framework.BundleContext;
import org.osgi.framework.Constants;
@@ -23,17 +25,13 @@
import org.osgi.util.tracker.ServiceTracker;
import org.osgi.util.tracker.ServiceTrackerCustomizer;
-import org.apache.tika.detect.Detector;
-import org.apache.tika.parser.Parser;
-
/**
- * Bundle activator that adjust the class loading mechanism of the
- * {@link ServiceLoader} class to work correctly in an OSGi environment.
- * <p>
- * Note that you should <strong>not</strong> access this class directly.
- * Instead the OSGi environment (if present) will automatically invoke the
- * methods of this class based on the Bundle-Activator setting in the bundle
- * manifest.
+ * Bundle activator that adjusts the class loading mechanism of the {@link ServiceLoader} class to
+ * work correctly in an OSGi environment.
+ *
+ * <p>Note that you should <strong>not</strong> access this class directly. Instead the OSGi
+ * environment (if present) will automatically invoke the methods of this class based on the
+ * Bundle-Activator setting in the bundle manifest.
*
* @since Apache Tika 0.9
*/
@@ -44,7 +42,8 @@
private ServiceTracker parserTracker;
private BundleContext bundleContext;
- //-----------------------------------------------------< BundleActivator >
+
+ // -----------------------------------------------------< BundleActivator >
public void start(final BundleContext context) throws Exception {
bundleContext = context;
@@ -73,12 +72,10 @@
return service;
}
- public void modifiedService(ServiceReference reference, Object service) {
- }
+ public void modifiedService(ServiceReference reference, Object service) {}
public void removedService(ServiceReference reference, Object service) {
ServiceLoader.removeService(reference);
bundleContext.ungetService(reference);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
index e68ad10..7fe2614 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfig.java
@@ -39,15 +39,6 @@
import java.util.concurrent.ExecutorService;
import java.util.concurrent.atomic.AtomicInteger;
import javax.imageio.spi.ServiceRegistry;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.SAXException;
-
import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
import org.apache.tika.concurrent.SimpleThreadPoolExecutor;
import org.apache.tika.detect.CompositeDetector;
@@ -81,16 +72,21 @@
import org.apache.tika.utils.AnnotationUtils;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
-/**
- * Parse xml config file.
- */
+/** Parse xml config file. */
public class TikaConfig {
- public static int DEFAULT_MAX_JSON_STRING_FIELD_LENGTH = 20_000_000;//jackson's default
+ public static int DEFAULT_MAX_JSON_STRING_FIELD_LENGTH = 20_000_000; // jackson's default
public static String MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME = "maxJsonStringFieldLength";
- //use this to look for unneeded instantiations of TikaConfig
+ // use this to look for unneeded instantiations of TikaConfig
protected static final AtomicInteger TIMES_INSTANTIATED = new AtomicInteger();
private static final Logger LOG = LoggerFactory.getLogger(TikaConfig.class);
@@ -124,13 +120,16 @@
public TikaConfig(File file) throws TikaException, IOException, SAXException {
this(XMLReaderUtils.buildDOM(file.toPath()));
}
+
public TikaConfig(File file, ServiceLoader loader)
throws TikaException, IOException, SAXException {
this(XMLReaderUtils.buildDOM(file.toPath()), loader);
}
+
public TikaConfig(URL url) throws TikaException, IOException, SAXException {
this(url, ServiceLoader.getContextClassLoader());
}
+
public TikaConfig(URL url, ClassLoader loader) throws TikaException, IOException, SAXException {
this(XMLReaderUtils.buildDOM(url.toString()).getDocumentElement(), loader);
}
@@ -184,15 +183,14 @@
}
/**
- * Creates a Tika configuration from the built-in media type rules
- * and all the {@link Parser} implementations available through the
- * {@link ServiceRegistry service provider mechanism} in the given
- * class loader.
+ * Creates a Tika configuration from the built-in media type rules and all the {@link Parser}
+ * implementations available through the {@link ServiceRegistry service provider mechanism} in
+ * the given class loader.
*
- * @param loader the class loader through which parser implementations
- * are loaded, or <code>null</code> for no parsers
+ * @param loader the class loader through which parser implementations are loaded, or <code>null
+ * </code> for no parsers
* @throws MimeTypeException if the built-in media type rules are broken
- * @throws IOException if the built-in media type rules can not be read
+ * @throws IOException if the built-in media type rules can not be read
* @since Apache Tika 0.8
*/
public TikaConfig(ClassLoader loader) throws MimeTypeException, IOException {
@@ -210,20 +208,21 @@
}
/**
- * Creates a default Tika configuration.
- * First checks whether an XML config file is specified, either in
- * <ol>
- * <li>System property "tika.config", or</li>
- * <li>Environment variable TIKA_CONFIG</li>
- * </ol>
- * <p>If one of these have a value, try to resolve it relative to file
- * system or classpath.</p>
- * <p>If XML config is not specified, initialize from the built-in media
- * type rules and all the {@link Parser} implementations available through
- * the {@link ServiceRegistry service provider mechanism} in the context
- * class loader of the current thread.</p>
+ * Creates a default Tika configuration. First checks whether an XML config file is specified,
+ * either in
*
- * @throws IOException if the configuration can not be read
+ * <ol>
+ * <li>System property "tika.config", or
+ * <li>Environment variable TIKA_CONFIG
+ * </ol>
+ *
+ * <p>If one of these has a value, try to resolve it relative to file system or classpath.
+ *
+ * <p>If XML config is not specified, initialize from the built-in media type rules and all the
+ * {@link Parser} implementations available through the {@link ServiceRegistry service provider
+ * mechanism} in the context class loader of the current thread.
+ *
+ * @throws IOException if the configuration can not be read
* @throws TikaException if problem with MimeTypes or parsing XML config
*/
public TikaConfig() throws TikaException, IOException {
@@ -281,17 +280,16 @@
this.autoDetectParserConfig = AutoDetectParserConfig.load(element);
setMaxJsonStringFieldLength(element);
} catch (SAXException e) {
- throw new TikaException("Specified Tika configuration has syntax errors: " + config,
- e);
+ throw new TikaException(
+ "Specified Tika configuration has syntax errors: " + config, e);
}
}
TIMES_INSTANTIATED.incrementAndGet();
}
/**
- *
* @return maximum field length when serializing String fields in Tika's metadata or metadata
- * list into JSON
+ * list into JSON
*/
public static int getMaxJsonStringFieldLength() {
return MAX_JSON_STRING_FIELD_LENGTH;
@@ -305,8 +303,9 @@
try {
MAX_JSON_STRING_FIELD_LENGTH = Integer.parseInt(n.getTextContent());
} catch (NumberFormatException e) {
- throw new TikaConfigException(MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME + " " +
- "is not an integer", e);
+ throw new TikaConfigException(
+ MAX_JSON_STRING_FIELD_LENGTH_ELEMENT_NAME + " " + "is not an integer",
+ e);
}
return;
}
@@ -328,8 +327,12 @@
protected static CompositeRenderer getDefaultRenderer(ServiceLoader loader) {
return new CompositeRenderer(loader);
}
- private static CompositeParser getDefaultParser(MimeTypes types, ServiceLoader loader,
- EncodingDetector encodingDetector, Renderer renderer) {
+
+ private static CompositeParser getDefaultParser(
+ MimeTypes types,
+ ServiceLoader loader,
+ EncodingDetector encodingDetector,
+ Renderer renderer) {
return new DefaultParser(types.getMediaTypeRegistry(), loader, encodingDetector, renderer);
}
@@ -379,9 +382,9 @@
}
/**
- * Provides a default configuration (TikaConfig). Currently creates a
- * new instance each time it's called; we may be able to have it
- * return a shared instance once it is completely immutable.
+ * Provides a default configuration (TikaConfig). Currently creates a new instance each time
+ * it's called; we may be able to have it return a shared instance once it is completely
+ * immutable.
*
* @return default configuration
*/
@@ -406,9 +409,8 @@
return null;
}
- private static List<Element> getTopLevelElementChildren(Element element, String parentName,
- String childrenName)
- throws TikaException {
+ private static List<Element> getTopLevelElementChildren(
+ Element element, String parentName, String childrenName) throws TikaException {
Node parentNode = null;
if (parentName != null) {
// Should be only zero or one <parsers> / <detectors> etc tag
@@ -505,8 +507,9 @@
if (loader == null) {
loader = ServiceLoader.getContextClassLoader();
}
- serviceLoader = new ServiceLoader(loader, loadErrorHandler, initializableProblemHandler,
- dynamic);
+ serviceLoader =
+ new ServiceLoader(
+ loader, loadErrorHandler, initializableProblemHandler, dynamic);
} else if (loader != null) {
serviceLoader = new ServiceLoader(loader);
} else {
@@ -520,22 +523,28 @@
if (initializableProblemHandler == null || initializableProblemHandler.length() == 0) {
return InitializableProblemHandler.DEFAULT;
}
- if (InitializableProblemHandler.IGNORE.toString()
+ if (InitializableProblemHandler.IGNORE
+ .toString()
.equalsIgnoreCase(initializableProblemHandler)) {
return InitializableProblemHandler.IGNORE;
- } else if (InitializableProblemHandler.INFO.toString()
+ } else if (InitializableProblemHandler.INFO
+ .toString()
.equalsIgnoreCase(initializableProblemHandler)) {
return InitializableProblemHandler.INFO;
- } else if (InitializableProblemHandler.WARN.toString()
+ } else if (InitializableProblemHandler.WARN
+ .toString()
.equalsIgnoreCase(initializableProblemHandler)) {
return InitializableProblemHandler.WARN;
- } else if (InitializableProblemHandler.THROW.toString()
+ } else if (InitializableProblemHandler.THROW
+ .toString()
.equalsIgnoreCase(initializableProblemHandler)) {
return InitializableProblemHandler.THROW;
}
- throw new TikaConfigException(String.format(Locale.US,
- "Couldn't parse non-null '%s'. Must be one of 'ignore', 'info', 'warn' or 'throw'",
- initializableProblemHandler));
+ throw new TikaConfigException(
+ String.format(
+ Locale.US,
+ "Couldn't parse non-null '%s'. Must be one of 'ignore', 'info', 'warn' or 'throw'",
+ initializableProblemHandler));
}
public static void mustNotBeEmpty(String paramName, String paramValue)
@@ -562,17 +571,16 @@
}
if (child.hasAttribute("maxEntityExpansions")) {
- XMLReaderUtils.setMaxEntityExpansions(Integer.parseInt(child.getAttribute("maxEntityExpansions")));
+ XMLReaderUtils.setMaxEntityExpansions(
+ Integer.parseInt(child.getAttribute("maxEntityExpansions")));
}
// make sure to call this after set entity expansions
if (child.hasAttribute("poolSize")) {
XMLReaderUtils.setPoolSize(Integer.parseInt(child.getAttribute("poolSize")));
}
-
}
-
/**
* Returns the configured parser instance.
*
@@ -633,7 +641,7 @@
return autoDetectParserConfig;
}
- private static abstract class XmlLoader<CT, T> {
+ private abstract static class XmlLoader<CT, T> {
protected static final String PARAMS_TAG_NAME = "params";
abstract boolean supportsComposite();
@@ -655,10 +663,13 @@
abstract CT createComposite(List<T> loaded, MimeTypes mimeTypes, ServiceLoader loader);
- abstract T createComposite(Class<? extends T> compositeClass, List<T> children,
- Set<Class<? extends T>> excludeChildren,
- Map<String, Param> params, MimeTypes mimeTypes,
- ServiceLoader loader)
+ abstract T createComposite(
+ Class<? extends T> compositeClass,
+ List<T> children,
+ Set<Class<? extends T>> excludeChildren,
+ Map<String, Param> params,
+ MimeTypes mimeTypes,
+ ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException, InstantiationException;
abstract T decorate(T created, Element element)
@@ -670,8 +681,8 @@
List<T> loaded = new ArrayList<>();
// Find the children of the parent tag, if any
- for (Element le : getTopLevelElementChildren(element, getParentTagName(),
- getLoaderTagName())) {
+ for (Element le :
+ getTopLevelElementChildren(element, getParentTagName(), getLoaderTagName())) {
T loadedChild = loadOne(le, mimeTypes, loader);
if (loadedChild != null) {
loaded.add(loadedChild);
@@ -694,10 +705,11 @@
return (CT) loaded.get(0);
} else if (loaded.size() > 1) {
throw new TikaConfigException(
- "Composite not supported for " + getParentTagName() +
- ". Must specify only one child!");
+ "Composite not supported for "
+ + getParentTagName()
+ + ". Must specify only one child!");
} else {
- //throw exception if empty?
+ // throw exception if empty?
}
}
// Wrap the defined parsers/detectors up in a Composite
@@ -724,7 +736,7 @@
Class<? extends T> loadedClass = loader.getServiceClass(getLoaderClass(), name);
// Do pre-load checks and short-circuits
- //TODO : allow duplicate instances with different configurations
+ // TODO : allow duplicate instances with different configurations
loaded = preLoadOne(loadedClass, name, mimeTypes);
if (loaded != null) {
return loaded;
@@ -762,10 +774,10 @@
Element excl = (Element) excludeChildNodes.item(i);
String exclName = excl.getAttribute("class");
try {
- excludeChildren
- .add(loader.getServiceClass(getLoaderClass(), exclName));
+ excludeChildren.add(
+ loader.getServiceClass(getLoaderClass(), exclName));
} catch (ClassNotFoundException e) {
- //TIKA-3268 -- This should stop the world.
+ // TIKA-3268 -- This should stop the world.
throw new TikaConfigException(
"Class not found in -exclude list: " + exclName);
}
@@ -773,8 +785,14 @@
}
// Create the Composite
- loaded = createComposite(loadedClass, children, excludeChildren, params,
- mimeTypes, loader);
+ loaded =
+ createComposite(
+ loadedClass,
+ children,
+ excludeChildren,
+ params,
+ mimeTypes,
+ loader);
// Default constructor fallback
if (loaded == null) {
@@ -787,7 +805,7 @@
// See the thread "Configuring parsers and translators" for details
}
- //Assigning the params to bean fields/setters
+ // Assigning the params to bean fields/setters
AnnotationUtils.assignFieldParams(loaded, params);
if (loaded instanceof Initializable) {
((Initializable) loaded).initialize(params);
@@ -817,15 +835,19 @@
"Unable to instantiate a " + getLoaderTagName() + " class: " + name, e);
} catch (NoSuchMethodException e) {
throw new TikaException(
- "Unable to find the right constructor for " + getLoaderTagName() +
- " class: " + name, e);
+ "Unable to find the right constructor for "
+ + getLoaderTagName()
+ + " class: "
+ + name,
+ e);
}
}
-
T newInstance(Class<? extends T> loadedClass)
- throws IllegalAccessException, InstantiationException, NoSuchMethodException,
- InvocationTargetException {
+ throws IllegalAccessException,
+ InstantiationException,
+ NoSuchMethodException,
+ InvocationTargetException {
return loadedClass.getDeclaredConstructor().newInstance();
}
@@ -838,8 +860,8 @@
Map<String, Param> getParams(Element el) throws TikaException {
Map<String, Param> params = new HashMap<>();
for (Node child = el.getFirstChild(); child != null; child = child.getNextSibling()) {
- if (PARAMS_TAG_NAME.equals(child.getNodeName())) { //found the node
- if (child.hasChildNodes()) { //it has children
+ if (PARAMS_TAG_NAME.equals(child.getNodeName())) { // found the node
+ if (child.hasChildNodes()) { // it has children
NodeList childNodes = child.getChildNodes();
for (int i = 0; i < childNodes.getLength(); i++) {
Node item = childNodes.item(i);
@@ -849,12 +871,11 @@
}
}
}
- break; //only the first one is used
+ break; // only the first one is used
}
}
return params;
}
-
}
private static class ParserXmlLoader extends XmlLoader<CompositeParser, Parser> {
@@ -885,13 +906,16 @@
}
@Override
- Parser preLoadOne(Class<? extends Parser> loadedClass, String classname,
- MimeTypes mimeTypes) throws TikaException {
+ Parser preLoadOne(
+ Class<? extends Parser> loadedClass, String classname, MimeTypes mimeTypes)
+ throws TikaException {
// Check for classes which can't be set in config
if (AutoDetectParser.class.isAssignableFrom(loadedClass)) {
// https://issues.apache.org/jira/browse/TIKA-866
- throw new TikaException("AutoDetectParser not supported in a <parser>" +
- " configuration element: " + classname);
+ throw new TikaException(
+ "AutoDetectParser not supported in a <parser>"
+ + " configuration element: "
+ + classname);
}
// Continue with normal loading
return null;
@@ -904,9 +928,9 @@
@Override
boolean isComposite(Class<? extends Parser> loadedClass) {
- return CompositeParser.class.isAssignableFrom(loadedClass) ||
- AbstractMultipleParser.class.isAssignableFrom(loadedClass) ||
- ParserDecorator.class.isAssignableFrom(loadedClass);
+ return CompositeParser.class.isAssignableFrom(loadedClass)
+ || AbstractMultipleParser.class.isAssignableFrom(loadedClass)
+ || ParserDecorator.class.isAssignableFrom(loadedClass);
}
@Override
@@ -915,16 +939,20 @@
}
@Override
- CompositeParser createComposite(List<Parser> parsers, MimeTypes mimeTypes,
- ServiceLoader loader) {
+ CompositeParser createComposite(
+ List<Parser> parsers, MimeTypes mimeTypes, ServiceLoader loader) {
MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
return new CompositeParser(registry, parsers);
}
@Override
- Parser createComposite(Class<? extends Parser> parserClass, List<Parser> childParsers,
- Set<Class<? extends Parser>> excludeParsers,
- Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
+ Parser createComposite(
+ Class<? extends Parser> parserClass,
+ List<Parser> childParsers,
+ Set<Class<? extends Parser>> excludeParsers,
+ Map<String, Param> params,
+ MimeTypes mimeTypes,
+ ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException, InstantiationException {
Parser parser = null;
Constructor<? extends Parser> c = null;
@@ -933,47 +961,61 @@
// Try the possible default and composite parser constructors
if (parser == null) {
try {
- c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class,
- Collection.class, EncodingDetector.class, Renderer.class);
- parser = c.newInstance(registry, loader, excludeParsers, encodingDetector, renderer);
+ c =
+ parserClass.getConstructor(
+ MediaTypeRegistry.class,
+ ServiceLoader.class,
+ Collection.class,
+ EncodingDetector.class,
+ Renderer.class);
+ parser =
+ c.newInstance(
+ registry, loader, excludeParsers, encodingDetector, renderer);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
if (parser == null) {
try {
- c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class,
- Collection.class, EncodingDetector.class);
+ c =
+ parserClass.getConstructor(
+ MediaTypeRegistry.class,
+ ServiceLoader.class,
+ Collection.class,
+ EncodingDetector.class);
parser = c.newInstance(registry, loader, excludeParsers, encodingDetector);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
if (parser == null) {
try {
- c = parserClass.getConstructor(MediaTypeRegistry.class, ServiceLoader.class,
- Collection.class);
+ c =
+ parserClass.getConstructor(
+ MediaTypeRegistry.class, ServiceLoader.class, Collection.class);
parser = c.newInstance(registry, loader, excludeParsers);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
if (parser == null) {
try {
- c = parserClass
- .getConstructor(MediaTypeRegistry.class, List.class, Collection.class);
+ c =
+ parserClass.getConstructor(
+ MediaTypeRegistry.class, List.class, Collection.class);
parser = c.newInstance(registry, childParsers, excludeParsers);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
if (parser == null) {
try {
- c = parserClass
- .getConstructor(MediaTypeRegistry.class, Collection.class, Map.class);
+ c =
+ parserClass.getConstructor(
+ MediaTypeRegistry.class, Collection.class, Map.class);
parser = c.newInstance(registry, childParsers, params);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
if (parser == null) {
@@ -981,7 +1023,7 @@
c = parserClass.getConstructor(MediaTypeRegistry.class, List.class);
parser = c.newInstance(registry, childParsers);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
@@ -989,8 +1031,9 @@
if (parser == null && ParserDecorator.class.isAssignableFrom(parserClass)) {
try {
CompositeParser cp = null;
- if (childParsers.size() == 1 && excludeParsers.size() == 0 &&
- childParsers.get(0) instanceof CompositeParser) {
+ if (childParsers.size() == 1
+ && excludeParsers.size() == 0
+ && childParsers.get(0) instanceof CompositeParser) {
cp = (CompositeParser) childParsers.get(0);
} else {
cp = new CompositeParser(registry, childParsers, excludeParsers);
@@ -998,7 +1041,7 @@
c = parserClass.getConstructor(Parser.class);
parser = c.newInstance(cp);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
return parser;
@@ -1006,8 +1049,10 @@
@Override
Parser newInstance(Class<? extends Parser> loadedClass)
- throws IllegalAccessException, InstantiationException, NoSuchMethodException,
- InvocationTargetException {
+ throws IllegalAccessException,
+ InstantiationException,
+ NoSuchMethodException,
+ InvocationTargetException {
Parser parser = null;
if (AbstractEncodingDetectorParser.class.isAssignableFrom(loadedClass)) {
Constructor ctor = loadedClass.getConstructor(EncodingDetector.class);
@@ -1017,7 +1062,7 @@
}
if (parser instanceof RenderingParser) {
- ((RenderingParser)parser).setRenderer(renderer);
+ ((RenderingParser) parser).setRenderer(renderer);
}
return parser;
}
@@ -1040,7 +1085,6 @@
// All done with decoration
return parser;
}
-
}
private static class DetectorXmlLoader extends XmlLoader<CompositeDetector, Detector> {
@@ -1062,8 +1106,9 @@
}
@Override
- Detector preLoadOne(Class<? extends Detector> loadedClass, String classname,
- MimeTypes mimeTypes) throws TikaException {
+ Detector preLoadOne(
+ Class<? extends Detector> loadedClass, String classname, MimeTypes mimeTypes)
+ throws TikaException {
// If they asked for the mime types as a detector, give
// them the one we've already created. TIKA-1708
if (MimeTypes.class.equals(loadedClass)) {
@@ -1089,18 +1134,20 @@
}
@Override
- CompositeDetector createComposite(List<Detector> detectors, MimeTypes mimeTypes,
- ServiceLoader loader) {
+ CompositeDetector createComposite(
+ List<Detector> detectors, MimeTypes mimeTypes, ServiceLoader loader) {
MediaTypeRegistry registry = mimeTypes.getMediaTypeRegistry();
return new CompositeDetector(registry, detectors);
}
@Override
- Detector createComposite(Class<? extends Detector> detectorClass,
- List<Detector> childDetectors,
- Set<Class<? extends Detector>> excludeDetectors,
- Map<String, Param> params, MimeTypes mimeTypes,
- ServiceLoader loader)
+ Detector createComposite(
+ Class<? extends Detector> detectorClass,
+ List<Detector> childDetectors,
+ Set<Class<? extends Detector>> excludeDetectors,
+ Map<String, Param> params,
+ MimeTypes mimeTypes,
+ ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException, InstantiationException {
Detector detector = null;
Constructor<? extends Detector> c;
@@ -1109,20 +1156,22 @@
// Try the possible default and composite detector constructors
if (detector == null) {
try {
- c = detectorClass
- .getConstructor(MimeTypes.class, ServiceLoader.class, Collection.class);
+ c =
+ detectorClass.getConstructor(
+ MimeTypes.class, ServiceLoader.class, Collection.class);
detector = c.newInstance(mimeTypes, loader, excludeDetectors);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
if (detector == null) {
try {
- c = detectorClass
- .getConstructor(MediaTypeRegistry.class, List.class, Collection.class);
+ c =
+ detectorClass.getConstructor(
+ MediaTypeRegistry.class, List.class, Collection.class);
detector = c.newInstance(registry, childDetectors, excludeDetectors);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
if (detector == null) {
@@ -1130,7 +1179,7 @@
c = detectorClass.getConstructor(MediaTypeRegistry.class, List.class);
detector = c.newInstance(registry, childDetectors);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
if (detector == null) {
@@ -1138,7 +1187,7 @@
c = detectorClass.getConstructor(List.class);
detector = c.newInstance(childDetectors);
} catch (NoSuchMethodException me) {
- //swallow
+ // swallow
}
}
@@ -1170,8 +1219,9 @@
}
@Override
- Translator preLoadOne(Class<? extends Translator> loadedClass, String classname,
- MimeTypes mimeTypes) throws TikaException {
+ Translator preLoadOne(
+ Class<? extends Translator> loadedClass, String classname, MimeTypes mimeTypes)
+ throws TikaException {
// Continue with normal loading
return null;
}
@@ -1192,17 +1242,19 @@
}
@Override
- Translator createComposite(List<Translator> loaded, MimeTypes mimeTypes,
- ServiceLoader loader) {
+ Translator createComposite(
+ List<Translator> loaded, MimeTypes mimeTypes, ServiceLoader loader) {
return loaded.get(0);
}
@Override
- Translator createComposite(Class<? extends Translator> compositeClass,
- List<Translator> children,
- Set<Class<? extends Translator>> excludeChildren,
- Map<String, Param> params, MimeTypes mimeTypes,
- ServiceLoader loader)
+ Translator createComposite(
+ Class<? extends Translator> compositeClass,
+ List<Translator> children,
+ Set<Class<? extends Translator>> excludeChildren,
+ Map<String, Param> params,
+ MimeTypes mimeTypes,
+ ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException, InstantiationException {
throw new InstantiationException("Only one translator supported");
}
@@ -1220,14 +1272,18 @@
Class<? extends ConfigurableThreadPoolExecutor> compositeClass,
List<ConfigurableThreadPoolExecutor> children,
Set<Class<? extends ConfigurableThreadPoolExecutor>> excludeChildren,
- Map<String, Param> params, MimeTypes mimeTypes, ServiceLoader loader)
+ Map<String, Param> params,
+ MimeTypes mimeTypes,
+ ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException, InstantiationException {
throw new InstantiationException("Only one executor service supported");
}
@Override
- ConfigurableThreadPoolExecutor createComposite(List<ConfigurableThreadPoolExecutor> loaded,
- MimeTypes mimeTypes, ServiceLoader loader) {
+ ConfigurableThreadPoolExecutor createComposite(
+ List<ConfigurableThreadPoolExecutor> loaded,
+ MimeTypes mimeTypes,
+ ServiceLoader loader) {
return loaded.get(0);
}
@@ -1237,8 +1293,9 @@
}
@Override
- ConfigurableThreadPoolExecutor decorate(ConfigurableThreadPoolExecutor created,
- Element element) throws IOException, TikaException {
+ ConfigurableThreadPoolExecutor decorate(
+ ConfigurableThreadPoolExecutor created, Element element)
+ throws IOException, TikaException {
Element maxThreadElement = getChild(element, "max-threads");
if (maxThreadElement != null) {
@@ -1258,8 +1315,8 @@
}
@Override
- ConfigurableThreadPoolExecutor loadOne(Element element, MimeTypes mimeTypes,
- ServiceLoader loader)
+ ConfigurableThreadPoolExecutor loadOne(
+ Element element, MimeTypes mimeTypes, ServiceLoader loader)
throws TikaException, IOException {
return super.loadOne(element, mimeTypes, loader);
}
@@ -1291,8 +1348,10 @@
@Override
ConfigurableThreadPoolExecutor preLoadOne(
- Class<? extends ConfigurableThreadPoolExecutor> loadedClass, String classname,
- MimeTypes mimeTypes) throws TikaException {
+ Class<? extends ConfigurableThreadPoolExecutor> loadedClass,
+ String classname,
+ MimeTypes mimeTypes)
+ throws TikaException {
return null;
}
}
@@ -1317,7 +1376,6 @@
return EncodingDetector.class;
}
-
@Override
boolean isComposite(EncodingDetector loaded) {
return loaded instanceof CompositeEncodingDetector;
@@ -1329,8 +1387,11 @@
}
@Override
- EncodingDetector preLoadOne(Class<? extends EncodingDetector> loadedClass, String classname,
- MimeTypes mimeTypes) throws TikaException {
+ EncodingDetector preLoadOne(
+ Class<? extends EncodingDetector> loadedClass,
+ String classname,
+ MimeTypes mimeTypes)
+ throws TikaException {
// Check for classes which can't be set in config
// Continue with normal loading
return null;
@@ -1342,17 +1403,21 @@
}
@Override
- CompositeEncodingDetector createComposite(List<EncodingDetector> encodingDetectors,
- MimeTypes mimeTypes, ServiceLoader loader) {
+ CompositeEncodingDetector createComposite(
+ List<EncodingDetector> encodingDetectors,
+ MimeTypes mimeTypes,
+ ServiceLoader loader) {
return new CompositeEncodingDetector(encodingDetectors);
}
@Override
- EncodingDetector createComposite(Class<? extends EncodingDetector> encodingDetectorClass,
- List<EncodingDetector> childEncodingDetectors,
- Set<Class<? extends EncodingDetector>> excludeDetectors,
- Map<String, Param> params, MimeTypes mimeTypes,
- ServiceLoader loader)
+ EncodingDetector createComposite(
+ Class<? extends EncodingDetector> encodingDetectorClass,
+ List<EncodingDetector> childEncodingDetectors,
+ Set<Class<? extends EncodingDetector>> excludeDetectors,
+ Map<String, Param> params,
+ MimeTypes mimeTypes,
+ ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException, InstantiationException {
EncodingDetector encodingDetector = null;
Constructor<? extends EncodingDetector> c;
@@ -1363,7 +1428,8 @@
c = encodingDetectorClass.getConstructor(ServiceLoader.class, Collection.class);
encodingDetector = c.newInstance(loader, excludeDetectors);
} catch (NoSuchMethodException me) {
- LOG.debug("couldn't find constructor for service loader + collection for {}",
+ LOG.debug(
+ "couldn't find constructor for service loader + collection for {}",
encodingDetectorClass);
}
}
@@ -1372,7 +1438,8 @@
c = encodingDetectorClass.getConstructor(List.class);
encodingDetector = c.newInstance(childEncodingDetectors);
} catch (NoSuchMethodException me) {
- LOG.debug("couldn't find constructor for EncodingDetector(List) for {}",
+ LOG.debug(
+ "couldn't find constructor for EncodingDetector(List) for {}",
encodingDetectorClass);
}
}
@@ -1386,8 +1453,7 @@
}
}
- private static class RendererXmlLoader
- extends XmlLoader<Renderer, Renderer> {
+ private static class RendererXmlLoader extends XmlLoader<Renderer, Renderer> {
boolean supportsComposite() {
return true;
@@ -1406,7 +1472,6 @@
return Renderer.class;
}
-
@Override
boolean isComposite(Renderer loaded) {
return loaded instanceof CompositeRenderer;
@@ -1418,8 +1483,9 @@
}
@Override
- Renderer preLoadOne(Class<? extends Renderer> loadedClass, String classname,
- MimeTypes mimeTypes) throws TikaException {
+ Renderer preLoadOne(
+ Class<? extends Renderer> loadedClass, String classname, MimeTypes mimeTypes)
+ throws TikaException {
// Check for classes which can't be set in config
// Continue with normal loading
return null;
@@ -1431,17 +1497,19 @@
}
@Override
- Renderer createComposite(List<Renderer> renderers,
- MimeTypes mimeTypes, ServiceLoader loader) {
+ Renderer createComposite(
+ List<Renderer> renderers, MimeTypes mimeTypes, ServiceLoader loader) {
return new CompositeRenderer(renderers);
}
@Override
- Renderer createComposite(Class<? extends Renderer> rendererClass,
- List<Renderer> childRenderers,
- Set<Class<? extends Renderer>> excludeRenderers,
- Map<String, Param> params, MimeTypes mimeTypes,
- ServiceLoader loader)
+ Renderer createComposite(
+ Class<? extends Renderer> rendererClass,
+ List<Renderer> childRenderers,
+ Set<Class<? extends Renderer>> excludeRenderers,
+ Map<String, Param> params,
+ MimeTypes mimeTypes,
+ ServiceLoader loader)
throws InvocationTargetException, IllegalAccessException, InstantiationException {
Renderer renderer = null;
Constructor<? extends Renderer> c;
@@ -1452,7 +1520,8 @@
c = rendererClass.getConstructor(ServiceLoader.class, Collection.class);
renderer = c.newInstance(loader, excludeRenderers);
} catch (NoSuchMethodException me) {
- LOG.debug("couldn't find constructor for service loader + collection for {}",
+ LOG.debug(
+ "couldn't find constructor for service loader + collection for {}",
renderer);
}
}
@@ -1461,8 +1530,7 @@
c = rendererClass.getConstructor(List.class);
renderer = c.newInstance(childRenderers);
} catch (NoSuchMethodException me) {
- LOG.debug("couldn't find constructor for Renderer(List) for {}",
- rendererClass);
+ LOG.debug("couldn't find constructor for Renderer(List) for {}", rendererClass);
}
}
return renderer;
diff --git a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
index a2313f4..8077c2b 100644
--- a/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
+++ b/tika-core/src/main/java/org/apache/tika/config/TikaConfigSerializer.java
@@ -39,13 +39,6 @@
import javax.xml.transform.Transformer;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.w3c.dom.Document;
-import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-
import org.apache.tika.detect.CompositeDetector;
import org.apache.tika.detect.CompositeEncodingDetector;
import org.apache.tika.detect.DefaultDetector;
@@ -63,6 +56,11 @@
import org.apache.tika.parser.multiple.AbstractMultipleParser;
import org.apache.tika.utils.StringUtils;
import org.apache.tika.utils.XMLReaderUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Document;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
public class TikaConfigSerializer {
@@ -86,9 +84,9 @@
}
/**
- * @param config config to serialize
- * @param mode serialization mode
- * @param writer writer
+ * @param config config to serialize
+ * @param mode serialization mode
+ * @param writer writer
* @param charset charset
* @throws Exception
*/
@@ -121,8 +119,8 @@
transformer.transform(source, result);
}
- private static void addExecutorService(Mode mode, Element rootElement, Document doc,
- TikaConfig config) {
+ private static void addExecutorService(
+ Mode mode, Element rootElement, Document doc, TikaConfig config) {
ExecutorService executor = config.getExecutorService();
// TODO Implement the reverse of ExecutorServiceXmlLoader
@@ -130,8 +128,8 @@
// TODO Make it possible to get the current values from ConfigurableThreadPoolExecutor
}
- private static void addServiceLoader(Mode mode, Element rootElement, Document doc,
- TikaConfig config) {
+ private static void addServiceLoader(
+ Mode mode, Element rootElement, Document doc, TikaConfig config) {
ServiceLoader loader = config.getServiceLoader();
if (mode == Mode.MINIMAL) {
@@ -148,18 +146,20 @@
rootElement.appendChild(dslEl);
}
- private static void addTranslator(Mode mode, Element rootElement, Document doc,
- TikaConfig config) {
+ private static void addTranslator(
+ Mode mode, Element rootElement, Document doc, TikaConfig config) {
// Unlike the other entries, TikaConfig only wants one of
// these, and no outer <translators> list
Translator translator = config.getTranslator();
if (mode == Mode.MINIMAL && translator instanceof DefaultTranslator) {
- Node mimeComment = doc.createComment("for example: <translator " +
- "class=\"org.apache.tika.language.translate.GoogleTranslator\"/>");
+ Node mimeComment =
+ doc.createComment(
+ "for example: <translator "
+ + "class=\"org.apache.tika.language.translate.GoogleTranslator\"/>");
rootElement.appendChild(mimeComment);
} else {
- if (translator instanceof DefaultTranslator &&
- (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) {
+ if (translator instanceof DefaultTranslator
+ && (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) {
translator = ((DefaultTranslator) translator).getTranslator();
}
if (translator != null) {
@@ -173,28 +173,31 @@
}
private static void addMimeComment(Mode mode, Element rootElement, Document doc) {
- Node mimeComment = doc.createComment("for example: <mimeTypeRepository " +
- "resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>");
+ Node mimeComment =
+ doc.createComment(
+ "for example: <mimeTypeRepository "
+ + "resource=\"/org/apache/tika/mime/tika-mimetypes.xml\"/>");
rootElement.appendChild(mimeComment);
}
- private static void addEncodingDetectors(Mode mode, Element rootElement, Document doc,
- TikaConfig config) throws Exception {
+ private static void addEncodingDetectors(
+ Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
EncodingDetector encDetector = config.getEncodingDetector();
if (mode == Mode.MINIMAL && encDetector instanceof DefaultEncodingDetector) {
// Don't output anything, all using defaults
- Node detComment = doc.createComment(
- "for example: <encodingDetectors><encodingDetector class=\"" +
- "org.apache.tika.detect.DefaultEncodingDetector\">" +
- "</encodingDetectors>");
+ Node detComment =
+ doc.createComment(
+ "for example: <encodingDetectors><encodingDetector class=\""
+ + "org.apache.tika.detect.DefaultEncodingDetector\">"
+ + "</encodingDetectors>");
rootElement.appendChild(detComment);
return;
}
Element encDetectorsElement = doc.createElement("encodingDetectors");
- if (mode == Mode.CURRENT && encDetector instanceof DefaultEncodingDetector ||
- !(encDetector instanceof CompositeEncodingDetector)) {
+ if (mode == Mode.CURRENT && encDetector instanceof DefaultEncodingDetector
+ || !(encDetector instanceof CompositeEncodingDetector)) {
Element encDetectorElement = doc.createElement("encodingDetector");
encDetectorElement.setAttribute("class", encDetector.getClass().getCanonicalName());
encDetectorsElement.appendChild(encDetectorElement);
@@ -212,21 +215,23 @@
rootElement.appendChild(encDetectorsElement);
}
- private static void addDetectors(Mode mode, Element rootElement, Document doc,
- TikaConfig config) throws Exception {
+ private static void addDetectors(
+ Mode mode, Element rootElement, Document doc, TikaConfig config) throws Exception {
Detector detector = config.getDetector();
if (mode == Mode.MINIMAL && detector instanceof DefaultDetector) {
// Don't output anything, all using defaults
- Node detComment = doc.createComment("for example: <detectors><detector " +
- "class=\"org.apache.tika.detector.MimeTypes\"></detectors>");
+ Node detComment =
+ doc.createComment(
+ "for example: <detectors><detector "
+ + "class=\"org.apache.tika.detector.MimeTypes\"></detectors>");
rootElement.appendChild(detComment);
return;
}
Element detectorsElement = doc.createElement("detectors");
- if (mode == Mode.CURRENT && detector instanceof DefaultDetector ||
- !(detector instanceof CompositeDetector)) {
+ if (mode == Mode.CURRENT && detector instanceof DefaultDetector
+ || !(detector instanceof CompositeDetector)) {
Element detectorElement = doc.createElement("detector");
detectorElement.setAttribute("class", detector.getClass().getCanonicalName());
detectorsElement.appendChild(detectorElement);
@@ -280,8 +285,8 @@
outputParser = false;
}
// Special case for making Default to static
- if (parser instanceof DefaultParser &&
- (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) {
+ if (parser instanceof DefaultParser
+ && (mode == Mode.STATIC || mode == Mode.STATIC_FULL)) {
outputParser = false;
}
} else if (parser instanceof AbstractMultipleParser) {
@@ -298,8 +303,9 @@
// TODO Parser Exclusions
}
- private static Element addParser(Mode mode, Element rootElement, Document doc, Parser parser,
- ParserDecorator decorator) throws Exception {
+ private static Element addParser(
+ Mode mode, Element rootElement, Document doc, Parser parser, ParserDecorator decorator)
+ throws Exception {
ParseContext context = new ParseContext();
Set<MediaType> addedTypes = new TreeSet<>();
@@ -343,7 +349,7 @@
Matcher setterMatcher = Pattern.compile("\\Aset([A-Z].*)").matcher("");
Matcher getterMatcher = Pattern.compile("\\A(?:get|is)([A-Z].+)\\Z").matcher("");
- //TODO -- check code base for setters with lowercase initial letters?!
+ // TODO -- check code base for setters with lowercase initial letters?!
MethodTuples nonPrimitiveSetters = new MethodTuples();
MethodTuples primitiveSetters = new MethodTuples();
MethodTuples nonPrimitiveGetters = new MethodTuples();
@@ -353,18 +359,22 @@
if (setterMatcher.reset(method.getName()).find()) {
if (!Modifier.isPublic(method.getModifiers())) {
- //we could just call getMethods, but this can be helpful debugging inf
+ // we could just call getMethods, but this can be helpful debugging inf
LOG.trace("inaccessible setter: {} in {}", method.getName(), object.getClass());
continue;
}
- //require @Field on setters
+ // require @Field on setters
if (method.getAnnotation(Field.class) == null) {
- // LOG.warn("unannotated setter {} in {}", method.getName(), object.getClass());
+ // LOG.warn("unannotated setter {} in {}", method.getName(), object.getClass());
continue;
}
if (parameterTypes.length != 1) {
- //TODO -- check code base for setX() zero parameters that set boolean to true
- LOG.warn("setter with wrong number of params " + method.getName() + " " + parameterTypes.length);
+ // TODO -- check code base for setX() zero parameters that set boolean to true
+ LOG.warn(
+ "setter with wrong number of params "
+ + method.getName()
+ + " "
+ + parameterTypes.length);
continue;
}
String paramName = methodToParamName(setterMatcher.group(1));
@@ -375,23 +385,23 @@
}
} else if (getterMatcher.reset(method.getName()).find()) {
if (parameterTypes.length != 0) {
- //require 0 parameters for the getter
+ // require 0 parameters for the getter
continue;
}
String paramName = methodToParamName(getterMatcher.group(1));
if (PRIMITIVES.containsKey(method.getReturnType())) {
- primitiveGetters.add(new MethodTuple(paramName, method, method.getReturnType()));
+ primitiveGetters.add(
+ new MethodTuple(paramName, method, method.getReturnType()));
} else {
- nonPrimitiveGetters.add(new MethodTuple(paramName, method, method.getReturnType()));
+ nonPrimitiveGetters.add(
+ new MethodTuple(paramName, method, method.getReturnType()));
}
-
}
}
- //TODO -- remove nonprimitive setters/getters that have a string equivalent
+ // TODO -- remove nonprimitive setters/getters that have a string equivalent
serializePrimitives(doc, element, object, primitiveSetters, primitiveGetters);
serializeNonPrimitives(doc, element, object, nonPrimitiveSetters, nonPrimitiveGetters);
-
}
private static String methodToParamName(String name) {
@@ -399,28 +409,35 @@
return name;
}
return name.substring(0, 1).toLowerCase(Locale.US) + name.substring(1);
-
}
- private static void serializeNonPrimitives(Document doc, Element element,
- Object object,
- MethodTuples setterTuples,
- MethodTuples getterTuples) {
+ private static void serializeNonPrimitives(
+ Document doc,
+ Element element,
+ Object object,
+ MethodTuples setterTuples,
+ MethodTuples getterTuples) {
for (Map.Entry<String, Set<MethodTuple>> e : setterTuples.tuples.entrySet()) {
Set<MethodTuple> getters = getterTuples.tuples.get(e.getKey());
processNonPrimitive(e.getKey(), e.getValue(), getters, doc, element, object);
if (!getterTuples.tuples.containsKey(e.getKey())) {
- LOG.warn("no getter for setter non-primitive: {} in {}", e.getKey(),
+ LOG.warn(
+ "no getter for setter non-primitive: {} in {}",
+ e.getKey(),
object.getClass());
continue;
}
}
}
- private static void processNonPrimitive(String name, Set<MethodTuple> setters,
- Set<MethodTuple> getters, Document doc, Element element,
- Object object) {
+ private static void processNonPrimitive(
+ String name,
+ Set<MethodTuple> setters,
+ Set<MethodTuple> getters,
+ Document doc,
+ Element element,
+ Object object) {
for (MethodTuple setter : setters) {
for (MethodTuple getter : getters) {
if (setter.singleParam.equals(getter.singleParam)) {
@@ -431,9 +448,13 @@
}
}
- private static void serializeObject(String name, Document doc, Element element,
- MethodTuple setter,
- MethodTuple getter, Object object) {
+ private static void serializeObject(
+ String name,
+ Document doc,
+ Element element,
+ MethodTuple setter,
+ MethodTuple getter,
+ Object object) {
Object item = null;
try {
@@ -451,17 +472,20 @@
serializeParams(doc, element, item);
}
- private static void serializePrimitives(Document doc, Element root,
- Object object,
- MethodTuples setterTuples, MethodTuples getterTuples) {
+ private static void serializePrimitives(
+ Document doc,
+ Element root,
+ Object object,
+ MethodTuples setterTuples,
+ MethodTuples getterTuples) {
Element paramsElement = null;
if (object instanceof AbstractMultipleParser) {
paramsElement = doc.createElement("params");
Element paramElement = doc.createElement("param");
paramElement.setAttribute("name", "metadataPolicy");
- paramElement.setAttribute("value",
- ((AbstractMultipleParser) object).getMetadataPolicy().toString());
+ paramElement.setAttribute(
+ "value", ((AbstractMultipleParser) object).getMetadataPolicy().toString());
paramsElement.appendChild(paramElement);
root.appendChild(paramsElement);
}
@@ -504,10 +528,10 @@
param.setAttribute("name", getterTuple.name);
param.setAttribute("type", PRIMITIVES.get(getterTuple.singleParam));
if (List.class.isAssignableFrom(getterTuple.singleParam)) {
- //this outputs even empty list elements, which I think is good.
+ // this outputs even empty list elements, which I think is good.
addList(param, doc, getterTuple, (List<String>) value);
} else if (Map.class.isAssignableFrom(getterTuple.singleParam)) {
- //this outputs even empty lists, which I think is good.
+ // this outputs even empty lists, which I think is good.
addMap(param, doc, getterTuple, (Map<String, String>) value);
} else {
param.setTextContent(valString);
@@ -520,19 +544,18 @@
}
}
- private static void addMap(Element param, Document doc, MethodTuple getterTuple,
- Map<String, String> object) {
+ private static void addMap(
+ Element param, Document doc, MethodTuple getterTuple, Map<String, String> object) {
for (Map.Entry<String, String> e : new TreeMap<String, String>(object).entrySet()) {
Element element = doc.createElement("string");
element.setAttribute("key", e.getKey());
element.setAttribute("value", e.getValue());
param.appendChild(element);
}
-
}
- private static void addList(Element param, Document doc, MethodTuple getterTuple,
- List<String> list) {
+ private static void addList(
+ Element param, Document doc, MethodTuple getterTuple, List<String> list) {
for (String s : list) {
Element element = doc.createElement("string");
element.setTextContent(s);
@@ -563,8 +586,8 @@
}
private static MethodTuple pickBestSetter(Set<MethodTuple> tuples) {
- //TODO -- if both string and integer, which one do we pick?
- //stub for now -- just pick the first
+ // TODO -- if both string and integer, which one do we pick?
+ // stub for now -- just pick the first
for (MethodTuple t : tuples) {
return t;
}
@@ -587,6 +610,7 @@
return tuples.size();
}
}
+
private static class MethodTuple {
String name;
Method method;
@@ -607,8 +631,9 @@
return false;
}
MethodTuple that = (MethodTuple) o;
- return name.equals(that.name) && method.equals(that.method) &&
- singleParam.equals(that.singleParam);
+ return name.equals(that.name)
+ && method.equals(that.method)
+ && singleParam.equals(that.singleParam);
}
@Override
@@ -616,24 +641,18 @@
return Objects.hash(name, method, singleParam);
}
}
+
public enum Mode {
- /**
- * Minimal version of the config, defaults where possible
- */
+ /** Minimal version of the config, defaults where possible */
MINIMAL,
- /**
- * Current config, roughly as loaded
- */
+ /** Current config, roughly as loaded */
CURRENT,
- /**
- * Static version of the config, with explicit lists of parsers/decorators/etc
- */
+ /** Static version of the config, with explicit lists of parsers/decorators/etc */
STATIC,
/**
- * Static version of the config, with explicit lists of decorators etc,
- * and all parsers given with their detected supported mime types
+ * Static version of the config, with explicit lists of decorators etc, and all parsers
+ * given with their detected supported mime types
*/
STATIC_FULL
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/config/package-info.java b/tika-core/src/main/java/org/apache/tika/config/package-info.java
index 77a0559..93b2ca8 100644
--- a/tika-core/src/main/java/org/apache/tika/config/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/config/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * Tika configuration tools.
- */
+/** Tika configuration tools. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.config;
diff --git a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
index bd7d4f2..ee9833d 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/AutoDetectReader.java
@@ -22,9 +22,6 @@
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
-
-import org.xml.sax.InputSource;
-
import org.apache.tika.config.LoadErrorHandler;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.exception.TikaException;
@@ -32,10 +29,11 @@
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.utils.CharsetUtils;
+import org.xml.sax.InputSource;
/**
- * An input stream reader that automatically detects the character encoding
- * to be used for converting bytes to characters.
+ * An input stream reader that automatically detects the character encoding to be used for
+ * converting bytes to characters.
*
* @since Apache Tika 1.2
*/
@@ -47,8 +45,9 @@
private static final EncodingDetector DEFAULT_DETECTOR;
static {
- DEFAULT_DETECTOR = new CompositeEncodingDetector(
- DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class));
+ DEFAULT_DETECTOR =
+ new CompositeEncodingDetector(
+ DEFAULT_LOADER.loadServiceProviders(EncodingDetector.class));
}
private final Charset charset;
@@ -65,28 +64,33 @@
}
/**
- * @param stream stream from which to read -- make sure that it supports mark!
+ * @param stream stream from which to read -- make sure that it supports mark!
* @param metadata
* @param detector
* @param handler
* @throws IOException
* @throws TikaException
*/
- private AutoDetectReader(InputStream stream, Metadata metadata,
- EncodingDetector detector, LoadErrorHandler handler)
+ private AutoDetectReader(
+ InputStream stream,
+ Metadata metadata,
+ EncodingDetector detector,
+ LoadErrorHandler handler)
throws IOException, TikaException {
this(stream, detect(stream, metadata, detector, handler));
}
- public AutoDetectReader(InputStream stream, Metadata metadata,
- EncodingDetector encodingDetector) throws IOException, TikaException {
- this(getBuffered(stream), metadata, encodingDetector,
- DEFAULT_LOADER.getLoadErrorHandler());
+ public AutoDetectReader(
+ InputStream stream, Metadata metadata, EncodingDetector encodingDetector)
+ throws IOException, TikaException {
+ this(getBuffered(stream), metadata, encodingDetector, DEFAULT_LOADER.getLoadErrorHandler());
}
public AutoDetectReader(InputStream stream, Metadata metadata, ServiceLoader loader)
throws IOException, TikaException {
- this(getBuffered(stream), metadata,
+ this(
+ getBuffered(stream),
+ metadata,
new CompositeEncodingDetector(loader.loadServiceProviders(EncodingDetector.class)),
loader.getLoadErrorHandler());
}
@@ -100,8 +104,11 @@
this(stream, new Metadata());
}
- private static Charset detect(InputStream input, Metadata metadata,
- EncodingDetector detector, LoadErrorHandler handler)
+ private static Charset detect(
+ InputStream input,
+ Metadata metadata,
+ EncodingDetector detector,
+ LoadErrorHandler handler)
throws IOException, TikaException {
// Ask all given detectors for the character encoding
try {
@@ -122,7 +129,8 @@
try {
Charset cs = CharsetUtils.forName(charset);
metadata.set(TikaCoreProperties.DETECTED_ENCODING, cs.name());
- metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+ metadata.set(
+ TikaCoreProperties.ENCODING_DETECTOR,
"AutoDetectReader-charset-metadata-fallback");
return cs;
} catch (IllegalArgumentException e) {
@@ -141,7 +149,6 @@
return new BufferedInputStream(stream);
}
-
public Charset getCharset() {
return charset;
}
@@ -151,5 +158,4 @@
source.setEncoding(charset.name());
return source;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
index ed53918..13d5396 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeDetector.java
@@ -23,29 +23,26 @@
import java.util.Collection;
import java.util.Collections;
import java.util.List;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.utils.StringUtils;
-/**
- * Content type detector that combines multiple different detection mechanisms.
- */
+/** Content type detector that combines multiple different detection mechanisms. */
public class CompositeDetector implements Detector {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 5980683158436430252L;
private final MediaTypeRegistry registry;
private final List<Detector> detectors;
- public CompositeDetector(MediaTypeRegistry registry, List<Detector> detectors,
- Collection<Class<? extends Detector>> excludeDetectors) {
+ public CompositeDetector(
+ MediaTypeRegistry registry,
+ List<Detector> detectors,
+ Collection<Class<? extends Detector>> excludeDetectors) {
if (excludeDetectors == null || excludeDetectors.isEmpty()) {
this.detectors = detectors;
} else {
@@ -78,8 +75,8 @@
}
MediaType type = MediaType.OCTET_STREAM;
- //we have to iterate through all detectors because the override detector may
- //be within a CompositeDetector
+ // we have to iterate through all detectors because the override detector may
+ // be within a CompositeDetector
for (Detector detector : getDetectors()) {
MediaType detected = detector.detect(input, metadata);
if (registry.isSpecializationOf(detected, type)) {
@@ -90,7 +87,6 @@
}
/**
- *
* @param metadata
* @return mediaType if a parseable mediatype was sent in via user or parser overrides
*/
@@ -111,20 +107,19 @@
}
return null;
}
- /**
- * Returns the component detectors.
- */
+
+ /** Returns the component detectors. */
public List<Detector> getDetectors() {
return Collections.unmodifiableList(detectors);
}
- private boolean isExcluded(Collection<Class<? extends Detector>> excludeDetectors,
- Class<? extends Detector> d) {
+ private boolean isExcluded(
+ Collection<Class<? extends Detector>> excludeDetectors, Class<? extends Detector> d) {
return excludeDetectors.contains(d) || assignableFrom(excludeDetectors, d);
}
- private boolean assignableFrom(Collection<Class<? extends Detector>> excludeDetectors,
- Class<? extends Detector> d) {
+ private boolean assignableFrom(
+ Collection<Class<? extends Detector>> excludeDetectors, Class<? extends Detector> d) {
for (Class<? extends Detector> e : excludeDetectors) {
if (e.isAssignableFrom(d)) {
return true;
diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
index 7db79cc..999ed60 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java
@@ -24,29 +24,25 @@
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
public class CompositeEncodingDetector implements EncodingDetector, Serializable {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 5980683158436430252L;
private final List<EncodingDetector> detectors;
- public CompositeEncodingDetector(List<EncodingDetector> detectors,
- Collection<Class<? extends EncodingDetector>>
- excludeEncodingDetectors) {
+ public CompositeEncodingDetector(
+ List<EncodingDetector> detectors,
+ Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors) {
this.detectors = new LinkedList<>();
for (EncodingDetector encodingDetector : detectors) {
if (!isExcluded(excludeEncodingDetectors, encodingDetector.getClass())) {
this.detectors.add(encodingDetector);
}
}
-
}
public CompositeEncodingDetector(List<EncodingDetector> detectors) {
@@ -55,7 +51,7 @@
}
/**
- * @param input text document input stream, or <code>null</code>
+ * @param input text document input stream, or <code>null</code>
* @param metadata input metadata for the document
* @return the detected Charset or null if no charset could be detected
* @throws IOException
@@ -66,9 +62,10 @@
Charset detected = detector.detect(input, metadata);
if (detected != null) {
metadata.set(TikaCoreProperties.DETECTED_ENCODING, detected.name());
- //if this has been set by a leaf detector, do not overwrite
- if (! detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) {
- metadata.set(TikaCoreProperties.ENCODING_DETECTOR,
+ // if this has been set by a leaf detector, do not overwrite
+ if (!detector.getClass().getSimpleName().equals("CompositeEncodingDetector")) {
+ metadata.set(
+ TikaCoreProperties.ENCODING_DETECTOR,
detector.getClass().getSimpleName());
}
return detected;
@@ -84,8 +81,8 @@
private boolean isExcluded(
Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors,
Class<? extends EncodingDetector> encodingDetector) {
- return excludeEncodingDetectors.contains(encodingDetector) ||
- assignableFrom(excludeEncodingDetectors, encodingDetector);
+ return excludeEncodingDetectors.contains(encodingDetector)
+ || assignableFrom(excludeEncodingDetectors, encodingDetector);
}
private boolean assignableFrom(
diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
index 038d274..755c176 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultDetector.java
@@ -20,33 +20,32 @@
import java.util.Collections;
import java.util.List;
import javax.imageio.spi.ServiceRegistry;
-
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.utils.ServiceLoaderUtils;
/**
- * A composite detector based on all the {@link Detector} implementations
- * available through the {@link ServiceRegistry service provider mechanism}.
- * <p>
- * Detectors are loaded and returned in a specified order, of user supplied
- * followed by non-MimeType Tika, followed by the Tika MimeType class.
- * If you need to control the order of the Detectors, you should instead
- * construct your own {@link CompositeDetector} and pass in the list
+ * A composite detector based on all the {@link Detector} implementations available through the
+ * {@link ServiceRegistry service provider mechanism}.
+ *
+ * <p>Detectors are loaded and returned in a specified order, of user supplied followed by
+ * non-MimeType Tika, followed by the Tika MimeType class. If you need to control the order of the
+ * Detectors, you should instead construct your own {@link CompositeDetector} and pass in the list
* of Detectors in the required order.
*
* @since Apache Tika 0.9
*/
public class DefaultDetector extends CompositeDetector {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -8170114575326908027L;
- private transient final ServiceLoader loader;
- public DefaultDetector(MimeTypes types, ServiceLoader loader,
- Collection<Class<? extends Detector>> excludeDetectors) {
+ private final transient ServiceLoader loader;
+
+ public DefaultDetector(
+ MimeTypes types,
+ ServiceLoader loader,
+ Collection<Class<? extends Detector>> excludeDetectors) {
super(types.getMediaTypeRegistry(), getDefaultDetectors(types, loader, excludeDetectors));
this.loader = loader;
}
@@ -72,25 +71,24 @@
}
/**
- * Finds all statically loadable detectors and sort the list by name,
- * rather than discovery order. Detectors are used in the given order,
- * so put the Tika parsers last so that non-Tika (user supplied)
- * parsers can take precedence.
- * <p>
- * If an {@link OverrideDetector} is loaded, it takes precedence over
- * all other detectors.
+ * Finds all statically loadable detectors and sort the list by name, rather than discovery
+ * order. Detectors are used in the given order, so put the Tika parsers last so that non-Tika
+ * (user supplied) parsers can take precedence.
+ *
+ * <p>If an {@link OverrideDetector} is loaded, it takes precedence over all other detectors.
*
* @param loader service loader
* @return ordered list of statically loadable detectors
*/
- private static List<Detector> getDefaultDetectors(MimeTypes types, ServiceLoader loader,
- Collection<Class<? extends Detector>>
- excludeDetectors) {
+ private static List<Detector> getDefaultDetectors(
+ MimeTypes types,
+ ServiceLoader loader,
+ Collection<Class<? extends Detector>> excludeDetectors) {
List<Detector> detectors =
loader.loadStaticServiceProviders(Detector.class, excludeDetectors);
ServiceLoaderUtils.sortLoadedClasses(detectors);
- //look for the override index and put that first
+ // look for the override index and put that first
int overrideIndex = -1;
int i = 0;
for (Detector detector : detectors) {
@@ -123,5 +121,4 @@
return super.getDetectors();
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
index 4cf64d5..347cd4d 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultEncodingDetector.java
@@ -19,19 +19,18 @@
import java.util.Collection;
import javax.imageio.spi.ServiceRegistry;
-
import org.apache.tika.config.ServiceLoader;
/**
- * A composite encoding detector based on all the {@link EncodingDetector} implementations
- * available through the {@link ServiceRegistry service provider mechanism}. Those
- * loaded via the service provider mechanism are ordered by how they appear in the
- * file, if there is a single service file. If multiple, there is no guarantee of order.
+ * A composite encoding detector based on all the {@link EncodingDetector} implementations available
+ * through the {@link ServiceRegistry service provider mechanism}. Those loaded via the service
+ * provider mechanism are ordered by how they appear in the file, if there is a single service file.
+ * If multiple, there is no guarantee of order.
+ *
* <p>
- * <p>
- * If you need to control the order of the Detectors, you should instead
- * construct your own {@link CompositeDetector} and pass in the list
- * of Detectors in the required order.
+ *
+ * <p>If you need to control the order of the Detectors, you should instead construct your own
+ * {@link CompositeDetector} and pass in the list of Detectors in the required order.
*
* @since Apache Tika 1.15
*/
@@ -45,10 +44,9 @@
super(loader.loadServiceProviders(EncodingDetector.class));
}
- public DefaultEncodingDetector(ServiceLoader loader,
- Collection<Class<? extends EncodingDetector>>
- excludeEncodingDetectors) {
+ public DefaultEncodingDetector(
+ ServiceLoader loader,
+ Collection<Class<? extends EncodingDetector>> excludeEncodingDetectors) {
super(loader.loadServiceProviders(EncodingDetector.class), excludeEncodingDetectors);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java b/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java
index b7df0b6..c4de066 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/DefaultProbDetector.java
@@ -17,22 +17,19 @@
package org.apache.tika.detect;
import java.util.List;
-
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.mime.ProbabilisticMimeDetectionSelector;
import org.apache.tika.utils.ServiceLoaderUtils;
/**
- * A version of {@link DefaultDetector} for probabilistic mime
- * detectors, which use statistical techniques to blend the
- * results of differing underlying detectors when attempting
- * to detect the type of a given file.
- * TODO Link to documentation on configuring these probabilities
+ * A version of {@link DefaultDetector} for probabilistic mime detectors, which use statistical
+ * techniques to blend the results of differing underlying detectors when attempting to detect the
+ * type of a given file. TODO Link to documentation on configuring these probabilities
*/
public class DefaultProbDetector extends CompositeDetector {
private static final long serialVersionUID = -8836240060532323352L;
- private transient final ServiceLoader loader;
+ private final transient ServiceLoader loader;
public DefaultProbDetector(ProbabilisticMimeDetectionSelector sel, ServiceLoader loader) {
super(sel.getMediaTypeRegistry(), getDefaultDetectors(sel, loader));
@@ -55,8 +52,8 @@
this(MimeTypes.getDefaultMimeTypes());
}
- private static List<Detector> getDefaultDetectors(ProbabilisticMimeDetectionSelector sel,
- ServiceLoader loader) {
+ private static List<Detector> getDefaultDetectors(
+ ProbabilisticMimeDetectionSelector sel, ServiceLoader loader) {
List<Detector> detectors = loader.loadStaticServiceProviders(Detector.class);
ServiceLoaderUtils.sortLoadedClasses(detectors);
detectors.add(sel);
diff --git a/tika-core/src/main/java/org/apache/tika/detect/Detector.java b/tika-core/src/main/java/org/apache/tika/detect/Detector.java
index fc237aa..ead745a 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/Detector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/Detector.java
@@ -19,41 +19,35 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
- * Content type detector. Implementations of this interface use various
- * heuristics to detect the content type of a document based on given
- * input metadata or the first few bytes of the document stream.
+ * Content type detector. Implementations of this interface use various heuristics to detect the
+ * content type of a document based on given input metadata or the first few bytes of the document
+ * stream.
*
* @since Apache Tika 0.3
*/
public interface Detector extends Serializable {
/**
- * Detects the content type of the given input document. Returns
- * <code>application/octet-stream</code> if the type of the document
- * can not be detected.
- * <p>
- * If the document input stream is not available, then the first
- * argument may be <code>null</code>. Otherwise the detector may
- * read bytes from the start of the stream to help in type detection.
- * The given stream is guaranteed to support the
- * {@link InputStream#markSupported() mark feature} and the detector
- * is expected to {@link InputStream#mark(int) mark} the stream before
- * reading any bytes from it, and to {@link InputStream#reset() reset}
- * the stream before returning. The stream must not be closed by the
- * detector.
- * <p>
- * The given input metadata is only read, not modified, by the detector.
+ * Detects the content type of the given input document. Returns <code>application/octet-stream
+ * </code> if the type of the document can not be detected.
*
- * @param input document input stream, or <code>null</code>
+ * <p>If the document input stream is not available, then the first argument may be <code>null
+ * </code>. Otherwise the detector may read bytes from the start of the stream to help in type
+ * detection. The given stream is guaranteed to support the {@link InputStream#markSupported()
+ * mark feature} and the detector is expected to {@link InputStream#mark(int) mark} the stream
+ * before reading any bytes from it, and to {@link InputStream#reset() reset} the stream before
+ * returning. The stream must not be closed by the detector.
+ *
+ * <p>The given input metadata is only read, not modified, by the detector.
+ *
+ * @param input document input stream, or <code>null</code>
* @param metadata input metadata for the document
* @return detected media type, or <code>application/octet-stream</code>
* @throws IOException if the document input stream could not be read
*/
MediaType detect(InputStream input, Metadata metadata) throws IOException;
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java b/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java
index 9f99630..c76d79e 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/EmptyDetector.java
@@ -18,22 +18,16 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-/**
- * Dummy detector that returns application/octet-stream for all documents.
- */
+/** Dummy detector that returns application/octet-stream for all documents. */
public class EmptyDetector implements Detector {
- /**
- * Singleton instance of this class.
- */
+ /** Singleton instance of this class. */
public static final EmptyDetector INSTANCE = new EmptyDetector();
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
return MediaType.OCTET_STREAM;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java
index 9dbad4c..be60018 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java
@@ -20,39 +20,35 @@
import java.io.InputStream;
import java.io.Serializable;
import java.nio.charset.Charset;
-
import org.apache.tika.metadata.Metadata;
/**
- * Character encoding detector. Implementations of this interface use
- * various heuristics to detect the character encoding of a text document
- * based on given input metadata or the first few bytes of the document stream.
+ * Character encoding detector. Implementations of this interface use various heuristics to detect
+ * the character encoding of a text document based on given input metadata or the first few bytes of
+ * the document stream.
*
* @since Apache Tika 0.4
*/
public interface EncodingDetector extends Serializable {
/**
- * Detects the character encoding of the given text document, or
- * <code>null</code> if the encoding of the document can not be detected.
- * <p>
- * If the document input stream is not available, then the first
- * argument may be <code>null</code>. Otherwise the detector may
- * read bytes from the start of the stream to help in encoding detection.
- * The given stream is guaranteed to support the
- * {@link InputStream#markSupported() mark feature} and the detector
- * is expected to {@link InputStream#mark(int) mark} the stream before
- * reading any bytes from it, and to {@link InputStream#reset() reset}
- * the stream before returning. The stream must not be closed by the
- * detector.
- * <p>
- * The given input metadata is only read, not modified, by the detector.
+ * Detects the character encoding of the given text document, or <code>null</code> if the
+ * encoding of the document can not be detected.
*
- * @param input text document input stream, or <code>null</code>
+ * <p>If the document input stream is not available, then the first argument may be <code>null
+ * </code>. Otherwise the detector may read bytes from the start of the stream to help in
+ * encoding detection. The given stream is guaranteed to support the {@link
+ * InputStream#markSupported() mark feature} and the detector is expected to {@link
+ * InputStream#mark(int) mark} the stream before reading any bytes from it, and to {@link
+ * InputStream#reset() reset} the stream before returning. The stream must not be closed by the
+ * detector.
+ *
+ * <p>The given input metadata is only read, not modified, by the detector.
+ *
+ * @param input text document input stream, or <code>null</code>
* @param metadata input metadata for the document
* @return detected character encoding, or <code>null</code>
* @throws IOException if the document input stream could not be read
*/
Charset detect(InputStream input, Metadata metadata) throws IOException;
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
index 42349fa..a3a7211 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java
@@ -22,10 +22,6 @@
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.config.Field;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TemporaryResources;
@@ -38,26 +34,27 @@
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * This runs the linux 'file' command against a file. If
- * this is called on a TikaInputStream, it will use the underlying Path
- * or spool the full file to disk and then run file against that.
- * <p>
- * If this is run against any other type of InputStream, it will spool
- * up to {@link #maxBytes} to disk and then run the detector.
- * <p>
- * As with all detectors, mark must be supported.
- * <p>
- * If you want to use file's mime type in the parse, e.g.
- * to select the parser in AutoDetectParser, set {@link FileCommandDetector#setUseMime(boolean)}
- * to true. The default behavior is to store the value as {@link FileCommandDetector#FILE_MIME}
- * but rely on other detectors for the "active" mime used by Tika.
+ * This runs the linux 'file' command against a file. If this is called on a TikaInputStream, it
+ * will use the underlying Path or spool the full file to disk and then run file against that.
+ *
+ * <p>If this is run against any other type of InputStream, it will spool up to {@link #maxBytes} to
+ * disk and then run the detector.
+ *
+ * <p>As with all detectors, mark must be supported.
+ *
+ * <p>If you want to use file's mime type in the parse, e.g. to select the parser in
+ * AutoDetectParser, set {@link FileCommandDetector#setUseMime(boolean)} to true. The default
+ * behavior is to store the value as {@link FileCommandDetector#FILE_MIME} but rely on other
+ * detectors for the "active" mime used by Tika.
*/
public class FileCommandDetector implements Detector {
- //TODO: file has some diff mimes names for some very common mimes
- //should we map file mimes to Tika mimes, e.g. text/xml -> application/xml??
+ // TODO: file has some diff mimes names for some very common mimes
+ // should we map file mimes to Tika mimes, e.g. text/xml -> application/xml??
public static Property FILE_MIME = Property.externalText("file:mime");
private static final Logger LOGGER = LoggerFactory.getLogger(FileCommandDetector.class);
@@ -75,14 +72,13 @@
return checkHasFile(DEFAULT_FILE_COMMAND_PATH);
}
-
public static boolean checkHasFile(String fileCommandPath) {
- String[] commandline = new String[]{fileCommandPath, "-v"};
+ String[] commandline = new String[] {fileCommandPath, "-v"};
return ExternalParser.check(commandline);
}
/**
- * @param input document input stream, or <code>null</code>
+ * @param input document input stream, or <code>null</code>
* @param metadata input metadata for the document
* @return mime as identified by the file command or application/octet-stream otherwise
* @throws IOException
@@ -101,8 +97,8 @@
}
TikaInputStream tis = TikaInputStream.cast(input);
if (tis != null) {
- //spool the full file to disk, if called with a TikaInputStream
- //and there is no underlying file
+ // spool the full file to disk, if called with a TikaInputStream
+ // and there is no underlying file
return detectOnPath(tis.getPath(), metadata);
}
@@ -119,8 +115,12 @@
private MediaType detectOnPath(Path path, Metadata metadata) throws IOException {
String[] args =
- new String[]{ProcessUtils.escapeCommandLine(fileCommandPath), "-b", "--mime-type",
- ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString())};
+ new String[] {
+ ProcessUtils.escapeCommandLine(fileCommandPath),
+ "-b",
+ "--mime-type",
+ ProcessUtils.escapeCommandLine(path.toAbsolutePath().toString())
+ };
ProcessBuilder builder = new ProcessBuilder(args);
FileProcessResult result = ProcessUtils.execute(builder, timeoutMs, 10000, 10000);
if (result.isTimeout()) {
@@ -149,8 +149,8 @@
@Field
public void setFilePath(String fileCommandPath) {
- //this opens up a potential command vulnerability.
- //Don't ever let an untrusted user set this.
+ // this opens up a potential command vulnerability.
+ // Don't ever let an untrusted user set this.
this.fileCommandPath = fileCommandPath;
checkHasFile(this.fileCommandPath);
}
@@ -163,10 +163,10 @@
public boolean isUseMime() {
return useMime;
}
+
/**
- * If this is not called on a TikaInputStream, this detector
- * will spool up to this many bytes to a file to be detected
- * by the 'file' command.
+ * If this is not called on a TikaInputStream, this detector will spool up to this many bytes to
+ * a file to be detected by the 'file' command.
*
* @param maxBytes
*/
@@ -179,5 +179,4 @@
public void setTimeoutMs(long timeoutMs) {
this.timeoutMs = timeoutMs;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
index bb9ec1d..b8d5205 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
@@ -27,75 +27,72 @@
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
- * Content type detection based on magic bytes, i.e. type-specific patterns
- * near the beginning of the document input stream.
- * <p>
- * Because this works on bytes, not characters, by default any string
- * matching is done as ISO_8859_1. To use an explicit different
- * encoding, supply a type other than "string" / "stringignorecase"
+ * Content type detection based on magic bytes, i.e. type-specific patterns near the beginning of
+ * the document input stream.
+ *
+ * <p>Because this works on bytes, not characters, by default any string matching is done as
+ * ISO_8859_1. To use an explicit different encoding, supply a type other than "string" /
+ * "stringignorecase"
*
* @since Apache Tika 0.3
*/
public class MagicDetector implements Detector {
/**
- * The matching media type. Returned by the
- * {@link #detect(InputStream, Metadata)} method if a match is found.
+ * The matching media type. Returned by the {@link #detect(InputStream, Metadata)} method if a
+ * match is found.
*/
private final MediaType type;
- /**
- * Length of the comparison window.
- */
+
+ /** Length of the comparison window. */
private final int length;
+
/**
- * The magic match pattern. If this byte pattern is equal to the
- * possibly bit-masked bytes from the input stream, then the type
- * detection succeeds and the configured {@link #type} is returned.
+ * The magic match pattern. If this byte pattern is equal to the possibly bit-masked bytes from
+ * the input stream, then the type detection succeeds and the configured {@link #type} is
+ * returned.
*/
private final byte[] pattern;
+
/**
- * Length of the pattern, which in the case of regular expressions will
- * not be the same as the comparison window length.
+ * Length of the pattern, which in the case of regular expressions will not be the same as the
+ * comparison window length.
*/
private final int patternLength;
- /**
- * True if pattern is a regular expression, false otherwise.
- */
+
+ /** True if pattern is a regular expression, false otherwise. */
private final boolean isRegex;
- /**
- * True if we're doing a case-insensitive string match, false otherwise.
- */
+
+ /** True if we're doing a case-insensitive string match, false otherwise. */
private final boolean isStringIgnoreCase;
- /**
- * Bit mask that is applied to the source bytes before pattern matching.
- */
+
+ /** Bit mask that is applied to the source bytes before pattern matching. */
private final byte[] mask;
+
/**
- * First offset (inclusive) of the comparison window within the
- * document input stream. Greater than or equal to zero.
+ * First offset (inclusive) of the comparison window within the document input stream. Greater
+ * than or equal to zero.
*/
private final int offsetRangeBegin;
+
/**
- * Last offset (inclusive) of the comparison window within the document
- * input stream. Greater than or equal to the
- * {@link #offsetRangeBegin first offset}.
- * <p>
- * Note that this is <em>not</em> the offset of the last byte read from
- * the document stream. Instead, the last window of bytes to be compared
- * starts at this offset.
+ * Last offset (inclusive) of the comparison window within the document input stream. Greater
+ * than or equal to the {@link #offsetRangeBegin first offset}.
+ *
+ * <p>Note that this is <em>not</em> the offset of the last byte read from the document stream.
+ * Instead, the last window of bytes to be compared starts at this offset.
*/
private final int offsetRangeEnd;
/**
- * Creates a detector for input documents that have the exact given byte
- * pattern at the beginning of the document stream.
+ * Creates a detector for input documents that have the exact given byte pattern at the
+ * beginning of the document stream.
*
- * @param type matching media type
+ * @param type matching media type
* @param pattern magic match pattern
*/
public MagicDetector(MediaType type, byte[] pattern) {
@@ -103,42 +100,46 @@
}
/**
- * Creates a detector for input documents that have the exact given byte
- * pattern at the given offset of the document stream.
+ * Creates a detector for input documents that have the exact given byte pattern at the given
+ * offset of the document stream.
*
- * @param type matching media type
+ * @param type matching media type
* @param pattern magic match pattern
- * @param offset offset of the pattern match
+ * @param offset offset of the pattern match
*/
public MagicDetector(MediaType type, byte[] pattern, int offset) {
this(type, pattern, null, offset, offset);
}
/**
- * Creates a detector for input documents that meet the specified magic
- * match. {@code pattern} must NOT be a regular expression.
- * Constructor maintained for legacy reasons.
+ * Creates a detector for input documents that meet the specified magic match. {@code pattern}
+ * must NOT be a regular expression. Constructor maintained for legacy reasons.
*/
- public MagicDetector(MediaType type, byte[] pattern, byte[] mask, int offsetRangeBegin,
- int offsetRangeEnd) {
+ public MagicDetector(
+ MediaType type, byte[] pattern, byte[] mask, int offsetRangeBegin, int offsetRangeEnd) {
this(type, pattern, mask, false, offsetRangeBegin, offsetRangeEnd);
}
- /**
- * Creates a detector for input documents that meet the specified
- * magic match.
- */
- public MagicDetector(MediaType type, byte[] pattern, byte[] mask, boolean isRegex,
- int offsetRangeBegin, int offsetRangeEnd) {
+ /** Creates a detector for input documents that meet the specified magic match. */
+ public MagicDetector(
+ MediaType type,
+ byte[] pattern,
+ byte[] mask,
+ boolean isRegex,
+ int offsetRangeBegin,
+ int offsetRangeEnd) {
this(type, pattern, mask, isRegex, false, offsetRangeBegin, offsetRangeEnd);
}
- /**
- * Creates a detector for input documents that meet the specified
- * magic match.
- */
- public MagicDetector(MediaType type, byte[] pattern, byte[] mask, boolean isRegex,
- boolean isStringIgnoreCase, int offsetRangeBegin, int offsetRangeEnd) {
+ /** Creates a detector for input documents that meet the specified magic match. */
+ public MagicDetector(
+ MediaType type,
+ byte[] pattern,
+ byte[] mask,
+ boolean isRegex,
+ boolean isStringIgnoreCase,
+ int offsetRangeBegin,
+ int offsetRangeEnd) {
if (type == null) {
throw new IllegalArgumentException("Matching media type is null");
} else if (pattern == null) {
@@ -183,8 +184,8 @@
this.offsetRangeEnd = offsetRangeEnd;
}
- public static MagicDetector parse(MediaType mediaType, String type, String offset, String value,
- String mask) {
+ public static MagicDetector parse(
+ MediaType mediaType, String type, String offset, String value, String mask) {
int start = 0;
int end = 0;
if (offset != null) {
@@ -204,8 +205,14 @@
maskBytes = decodeValue(mask, type);
}
- return new MagicDetector(mediaType, patternBytes, maskBytes, type.equals("regex"),
- type.equals("stringignorecase"), start, end);
+ return new MagicDetector(
+ mediaType,
+ patternBytes,
+ maskBytes,
+ type.equals("regex"),
+ type.equals("stringignorecase"),
+ start,
+ end);
}
private static byte[] decodeValue(String value, String type) {
@@ -241,29 +248,43 @@
decoded = tmpVal.getBytes(UTF_8);
break;
case "host16":
- case "little16": {
- int i = Integer.parseInt(tmpVal, radix);
- decoded = new byte[]{(byte) (i & 0x00FF), (byte) (i >> 8)};
- break;
- }
- case "big16": {
- int i = Integer.parseInt(tmpVal, radix);
- decoded = new byte[]{(byte) (i >> 8), (byte) (i & 0x00FF)};
- break;
- }
+ case "little16":
+ {
+ int i = Integer.parseInt(tmpVal, radix);
+ decoded = new byte[] {(byte) (i & 0x00FF), (byte) (i >> 8)};
+ break;
+ }
+ case "big16":
+ {
+ int i = Integer.parseInt(tmpVal, radix);
+ decoded = new byte[] {(byte) (i >> 8), (byte) (i & 0x00FF)};
+ break;
+ }
case "host32":
- case "little32": {
- long i = Long.parseLong(tmpVal, radix);
- decoded = new byte[]{(byte) ((i & 0x000000FF)), (byte) ((i & 0x0000FF00) >> 8),
- (byte) ((i & 0x00FF0000) >> 16), (byte) ((i & 0xFF000000) >> 24)};
- break;
- }
- case "big32": {
- long i = Long.parseLong(tmpVal, radix);
- decoded = new byte[]{(byte) ((i & 0xFF000000) >> 24), (byte) ((i & 0x00FF0000) >> 16),
- (byte) ((i & 0x0000FF00) >> 8), (byte) ((i & 0x000000FF))};
- break;
- }
+ case "little32":
+ {
+ long i = Long.parseLong(tmpVal, radix);
+ decoded =
+ new byte[] {
+ (byte) ((i & 0x000000FF)),
+ (byte) ((i & 0x0000FF00) >> 8),
+ (byte) ((i & 0x00FF0000) >> 16),
+ (byte) ((i & 0xFF000000) >> 24)
+ };
+ break;
+ }
+ case "big32":
+ {
+ long i = Long.parseLong(tmpVal, radix);
+ decoded =
+ new byte[] {
+ (byte) ((i & 0xFF000000) >> 24),
+ (byte) ((i & 0x00FF0000) >> 16),
+ (byte) ((i & 0x0000FF00) >> 8),
+ (byte) ((i & 0x000000FF))
+ };
+ break;
+ }
}
return decoded;
}
@@ -295,8 +316,9 @@
i++;
} else {
int j = i + 1;
- while ((j < i + 4) && (j < value.length()) &&
- (Character.isDigit(value.charAt(j)))) {
+ while ((j < i + 4)
+ && (j < value.length())
+ && (Character.isDigit(value.charAt(j)))) {
j++;
}
decoded.write(Short.decode("0" + value.substring(i + 1, j)).byteValue());
@@ -333,7 +355,7 @@
}
/**
- * @param input document input stream, or <code>null</code>
+ * @param input document input stream, or <code>null</code>
* @param metadata ignored
*/
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
@@ -425,13 +447,18 @@
}
/**
- * Returns a string representation of the Detection Rule.
- * Should sort nicely by type and details, as we sometimes
- * compare these.
+ * Returns a string representation of the Detection Rule. Should sort nicely by type and
+ * details, as we sometimes compare these.
*/
public String toString() {
// Needs to be unique, as these get compared.
- return "Magic Detection for " + type + " looking for " + pattern.length + " bytes = " +
- this.pattern + " mask = " + this.mask;
+ return "Magic Detection for "
+ + type
+ + " looking for "
+ + pattern.length
+ + " bytes = "
+ + this.pattern
+ + " mask = "
+ + this.mask;
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java b/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
index bcbf48f..76fe42d 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/NNExampleModelDetector.java
@@ -27,12 +27,10 @@
import java.net.URL;
import java.nio.file.Path;
import java.util.Objects;
-
+import org.apache.tika.mime.MediaType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.tika.mime.MediaType;
-
public class NNExampleModelDetector extends TrainedModelDetector {
private static final String EXAMPLE_NNMODEL_FILE = "tika-example.nnmodel";
@@ -68,16 +66,13 @@
// add this model into map of trained models.
super.registerModels(nnBuilder.getType(), nnBuilder.build());
}
-
}
} catch (IOException e) {
throw new RuntimeException("Unable to read the default media type registry", e);
}
}
- /**
- * this method gets overwritten to register load neural network models
- */
+ /** this method gets overwritten to register load neural network models */
@Override
public void loadDefaultModels(ClassLoader classLoader) {
if (classLoader == null) {
@@ -91,22 +86,20 @@
// Get the core URL, and all the extensions URLs
URL modelURL = classLoader.getResource(classPrefix + EXAMPLE_NNMODEL_FILE);
- Objects.requireNonNull(modelURL,
- "required resource " + classPrefix + EXAMPLE_NNMODEL_FILE + " not found");
+ Objects.requireNonNull(
+ modelURL, "required resource " + classPrefix + EXAMPLE_NNMODEL_FILE + " not found");
try (InputStream stream = modelURL.openStream()) {
loadDefaultModels(stream);
} catch (IOException e) {
throw new RuntimeException("Unable to read the default media type registry", e);
}
-
}
/**
- * read the comments where the model configuration is written, e.g the
- * number of inputs, hiddens and output please ensure the first char in the
- * given string is # In this example grb model file, there are 4 elements 1)
- * type 2) number of input units 3) number of hidden units. 4) number of
- * output units.
+ * read the comments where the model configuration is written, e.g the number of inputs, hiddens
+ * and output please ensure the first char in the given string is # In this example grb model
+ * file, there are 4 elements 1) type 2) number of input units 3) number of hidden units. 4)
+ * number of output units.
*/
private void readDescription(final NNTrainedModelBuilder builder, final String line) {
int numInputs;
@@ -130,8 +123,8 @@
}
/**
- * Read the next line for the model parameters and populate the build which
- * later will be used to instantiate the instance of TrainedModel
+ * Read the next line for the model parameters and populate the build which later will be used
+ * to instantiate the instance of TrainedModel
*
* @param builder
* @param line
diff --git a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java
index 73ee560..c0f06e9 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModel.java
@@ -25,8 +25,8 @@
private final float[][] Theta1;
private final float[][] Theta2;
- public NNTrainedModel(final int nInput, final int nHidden, final int nOutput,
- final float[] nn_params) {
+ public NNTrainedModel(
+ final int nInput, final int nHidden, final int nOutput, final float[] nn_params) {
this.numOfInputs = nInput;
this.numOfHidden = nHidden;
this.numOfOutputs = nOutput;
@@ -64,8 +64,7 @@
}
/**
- * The given input vector of unseen is m=(256 + 1) * n= 1 this returns a
- * prediction probability
+ * The given input vector of unseen is m=(256 + 1) * n= 1 this returns a prediction probability
*/
@Override
public float predict(float[] unseen) {
@@ -74,7 +73,7 @@
int i, j;
int m = this.Theta1.length;
int n = this.Theta1[0].length;
- float[] hh = new float[m + 1];// hidden unit summation
+ float[] hh = new float[m + 1]; // hidden unit summation
hh[0] = 1;
for (i = 0; i < m; i++) {
double h = 0;
diff --git a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java
index 9b4eab3..f710a9d 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/NNTrainedModelBuilder.java
@@ -15,7 +15,6 @@
* limitations under the License.
*/
-
package org.apache.tika.detect;
import org.apache.tika.mime.MediaType;
diff --git a/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java b/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
index 36d01e1..2f49ec3 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/NameDetector.java
@@ -23,37 +23,33 @@
import java.net.URLDecoder;
import java.util.Map;
import java.util.regex.Pattern;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
/**
- * Content type detection based on the resource name. An instance of this
- * class contains a set of regular expression patterns that are matched
- * against the resource name potentially given as a part of the input metadata.
- * <p>
- * If a pattern matches the given name, then the media type associated with
- * that pattern is returned as the likely content type of the input document.
- * Otherwise the returned type is <code>application/octet-stream</code>.
- * <p>
- * See the {@link #detect(InputStream, Metadata)} method for more details
- * of the matching algorithm.
+ * Content type detection based on the resource name. An instance of this class contains a set of
+ * regular expression patterns that are matched against the resource name potentially given as a
+ * part of the input metadata.
+ *
+ * <p>If a pattern matches the given name, then the media type associated with that pattern is
+ * returned as the likely content type of the input document. Otherwise the returned type is <code>
+ * application/octet-stream</code>.
+ *
+ * <p>See the {@link #detect(InputStream, Metadata)} method for more details of the matching
+ * algorithm.
*
* @since Apache Tika 0.3
*/
public class NameDetector implements Detector {
- /**
- * The regular expression patterns used for type detection.
- */
+ /** The regular expression patterns used for type detection. */
private final Map<Pattern, MediaType> patterns;
/**
- * Creates a new content type detector based on the given name patterns.
- * The given pattern map is not copied, so the caller may update the
- * mappings even after this detector instance has been created. However,
- * the map <em>must not be concurrently modified</em> while this instance
+ * Creates a new content type detector based on the given name patterns. The given pattern map
+ * is not copied, so the caller may update the mappings even after this detector instance has
+ * been created. However, the map <em>must not be concurrently modified</em> while this instance
* is used for type detection.
*
* @param patterns map from name patterns to corresponding media types
@@ -63,34 +59,25 @@
}
/**
- * Detects the content type of an input document based on the document
- * name given in the input metadata. The RESOURCE_NAME_KEY attribute of
- * the given input metadata is expected to contain the name (normally
- * a file name or a URL) of the input document.
- * <p>
- * If a resource name is given, then it is first processed as follows.
- * <ol>
- * <li>
- * Potential URL query (?...) and fragment identifier (#...)
- * parts are removed from the end of the resource name.
- * </li>
- * <li>
- * Potential leading path elements (up to the last slash or backslash)
- * are removed from the beginning of the resource name.
- * </li>
- * <li>
- * Potential URL encodings (%nn, in UTF-8) are decoded.
- * </li>
- * <li>
- * Any leading and trailing whitespace is removed.
- * </li>
- * </ol>
- * <p>
- * The resulting name string (if any) is then matched in sequence against
- * all the configured name patterns. If a match is found, then the (first)
- * matching media type is returned.
+ * Detects the content type of an input document based on the document name given in the input
+ * metadata. The RESOURCE_NAME_KEY attribute of the given input metadata is expected to contain
+ * the name (normally a file name or a URL) of the input document.
*
- * @param input ignored
+ * <p>If a resource name is given, then it is first processed as follows.
+ *
+ * <ol>
+ * <li>Potential URL query (?...) and fragment identifier (#...) parts are removed from the
+ * end of the resource name.
+ * <li>Potential leading path elements (up to the last slash or backslash) are removed from
+ * the beginning of the resource name.
+ * <li>Potential URL encodings (%nn, in UTF-8) are decoded.
+ * <li>Any leading and trailing whitespace is removed.
+ * </ol>
+ *
+ * <p>The resulting name string (if any) is then matched in sequence against all the configured
+ * name patterns. If a match is found, then the (first) matching media type is returned.
+ *
+ * @param input ignored
* @param metadata input metadata, possibly with a RESOURCE_NAME_KEY value
* @return detected media type, or <code>application/octet-stream</code>
*/
@@ -147,5 +134,4 @@
return MediaType.OCTET_STREAM;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java
index 896a795..a473cff 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/NonDetectingEncodingDetector.java
@@ -21,24 +21,17 @@
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
-
import org.apache.tika.config.Field;
import org.apache.tika.metadata.Metadata;
-/**
- * Always returns the charset passed in via the initializer
- */
+/** Always returns the charset passed in via the initializer */
public class NonDetectingEncodingDetector implements EncodingDetector {
- //would have preferred final, but need mutability for
- //loading via TikaConfig
+ // would have preferred final, but need mutability for
+ // loading via TikaConfig
private Charset charset = StandardCharsets.UTF_8;
- /**
- * Sets charset to UTF-8.
- */
- public NonDetectingEncodingDetector() {
-
- }
+ /** Sets charset to UTF-8. */
+ public NonDetectingEncodingDetector() {}
public NonDetectingEncodingDetector(Charset charset) {
this.charset = charset;
diff --git a/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java b/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java
index b6c5a41..ee2944d 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/OverrideDetector.java
@@ -18,17 +18,16 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
/**
- * Use this to force a content type detection via the
- * {@link TikaCoreProperties#CONTENT_TYPE_USER_OVERRIDE} key in the metadata object.
- * <p>
- * This is also required to override detection by some parsers
- * via {@link TikaCoreProperties#CONTENT_TYPE_PARSER_OVERRIDE}.
+ * Use this to force a content type detection via the {@link
+ * TikaCoreProperties#CONTENT_TYPE_USER_OVERRIDE} key in the metadata object.
+ *
+ * <p>This is also required to override detection by some parsers via {@link
+ * TikaCoreProperties#CONTENT_TYPE_PARSER_OVERRIDE}.
*
* @deprecated after 2.5.0 this functionality was moved to the CompositeDetector
*/
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
index 96583be..8cfad3f 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TextDetector.java
@@ -19,46 +19,39 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
- * Content type detection of plain text documents. This detector looks at the
- * beginning of the document input stream and considers the document to be
- * a text document if no ASCII (ISO-Latin-1, UTF-8, etc.) control bytes are
- * found. As a special case some control bytes (up to 2% of all characters)
- * are also allowed in a text document if it also contains no or just a few
- * (less than 10%) characters above the 7-bit ASCII range.
- * <p>
- * Note that text documents with a character encoding like UTF-16 are better
- * detected with {@link MagicDetector} and an appropriate magic byte pattern.
+ * Content type detection of plain text documents. This detector looks at the beginning of the
+ * document input stream and considers the document to be a text document if no ASCII (ISO-Latin-1,
+ * UTF-8, etc.) control bytes are found. As a special case some control bytes (up to 2% of all
+ * characters) are also allowed in a text document if it also contains no or just a few (less than
+ * 10%) characters above the 7-bit ASCII range.
+ *
+ * <p>Note that text documents with a character encoding like UTF-16 are better detected with {@link
+ * MagicDetector} and an appropriate magic byte pattern.
*
* @since Apache Tika 0.3
*/
public class TextDetector implements Detector {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 4774601079503507765L;
- /**
- * The number of bytes from the beginning of the document stream
- * to test for control bytes.
- */
+ /** The number of bytes from the beginning of the document stream to test for control bytes. */
private static final int DEFAULT_NUMBER_OF_BYTES_TO_TEST = 512;
/**
- * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes
- * in the range below 0x20 (the space character). If an entry in this
- * table is <code>true</code> then that byte is very unlikely to occur
- * in a plain text document.
- * <p>
- * The contents of this lookup table are based on the following definition
- * from section 4 of the "Content-Type Processing Model" Internet-draft
- * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
+ * Lookup table for all the ASCII/ISO-Latin/UTF-8/etc. control bytes in the range below 0x20
+ * (the space character). If an entry in this table is <code>true</code> then that byte is very
+ * unlikely to occur in a plain text document.
+ *
+ * <p>The contents of this lookup table are based on the following definition from section 4 of
+ * the "Content-Type Processing Model" Internet-draft (<a
+ * href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
* >draft-abarth-mime-sniff-01</a>).
+ *
* <pre>
* +-------------------------+
* | Binary data byte ranges |
@@ -86,29 +79,29 @@
private final int bytesToTest;
/**
- * Constructs a {@link TextDetector} which will look at the default number
- * of bytes from the beginning of the document.
+ * Constructs a {@link TextDetector} which will look at the default number of bytes from the
+ * beginning of the document.
*/
public TextDetector() {
this(DEFAULT_NUMBER_OF_BYTES_TO_TEST);
}
/**
- * Constructs a {@link TextDetector} which will look at a given number of
- * bytes from the beginning of the document.
+ * Constructs a {@link TextDetector} which will look at a given number of bytes from the
+ * beginning of the document.
*/
public TextDetector(int bytesToTest) {
this.bytesToTest = bytesToTest;
}
/**
- * Looks at the beginning of the document input stream to determine
- * whether the document is text or not.
+ * Looks at the beginning of the document input stream to determine whether the document is text
+ * or not.
*
- * @param input document input stream, or <code>null</code>
+ * @param input document input stream, or <code>null</code>
* @param metadata ignored
- * @return "text/plain" if the input stream suggest a text document,
- * "application/octet-stream" otherwise
+ * @return "text/plain" if the input stream suggest a text document, "application/octet-stream"
+ * otherwise
*/
public MediaType detect(InputStream input, Metadata metadata) throws IOException {
if (input == null) {
@@ -137,5 +130,4 @@
input.reset();
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
index 50f8d79..29252cc 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TextStatistics.java
@@ -35,11 +35,11 @@
}
/**
- * Checks whether at least one byte was seen and that the bytes that
- * were seen were mostly plain text (i.e. < 2% control, > 90% ASCII range).
+ * Checks whether at least one byte was seen and that the bytes that were seen were mostly plain
+ * text (i.e. < 2% control, > 90% ASCII range).
*
- * @return <code>true</code> if the seen bytes were mostly safe ASCII,
- * <code>false</code> otherwise
+ * @return <code>true</code> if the seen bytes were mostly safe ASCII, <code>false</code>
+ * otherwise
* @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
* @see <a href="https://issues.apache.org/jira/browse/TIKA-688">TIKA-688</a>
*/
@@ -53,8 +53,7 @@
/**
* Checks whether the observed byte stream looks like UTF-8 encoded text.
*
- * @return <code>true</code> if the seen bytes look like UTF-8,
- * <code>false</code> otherwise
+ * @return <code>true</code> if the seen bytes look like UTF-8, <code>false</code> otherwise
* @since Apache Tika 1.3
*/
public boolean looksLikeUTF8() {
@@ -63,16 +62,18 @@
int safe = countSafeControl();
int expectedContinuation = 0;
- int[] leading = new int[]{count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8)};
+ int[] leading = new int[] {count(0xc0, 0xe0), count(0xe0, 0xf0), count(0xf0, 0xf8)};
for (int i = 0; i < leading.length; i++) {
utf8 += leading[i];
expectedContinuation += (i + 1) * leading[i];
}
int continuation = count(0x80, 0xc0);
- return utf8 > 0 && continuation <= expectedContinuation &&
- continuation >= expectedContinuation - 3 && count(0xf8, 0x100) == 0 &&
- (control - safe) * 100 < utf8 * 2;
+ return utf8 > 0
+ && continuation <= expectedContinuation
+ && continuation >= expectedContinuation - 3
+ && count(0xf8, 0x100) == 0
+ && (control - safe) * 100 < utf8 * 2;
}
/**
@@ -95,13 +96,13 @@
}
/**
- * Counts control characters (i.e. < 0x20, excluding tab, CR, LF,
- * page feed and escape).
- * <p>
- * This definition of control characters is based on section 4 of the
- * "Content-Type Processing Model" Internet-draft
- * (<a href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
+ * Counts control characters (i.e. < 0x20, excluding tab, CR, LF, page feed and escape).
+ *
+ * <p>This definition of control characters is based on section 4 of the "Content-Type
+ * Processing Model" Internet-draft (<a
+ * href="http://webblaze.cs.berkeley.edu/2009/mime-sniff/mime-sniff.txt"
* >draft-abarth-mime-sniff-01</a>).
+ *
* <pre>
* +-------------------------+
* | Binary data byte ranges |
@@ -149,8 +150,10 @@
}
private int countSafeControl() {
- return count('\t') + count('\n') + count('\r') // tab, LF, CR
- + count(0x0c) + count(0x1b); // new page, escape
+ return count('\t')
+ + count('\n')
+ + count('\r') // tab, LF, CR
+ + count(0x0c)
+ + count(0x1b); // new page, escape
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java b/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java
index 0111b23..6d725ee 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TrainedModel.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.detect;
-
public abstract class TrainedModel {
public abstract double predict(double[] input);
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
index 25b9f08..170ed0b 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TrainedModelDetector.java
@@ -31,7 +31,6 @@
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
-
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
diff --git a/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java b/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
index 60d75c7..d10ef00 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/TypeDetector.java
@@ -17,27 +17,25 @@
package org.apache.tika.detect;
import java.io.InputStream;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
- * Content type detection based on a content type hint. This detector simply
- * trusts any valid content type hint given in the input metadata, and returns
- * that as the likely type of the input document.
+ * Content type detection based on a content type hint. This detector simply trusts any valid
+ * content type hint given in the input metadata, and returns that as the likely type of the input
+ * document.
*
* @since Apache Tika 0.3
*/
public class TypeDetector implements Detector {
/**
- * Detects the content type of an input document based on a type hint
- * given in the input metadata. The CONTENT_TYPE attribute of the given
- * input metadata is expected to contain the type of the input document.
- * If that attribute exists and contains a valid type name, then that
- * type is returned.
+ * Detects the content type of an input document based on a type hint given in the input
+ * metadata. The CONTENT_TYPE attribute of the given input metadata is expected to contain the
+ * type of the input document. If that attribute exists and contains a valid type name, then
+ * that type is returned.
*
- * @param input ignored
+ * @param input ignored
* @param metadata input metadata, possibly with a CONTENT_TYPE value
* @return detected media type, or <code>application/octet-stream</code>
*/
@@ -52,5 +50,4 @@
}
return MediaType.OCTET_STREAM;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java b/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
index 94d85314..83f73e1 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/XmlRootExtractor.java
@@ -20,19 +20,17 @@
import java.io.InputStream;
import java.util.Arrays;
import javax.xml.namespace.QName;
-
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.utils.XMLReaderUtils;
-
/**
- * Utility class that uses a {@link javax.xml.parsers.SAXParser} to determine
- * the namespace URI and local name of the root element of an XML file.
+ * Utility class that uses a {@link javax.xml.parsers.SAXParser} to determine the namespace URI and
+ * local name of the root element of an XML file.
*
* @since Apache Tika 0.4
*/
@@ -66,17 +64,17 @@
public QName extractRootElement(InputStream stream) {
return extractRootElement(stream, false);
}
-
+
private QName extractRootElement(InputStream stream, boolean throwMalformed) {
ExtractorHandler handler = new ExtractorHandler();
try {
- XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(stream),
- handler, EMPTY_CONTEXT);
+ XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(stream), handler, EMPTY_CONTEXT);
} catch (SecurityException e) {
throw e;
} catch (Exception e) {
- if (throwMalformed && (e instanceof CharConversionException
- || e.getCause() instanceof CharConversionException)) {
+ if (throwMalformed
+ && (e instanceof CharConversionException
+ || e.getCause() instanceof CharConversionException)) {
throw new MalformedCharException(e);
}
}
@@ -93,7 +91,6 @@
this.rootElement = new QName(uri, local);
throw new SAXException("Aborting: root element received");
}
-
}
private static class MalformedCharException extends RuntimeException {
@@ -101,7 +98,5 @@
public MalformedCharException(Exception e) {
super(e);
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java b/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java
index 5ce5268..86f4917 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/ZeroSizeFileDetector.java
@@ -18,13 +18,10 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
-/**
- * Detector to identify zero length files as application/x-zerovalue
- */
+/** Detector to identify zero length files as application/x-zerovalue */
public class ZeroSizeFileDetector implements Detector {
public MediaType detect(InputStream stream, Metadata metadata) throws IOException {
if (stream != null) {
diff --git a/tika-core/src/main/java/org/apache/tika/detect/package-info.java b/tika-core/src/main/java/org/apache/tika/detect/package-info.java
index dede49c..b04a9d1 100644
--- a/tika-core/src/main/java/org/apache/tika/detect/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/detect/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * Media type detection.
- */
+/** Media type detection. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.detect;
diff --git a/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java b/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
index 2af59d3..d1bead9 100644
--- a/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
+++ b/tika-core/src/main/java/org/apache/tika/embedder/Embedder.java
@@ -21,7 +21,6 @@
import java.io.OutputStream;
import java.io.Serializable;
import java.util.Set;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -36,11 +35,11 @@
public interface Embedder extends Serializable {
/**
- * Returns the set of media types supported by this embedder when used with
- * the given parse context.
- * <p>
- * The name differs from the precedence of {@link Parser#getSupportedTypes(ParseContext)}
- * so that parser implementations may also choose to implement this interface.
+ * Returns the set of media types supported by this embedder when used with the given parse
+ * context.
+ *
+ * <p>The name differs from the precedence of {@link Parser#getSupportedTypes(ParseContext)} so
+ * that parser implementations may also choose to implement this interface.
*
* @param context parse context
* @return immutable set of media types
@@ -48,46 +47,45 @@
Set<MediaType> getSupportedEmbedTypes(ParseContext context);
/**
- * Embeds related document metadata from the given metadata object into the
- * given output stream.
- * <p>
- * The given document stream is consumed but not closed by this method. The
- * responsibility to close the stream remains on the caller.
- * <p>
- * Information about the parsing context can be passed in the context
- * parameter. See the parser implementations for the kinds of context
- * information they expect.
- * <p>
- * In general implementations should favor preserving the source file's metadata
- * unless an update to a field is explicitly defined in the Metadata object.
- * More specifically:
+ * Embeds related document metadata from the given metadata object into the given output stream.
+ *
+ * <p>The given document stream is consumed but not closed by this method. The responsibility to
+ * close the stream remains on the caller.
+ *
+ * <p>Information about the parsing context can be passed in the context parameter. See the
+ * parser implementations for the kinds of context information they expect.
+ *
+ * <p>In general implementations should favor preserving the source file's metadata unless an
+ * update to a field is explicitly defined in the Metadata object. More specifically:
+ *
* <ul>
- * <li>Embedder implementations should only attempt to update metadata fields
- * present in the given Metadata object. Other fields should be left untouched.</li>
- * <li>Embedder implementations should set properties as empty when the
- * corresponding field in the Metadata object is an empty string, i.e. ""</li>
- * <li>Embedder implementations should nullify or delete properties
- * corresponding to fields with a null value in the given Metadata object.</li>
- * <li>Embedder implementations should set the property
- * corresponding to a particular field in the given Metadata object in all
- * metadata containers whenever possible and appropriate for the file format at the time.
- * If a particular metadata container falls out of use and/or is superseded by another
- * (such as IIC vs XMP for IPTC) it is up to the implementation to decide if and when
- * to cease embedding in the alternate container.</li>
- * <li>Embedder implementations should attempt to embed as much of the metadata
- * as accurately as possible. An implementation may choose a strict approach
- * and throw an exception if a value to be embedded exceeds the length allowed
- * or may choose to truncate the value.</li>
+ * <li>Embedder implementations should only attempt to update metadata fields present in the
+ * given Metadata object. Other fields should be left untouched.
+ * <li>Embedder implementations should set properties as empty when the corresponding field in
+ * the Metadata object is an empty string, i.e. ""
+ * <li>Embedder implementations should nullify or delete properties corresponding to fields
+ * with a null value in the given Metadata object.
+ * <li>Embedder implementations should set the property corresponding to a particular field in
+ * the given Metadata object in all metadata containers whenever possible and appropriate
+ * for the file format at the time. If a particular metadata container falls out of use
+ * and/or is superseded by another (such as IIC vs XMP for IPTC) it is up to the
+ * implementation to decide if and when to cease embedding in the alternate container.
+ * <li>Embedder implementations should attempt to embed as much of the metadata as accurately
+ * as possible. An implementation may choose a strict approach and throw an exception if a
+ * value to be embedded exceeds the length allowed or may choose to truncate the value.
* </ul>
*
- * @param metadata document metadata (input and output)
+ * @param metadata document metadata (input and output)
* @param originalStream the document stream (input)
- * @param outputStream the output stream to write the metadata embedded data to
- * @param context parse context
- * @throws IOException if the document stream could not be read
+ * @param outputStream the output stream to write the metadata embedded data to
+ * @param context parse context
+ * @throws IOException if the document stream could not be read
* @throws TikaException if the document could not be parsed
*/
- void embed(Metadata metadata, InputStream originalStream, OutputStream outputStream,
- ParseContext context) throws IOException, TikaException;
-
+ void embed(
+ Metadata metadata,
+ InputStream originalStream,
+ OutputStream outputStream,
+ ParseContext context)
+ throws IOException, TikaException;
}
diff --git a/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java b/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
index b2411d7..a1b2cf6 100644
--- a/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
+++ b/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java
@@ -29,10 +29,8 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
-
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -43,50 +41,49 @@
import org.apache.tika.parser.external.ExternalParser;
/**
- * Embedder that uses an external program (like sed or exiftool) to embed text
- * content and metadata into a given document.
+ * Embedder that uses an external program (like sed or exiftool) to embed text content and metadata
+ * into a given document.
*
* @since Apache Tika 1.3
*/
public class ExternalEmbedder implements Embedder {
- /**
- * Token to be replaced with a String array of metadata assignment command
- * arguments
- */
+ /** Token to be replaced with a String array of metadata assignment command arguments */
public static final String METADATA_COMMAND_ARGUMENTS_TOKEN = "${METADATA}";
- /**
- * Token to be replaced with a String array of metadata assignment command
- * arguments
- */
+
+ /** Token to be replaced with a String array of metadata assignment command arguments */
public static final String METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN =
"${METADATA_SERIALIZED}";
+
private static final long serialVersionUID = -2828829275642475697L;
private final TemporaryResources tmp = new TemporaryResources();
- /**
- * Media types supported by the external program.
- */
+
+ /** Media types supported by the external program. */
private Set<MediaType> supportedEmbedTypes = Collections.emptySet();
- /**
- * Mapping of Tika metadata to command line parameters.
- */
+
+ /** Mapping of Tika metadata to command line parameters. */
private Map<Property, String[]> metadataCommandArguments = null;
+
/**
* The external command to invoke.
*
* @see Runtime#exec(String[])
*/
private String[] command =
- new String[]{"sed", "-e", "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
- ExternalParser.INPUT_FILE_TOKEN};
+ new String[] {
+ "sed",
+ "-e",
+ "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
+ ExternalParser.INPUT_FILE_TOKEN
+ };
+
private String commandAssignmentOperator = "=";
private String commandAssignmentDelimeter = ", ";
private String commandAppendOperator = "=";
private boolean quoteAssignmentValues = false;
/**
- * Serializes a collection of metadata command line arguments into a single
- * string.
+ * Serializes a collection of metadata command line arguments into a single string.
*
* @param metadataCommandArguments
* @return the serialized metadata arguments string
@@ -99,30 +96,28 @@
}
/**
- * Checks to see if the command can be run. Typically used with something
- * like "myapp --version" to check to see if "myapp" is installed and on the
- * path.
+ * Checks to see if the command can be run. Typically used with something like "myapp --version"
+ * to check to see if "myapp" is installed and on the path.
*
- * @param checkCmd the check command to run
+ * @param checkCmd the check command to run
* @param errorValue what is considered an error value?
* @return whether or not the check completed without error
*/
public static boolean check(String checkCmd, int... errorValue) {
- return check(new String[]{checkCmd}, errorValue);
+ return check(new String[] {checkCmd}, errorValue);
}
/**
- * Checks to see if the command can be run. Typically used with something
- * like "myapp --version" to check to see if "myapp" is installed and on the
- * path.
+ * Checks to see if the command can be run. Typically used with something like "myapp --version"
+ * to check to see if "myapp" is installed and on the path.
*
- * @param checkCmd the check command to run
+ * @param checkCmd the check command to run
* @param errorValue what is considered an error value?
* @return whether or not the check completed without error
*/
public static boolean check(String[] checkCmd, int... errorValue) {
if (errorValue.length == 0) {
- errorValue = new int[]{127};
+ errorValue = new int[] {127};
}
try {
@@ -155,14 +150,12 @@
}
public void setSupportedEmbedTypes(Set<MediaType> supportedEmbedTypes) {
- this.supportedEmbedTypes =
- Collections.unmodifiableSet(new HashSet<>(supportedEmbedTypes));
+ this.supportedEmbedTypes = Collections.unmodifiableSet(new HashSet<>(supportedEmbedTypes));
}
/**
- * Gets the command to be run. This can include either of
- * {@link ExternalParser#INPUT_FILE_TOKEN} or
- * {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command
+ * Gets the command to be run. This can include either of {@link
+ * ExternalParser#INPUT_FILE_TOKEN} or {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command
* needs filenames.
*
* @return
@@ -172,9 +165,8 @@
}
/**
- * Sets the command to be run. This can include either of
- * {@link ExternalParser#INPUT_FILE_TOKEN} or
- * {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command
+ * Sets the command to be run. This can include either of {@link
+ * ExternalParser#INPUT_FILE_TOKEN} or {@link ExternalParser#OUTPUT_FILE_TOKEN} if the command
* needs filenames.
*
* @see Runtime#exec(String[])
@@ -202,8 +194,7 @@
}
/**
- * Gets the delimiter for multiple assignments for the command line tool,
- * i.e. ", ".
+ * Gets the delimiter for multiple assignments for the command line tool, i.e. ", ".
*
* @return the assignment delimiter
*/
@@ -212,8 +203,7 @@
}
/**
- * Sets the delimiter for multiple assignments for the command line tool,
- * i.e. ", ".
+ * Sets the delimiter for multiple assignments for the command line tool, i.e. ", ".
*
* @param commandAssignmentDelimeter
*/
@@ -222,8 +212,7 @@
}
/**
- * Gets the operator to append rather than replace a value for the command
- * line tool, i.e. "+=".
+ * Gets the operator to append rather than replace a value for the command line tool, i.e. "+=".
*
* @return the append operator
*/
@@ -232,8 +221,7 @@
}
/**
- * Sets the operator to append rather than replace a value for the command
- * line tool, i.e. "+=".
+ * Sets the operator to append rather than replace a value for the command line tool, i.e. "+=".
*
* @param commandAppendOperator
*/
@@ -242,8 +230,7 @@
}
/**
- * Gets whether or not to quote assignment values, i.e. tag='value'. The
- * default is false.
+ * Gets whether or not to quote assignment values, i.e. tag='value'. The default is false.
*
* @return whether or not to quote assignment values
*/
@@ -270,8 +257,8 @@
}
/**
- * Sets the map of Metadata keys to command line parameters. Set this to
- * null to disable Metadata embedding.
+ * Sets the map of Metadata keys to command line parameters. Set this to null to disable
+ * Metadata embedding.
*
* @param arguments
*/
@@ -280,8 +267,8 @@
}
/**
- * Constructs a collection of command line arguments responsible for setting
- * individual metadata fields based on the given <code>metadata</code>.
+ * Constructs a collection of command line arguments responsible for setting individual metadata
+ * fields based on the given <code>metadata</code>.
*
* @param metadata the metadata to embed
* @return the metadata-related command line arguments
@@ -303,18 +290,20 @@
if (quoteAssignmentValues) {
assignmentValue = "'" + assignmentValue + "'";
}
- commandMetadataSegments
- .add(metadataCommandArgument + commandAppendOperator +
- assignmentValue);
+ commandMetadataSegments.add(
+ metadataCommandArgument
+ + commandAppendOperator
+ + assignmentValue);
}
} else {
String assignmentValue = metadata.get(metadataName);
if (quoteAssignmentValues) {
assignmentValue = "'" + assignmentValue + "'";
}
- commandMetadataSegments
- .add(metadataCommandArgument + commandAssignmentOperator +
- assignmentValue);
+ commandMetadataSegments.add(
+ metadataCommandArgument
+ + commandAssignmentOperator
+ + assignmentValue);
}
}
}
@@ -325,13 +314,15 @@
}
/**
- * Executes the configured external command and passes the given document
- * stream as a simple XHTML document to the given SAX content handler.
- * Metadata is only extracted if {@link #setMetadataCommandArguments(Map)}
- * has been called to set arguments.
+     * Executes the configured external command, embedding the given metadata into the given
+     * document stream and writing the result to the given output stream. Metadata is only embedded
+     * if {@link #setMetadataCommandArguments(Map)} has been called to set arguments.
*/
- public void embed(final Metadata metadata, final InputStream inputStream,
- final OutputStream outputStream, final ParseContext context)
+ public void embed(
+ final Metadata metadata,
+ final InputStream inputStream,
+ final OutputStream outputStream,
+ final ParseContext context)
throws IOException, TikaException {
boolean inputToStdIn = true;
@@ -354,14 +345,17 @@
List<String> cmd = new ArrayList<>();
for (String commandSegment : origCmd) {
if (commandSegment.contains(ExternalParser.INPUT_FILE_TOKEN)) {
- commandSegment = commandSegment.replace(ExternalParser.INPUT_FILE_TOKEN,
- tikaInputStream.getFile().toString());
+ commandSegment =
+ commandSegment.replace(
+ ExternalParser.INPUT_FILE_TOKEN,
+ tikaInputStream.getFile().toString());
inputToStdIn = false;
}
if (commandSegment.contains(ExternalParser.OUTPUT_FILE_TOKEN)) {
tempOutputFile = tmp.createTemporaryFile();
- commandSegment = commandSegment
- .replace(ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString());
+ commandSegment =
+ commandSegment.replace(
+ ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString());
outputFromStdOut = false;
}
if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) {
@@ -382,15 +376,16 @@
int i = 0;
for (String commandSegment : cmd) {
if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) {
- commandSegment = commandSegment
- .replace(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
+ commandSegment =
+ commandSegment.replace(
+ METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN,
serializeMetadata(commandMetadataSegments));
cmd.set(i, commandSegment);
}
i++;
}
- } else if (!replacedMetadataCommandArgumentsToken &&
- !serializeMetadataCommandArgumentsToken) {
+ } else if (!replacedMetadataCommandArgumentsToken
+ && !serializeMetadataCommandArgumentsToken) {
// Tack metadata onto the end of the cmd as arguments
cmd.addAll(commandMetadataSegments);
}
@@ -399,12 +394,13 @@
// Execute
Process process;
if (cmd.toArray().length == 1) {
- process = Runtime.getRuntime().exec(cmd.toArray(new String[]{})[0]);
+ process = Runtime.getRuntime().exec(cmd.toArray(new String[] {})[0]);
} else {
- process = Runtime.getRuntime().exec(cmd.toArray(new String[]{}));
+ process = Runtime.getRuntime().exec(cmd.toArray(new String[] {}));
}
- UnsynchronizedByteArrayOutputStream stdErrOutputStream = UnsynchronizedByteArrayOutputStream.builder().get();
+ UnsynchronizedByteArrayOutputStream stdErrOutputStream =
+ UnsynchronizedByteArrayOutputStream.builder().get();
try {
sendStdErrToOutputStream(process, stdErrOutputStream);
@@ -439,7 +435,7 @@
// Clean up temp output files
tempOutputFile.delete();
} catch (Exception e) {
- //swallow
+ // swallow
}
}
if (!inputToStdIn) {
@@ -450,9 +446,12 @@
IOUtils.closeQuietly(outputStream);
IOUtils.closeQuietly(stdErrOutputStream);
if (process.exitValue() != 0) {
- throw new TikaException("There was an error executing the command line" +
- "\nExecutable Command:\n\n" + cmd + "\nExecutable Error:\n\n" +
- stdErrOutputStream.toString(UTF_8.name()));
+ throw new TikaException(
+ "There was an error executing the command line"
+ + "\nExecutable Command:\n\n"
+ + cmd
+ + "\nExecutable Error:\n\n"
+ + stdErrOutputStream.toString(UTF_8.name()));
}
}
}
@@ -460,28 +459,29 @@
/**
* Creates a new thread for copying a given input stream to a given output stream.
*
- * @param inputStream the source input stream
+ * @param inputStream the source input stream
* @param outputStream the target output stream
*/
- private void multiThreadedStreamCopy(final InputStream inputStream,
- final OutputStream outputStream) {
- new Thread(() -> {
- try {
- IOUtils.copy(inputStream, outputStream);
- } catch (IOException e) {
- System.out.println("ERROR: " + e.getMessage());
- }
- }).start();
+ private void multiThreadedStreamCopy(
+ final InputStream inputStream, final OutputStream outputStream) {
+ new Thread(
+ () -> {
+ try {
+ IOUtils.copy(inputStream, outputStream);
+ } catch (IOException e) {
+ System.out.println("ERROR: " + e.getMessage());
+ }
+ })
+ .start();
}
/**
- * Sends the contents of the given input stream to the
- * standard input of the given process. Potential exceptions are
- * ignored.
- * <p>
- * Note that the given input stream is <em>not</em> closed by this method.
+ * Sends the contents of the given input stream to the standard input of the given process.
+ * Potential exceptions are ignored.
*
- * @param process the process
+ * <p>Note that the given input stream is <em>not</em> closed by this method.
+ *
+ * @param process the process
* @param inputStream the input stream to send to standard input of the process
*/
private void sendInputStreamToStdIn(final InputStream inputStream, final Process process) {
@@ -489,13 +489,12 @@
}
/**
- * Sends the standard output of the given
- * process to the given output stream. Potential exceptions are
- * ignored.
- * <p>
- * Note that the given output stream is <em>not</em> closed by this method.
+ * Sends the standard output of the given process to the given output stream. Potential
+ * exceptions are ignored.
*
- * @param process the process
+ * <p>Note that the given output stream is <em>not</em> closed by this method.
+ *
+ * @param process the process
+     * @param outputStream the output stream to send the standard output of the process to
*/
private void sendStdOutToOutputStream(final Process process, final OutputStream outputStream) {
@@ -507,12 +506,11 @@
}
/**
- * Starts a thread that reads and discards the contents of the standard
- * stream of the given process. Potential exceptions are ignored, and the
- * stream is closed once fully processed.
+ * Starts a thread that reads and discards the contents of the standard stream of the given
+ * process. Potential exceptions are ignored, and the stream is closed once fully processed.
*
- * @param process the process
- * param outputStream the output stream to send to standard error of the process
+     * @param process the process
+     * @param outputStream the output stream to send the standard error of the process to
*/
private void sendStdErrToOutputStream(final Process process, final OutputStream outputStream) {
multiThreadedStreamCopy(process.getErrorStream(), outputStream);
diff --git a/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java b/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java
index b5f2136..9042868 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/AccessPermissionException.java
@@ -17,9 +17,8 @@
package org.apache.tika.exception;
/**
- * Exception to be thrown when a document does not allow content extraction.
- * As of this writing, PDF documents are the only type of document that might
- * cause this type of exception.
+ * Exception to be thrown when a document does not allow content extraction. As of this writing, PDF
+ * documents are the only type of document that might cause this type of exception.
*/
public class AccessPermissionException extends TikaException {
public AccessPermissionException() {
diff --git a/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java b/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java
index 5ebad6d..ede9da5 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/CorruptedFileException.java
@@ -17,8 +17,8 @@
package org.apache.tika.exception;
/**
- * This exception should be thrown when the parse absolutely, positively has to stop.
- * This exception must not be caught and swallowed if an embedded parser throws it.
+ * This exception should be thrown when the parse absolutely, positively has to stop. This exception
+ * must not be caught and swallowed if an embedded parser throws it.
*/
public class CorruptedFileException extends TikaException {
public CorruptedFileException(String msg) {
diff --git a/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java b/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java
index 3ec3294..97ebb60 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/FileTooLongException.java
@@ -34,8 +34,11 @@
}
private static String msg(long length, long maxLength) {
- return "File is " + length + " bytes, but " + maxLength +
- " is the maximum length allowed. You can modify maxLength via " +
- "the setter on the fetcher.";
+ return "File is "
+ + length
+ + " bytes, but "
+ + maxLength
+ + " is the maximum length allowed. You can modify maxLength via "
+ + "the setter on the fetcher.";
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java b/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java
index 4e0bc43..853dfc9 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/RuntimeSAXException.java
@@ -18,13 +18,10 @@
import org.xml.sax.SAXException;
-/**
- * Use this to throw a SAXException in subclassed methods that don't throw SAXExceptions
- */
+/** Use this to throw a SAXException in subclassed methods that don't throw SAXExceptions */
public class RuntimeSAXException extends RuntimeException {
public RuntimeSAXException(SAXException t) {
super(t);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java
index 1dcd327..3c68553 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/TikaConfigException.java
@@ -17,9 +17,8 @@
package org.apache.tika.exception;
/**
- * Tika Config Exception is an exception to occur when there is an error
- * in Tika config file and/or one or more of the parsers failed to initialize
- * from that erroneous config.
+ * Tika Config Exception is an exception to occur when there is an error in Tika config file and/or
+ * one or more of the parsers failed to initialize from that erroneous config.
*
* @since Apache Tika 1.14
*/
diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaException.java
index ceac19d..a2bfc87 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/TikaException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/TikaException.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.exception;
-/**
- * Tika exception
- */
+/** Tika exception */
public class TikaException extends Exception {
public TikaException(String msg) {
@@ -28,5 +26,4 @@
public TikaException(String msg, Throwable cause) {
super(msg, cause);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java
index fbc1a95..9730f54 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/TikaMemoryLimitException.java
@@ -33,8 +33,11 @@
}
private static String msg(long triedToAllocate, long maxAllowable) {
- return "Tried to allocate " + triedToAllocate + " bytes, but " + maxAllowable +
- " is the maximum allowed. Please open an issue https://issues.apache.org/jira/projects/TIKA" +
- " if you believe this file is not corrupt.";
+ return "Tried to allocate "
+ + triedToAllocate
+ + " bytes, but "
+ + maxAllowable
+ + " is the maximum allowed. Please open an issue https://issues.apache.org/jira/projects/TIKA"
+ + " if you believe this file is not corrupt.";
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java b/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java
index a53dbd6..97bfebe 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/TikaTimeoutException.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.exception;
-/**
- * Runtime/unchecked version of {@link java.util.concurrent.TimeoutException}
- */
+/** Runtime/unchecked version of {@link java.util.concurrent.TimeoutException} */
public class TikaTimeoutException extends RuntimeException {
public TikaTimeoutException(String message) {
super(message);
diff --git a/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java b/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java
index 4322e64..76844ac 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/UnsupportedFormatException.java
@@ -18,18 +18,15 @@
package org.apache.tika.exception;
/**
- * Parsers should throw this exception when they encounter
- * a file format that they do not support. This should only happen
- * when we're not able to differentiate versions by the mime. For example,
- * At the time of this writing, "application/wordperfect" covers all versions
- * of the wordperfect format; however, the parser only handles 6.x.
- * <p/>
- * Whenever possible/convenient, it is better to distinguish file formats by mime
- * so that unsupported formats will be handled by the
- * {@link org.apache.tika.parser.EmptyParser}.
- * However, if we can't differentiate by mime or we need to rely on the parser
- * to distinguish the versions (in the case that magic can't distinguish),
- * this exception should be thrown.
+ * Parsers should throw this exception when they encounter a file format that they do not support.
+ * This should only happen when we're not able to differentiate versions by the mime. For example,
+ * at the time of this writing, "application/wordperfect" covers all versions of the wordperfect
+ * format; however, the parser only handles 6.x.
+ *
+ * <p>Whenever possible/convenient, it is better to distinguish file formats by mime so that
+ * unsupported formats will be handled by the {@link org.apache.tika.parser.EmptyParser}. However,
+ * if we can't differentiate by mime or we need to rely on the parser to distinguish the versions
+ * (in the case that magic can't distinguish), this exception should be thrown.
*/
public class UnsupportedFormatException extends TikaException {
diff --git a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
index 3e661ad..b4559a6 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/WriteLimitReachedException.java
@@ -20,29 +20,31 @@
public class WriteLimitReachedException extends SAXException {
- //in case of (hopefully impossible) cyclic exception
- private final static int MAX_DEPTH = 100;
+ // in case of (hopefully impossible) cyclic exception
+ private static final int MAX_DEPTH = 100;
private final int writeLimit;
+
public WriteLimitReachedException(int writeLimit) {
this.writeLimit = writeLimit;
}
@Override
public String getMessage() {
- return "Your document contained more than " + writeLimit
+ return "Your document contained more than "
+ + writeLimit
+ " characters, and so your requested limit has been"
+ " reached. To receive the full text of the document,"
+ " increase your limit. (Text up to the limit is"
+ " however available).";
}
+
/**
- * Checks whether the given exception (or any of it's root causes) was
- * thrown by this handler as a signal of reaching the write limit.
+     * Checks whether the given exception (or any of its root causes) was thrown by this handler as
+ * a signal of reaching the write limit.
*
* @param t throwable
- * @return <code>true</code> if the write limit was reached,
- * <code>false</code> otherwise
+ * @return <code>true</code> if the write limit was reached, <code>false</code> otherwise
* @since Apache Tika 2.0
*/
public static boolean isWriteLimitReached(Throwable t) {
diff --git a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
index 125bc21..9adcf5b 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/ZeroByteFileException.java
@@ -17,28 +17,24 @@
package org.apache.tika.exception;
-/**
- * Exception thrown by the AutoDetectParser when a file contains zero-bytes.
- */
+/** Exception thrown by the AutoDetectParser when a file contains zero-bytes. */
public class ZeroByteFileException extends TikaException {
-
/**
- * If this is in the {@link org.apache.tika.parser.ParseContext}, the
- * {@link org.apache.tika.parser.AutoDetectParser} and the
- * {@link org.apache.tika.parser.RecursiveParserWrapper} will
- * ignore embedded files with zero-byte length inputstreams
+ * If this is in the {@link org.apache.tika.parser.ParseContext}, the {@link
+ * org.apache.tika.parser.AutoDetectParser} and the {@link
+ * org.apache.tika.parser.RecursiveParserWrapper} will ignore embedded files with zero-byte
+ * length inputstreams
*/
public static IgnoreZeroByteFileException IGNORE_ZERO_BYTE_FILE_EXCEPTION =
new IgnoreZeroByteFileException();
- //If this is in the parse context, the AutoDetectParser and the
- //RecursiveParserWrapper should ignore zero byte files
- //and not throw a Zero}
+ // If this is in the parse context, the AutoDetectParser and the
+ // RecursiveParserWrapper should ignore zero byte files
+    // and not throw a ZeroByteFileException
public ZeroByteFileException(String msg) {
super(msg);
}
- public static class IgnoreZeroByteFileException {
- }
+ public static class IgnoreZeroByteFileException {}
}
diff --git a/tika-core/src/main/java/org/apache/tika/exception/package-info.java b/tika-core/src/main/java/org/apache/tika/exception/package-info.java
index 80ab125..af490fb 100644
--- a/tika-core/src/main/java/org/apache/tika/exception/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/exception/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * Tika exception.
- */
+/** Tika exception. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.exception;
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
index 3f2f38f..ca74e99 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/AbstractEmbeddedDocumentBytesHandler.java
@@ -21,7 +21,6 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
-
import org.apache.tika.io.FilenameUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -32,23 +31,29 @@
List<Integer> ids = new ArrayList<>();
- public String getEmitKey(String containerEmitKey, int embeddedId,
- EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig,
- Metadata metadata) {
- String embeddedIdString = embeddedDocumentBytesConfig.getZeroPadName() > 0 ?
- StringUtils.leftPad(Integer.toString(embeddedId),
- embeddedDocumentBytesConfig.getZeroPadName(), "0") :
- Integer.toString(embeddedId);
+ public String getEmitKey(
+ String containerEmitKey,
+ int embeddedId,
+ EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig,
+ Metadata metadata) {
+ String embeddedIdString =
+ embeddedDocumentBytesConfig.getZeroPadName() > 0
+ ? StringUtils.leftPad(
+ Integer.toString(embeddedId),
+ embeddedDocumentBytesConfig.getZeroPadName(),
+ "0")
+ : Integer.toString(embeddedId);
+ StringBuilder emitKey =
+ new StringBuilder(containerEmitKey)
+ .append("/")
+ .append(FilenameUtils.getName(containerEmitKey))
+ .append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix())
+ .append(embeddedIdString);
- StringBuilder emitKey = new StringBuilder(containerEmitKey)
- .append("/")
- .append(FilenameUtils.getName(containerEmitKey))
- .append(embeddedDocumentBytesConfig.getEmbeddedIdPrefix())
- .append(embeddedIdString);
-
- if (embeddedDocumentBytesConfig.getSuffixStrategy().equals(
- EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING)) {
+ if (embeddedDocumentBytesConfig
+ .getSuffixStrategy()
+ .equals(EmbeddedDocumentBytesConfig.SUFFIX_STRATEGY.EXISTING)) {
String fName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
String suffix = FilenameUtils.getSuffixFromPath(fName);
suffix = suffix.toLowerCase(Locale.US);
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
index 1d5a239..c866139 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedBytesSelector.java
@@ -17,7 +17,6 @@
package org.apache.tika.extractor;
import java.util.Set;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -25,17 +24,17 @@
public class BasicEmbeddedBytesSelector implements EmbeddedBytesSelector {
-
-
private final Set<String> includeMimes;
private final Set<String> excludeMimes;
private final Set<String> includeEmbeddedResourceTypes;
private final Set<String> excludeEmbeddedResourceTypes;
- public BasicEmbeddedBytesSelector(Set<String> includeMimes, Set<String> excludeMimes,
- Set<String> includeEmbeddedResourceTypes,
- Set<String> excludeEmbeddedResourceTypes) {
+ public BasicEmbeddedBytesSelector(
+ Set<String> includeMimes,
+ Set<String> excludeMimes,
+ Set<String> includeEmbeddedResourceTypes,
+ Set<String> excludeEmbeddedResourceTypes) {
this.includeMimes = includeMimes;
this.excludeMimes = excludeMimes;
this.includeEmbeddedResourceTypes = includeEmbeddedResourceTypes;
@@ -47,7 +46,7 @@
if (mime == null) {
mime = "";
} else {
- //if mime matters at all, make sure to get the mime without parameters
+ // if mime matters at all, make sure to get the mime without parameters
if (includeMimes.size() > 0 || excludeMimes.size() > 0) {
MediaType mt = MediaType.parse(mime);
if (mt != null) {
@@ -58,18 +57,19 @@
if (excludeMimes.contains(mime)) {
return false;
}
- if (includeMimes.size() > 0 && ! includeMimes.contains(mime)) {
+ if (includeMimes.size() > 0 && !includeMimes.contains(mime)) {
return false;
}
String embeddedResourceType = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE);
- //if a parser doesn't specify the type, treat it as ATTACHMENT
- embeddedResourceType = StringUtils.isBlank(embeddedResourceType) ? "ATTACHMENT" :
- embeddedResourceType;
+ // if a parser doesn't specify the type, treat it as ATTACHMENT
+ embeddedResourceType =
+ StringUtils.isBlank(embeddedResourceType) ? "ATTACHMENT" : embeddedResourceType;
if (excludeEmbeddedResourceTypes.contains(embeddedResourceType)) {
return false;
}
- if (includeEmbeddedResourceTypes.size() > 0 && includeEmbeddedResourceTypes.contains(embeddedResourceType)) {
+ if (includeEmbeddedResourceTypes.size() > 0
+ && includeEmbeddedResourceTypes.contains(embeddedResourceType)) {
return true;
}
return false;
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
index cf6441b..7221c9c 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/BasicEmbeddedDocumentBytesHandler.java
@@ -20,27 +20,28 @@
import java.io.InputStream;
import java.util.HashMap;
import java.util.Map;
-
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedBufferedInputStream;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
/**
- * For now, this is an in-memory EmbeddedDocumentBytesHandler that stores
- * all the bytes in memory. Users can retrieve the documents with {@link #getDocument(int)}.
+ * For now, this is an in-memory EmbeddedDocumentBytesHandler that stores all the bytes in memory.
+ * Users can retrieve the documents with {@link #getDocument(int)}.
*
- * We'll need to make this cache to disk at some point if there are many bytes of
- * embedded documents.
+ * <p>We'll need to make this cache to disk at some point if there are many bytes of embedded
+ * documents.
*/
public class BasicEmbeddedDocumentBytesHandler extends AbstractEmbeddedDocumentBytesHandler {
private final EmbeddedDocumentBytesConfig config;
+
public BasicEmbeddedDocumentBytesHandler(EmbeddedDocumentBytesConfig config) {
this.config = config;
}
- //this won't scale, but let's start fully in memory for now;
+
+ // this won't scale, but let's start fully in memory for now;
Map<Integer, byte[]> docBytes = new HashMap<>();
+
@Override
public void add(int id, Metadata metadata, InputStream is) throws IOException {
super.add(id, metadata, is);
@@ -53,6 +54,6 @@
@Override
public void close() throws IOException {
- //delete tmp dir or whatever here
+ // delete tmp dir or whatever here
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java
index cfc70b5..4858911 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ContainerExtractor.java
@@ -18,47 +18,44 @@
import java.io.IOException;
import java.io.Serializable;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
/**
- * Tika container extractor interface.
- * Container Extractors provide access to the embedded
- * resources within container formats such as .zip and .doc
+ * Tika container extractor interface. Container Extractors provide access to the embedded resources
+ * within container formats such as .zip and .doc
*/
public interface ContainerExtractor extends Serializable {
/**
- * Is this Container Extractor able to process the
- * supplied container?
+ * Is this Container Extractor able to process the supplied container?
*
* @since Apache Tika 0.8
*/
boolean isSupported(TikaInputStream input) throws IOException;
/**
- * Processes a container file, and extracts all the embedded
- * resources from within it.
- * <p>
- * The {@link EmbeddedResourceHandler} you supply will
- * be called for each embedded resource in the container. It is
- * up to you whether you process the contents of the resource or not.
- * <p>
- * The given document stream is consumed but not closed by this method.
- * The responsibility to close the stream remains on the caller.
- * <p>
- * If required, nested containers (such as a .docx within a .zip)
- * can automatically be recursed into, and processed inline. If
- * no recurseExtractor is given, the nested containers will be
- * treated as with any other embedded resources.
+ * Processes a container file, and extracts all the embedded resources from within it.
*
- * @param stream the document stream (input)
+ * <p>The {@link EmbeddedResourceHandler} you supply will be called for each embedded resource
+ * in the container. It is up to you whether you process the contents of the resource or not.
+ *
+ * <p>The given document stream is consumed but not closed by this method. The responsibility to
+ * close the stream remains on the caller.
+ *
+ * <p>If required, nested containers (such as a .docx within a .zip) can automatically be
+ * recursed into, and processed inline. If no recurseExtractor is given, the nested containers
+ * will be treated as with any other embedded resources.
+ *
+ * @param stream the document stream (input)
* @param recurseExtractor the extractor to use on any embedded containers
- * @param handler handler for the embedded files (output)
- * @throws IOException if the document stream could not be read
+ * @param handler handler for the embedded files (output)
+ * @throws IOException if the document stream could not be read
* @throws TikaException if the container could not be parsed
* @since Apache Tika 0.8
*/
- void extract(TikaInputStream stream, ContainerExtractor recurseExtractor,
- EmbeddedResourceHandler handler) throws IOException, TikaException;
+ void extract(
+ TikaInputStream stream,
+ ContainerExtractor recurseExtractor,
+ EmbeddedResourceHandler handler)
+ throws IOException, TikaException;
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
index 537c5ff..d8d4aca 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/DefaultEmbeddedStreamTranslator.java
@@ -19,24 +19,22 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
-
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.ServiceLoaderUtils;
/**
- * Loads EmbeddedStreamTranslators via service loading. Tries to run each
- * in turn and returns the first non-null value. If no translation has occurred,
- * this returns the original InputStream. If a translation has occurred, the
- * translator will consume the InputStream but not close it.
+ * Loads EmbeddedStreamTranslators via service loading. Tries to run each in turn and returns the
+ * first non-null value. If no translation has occurred, this returns the original InputStream. If a
+ * translation has occurred, the translator will consume the InputStream but not close it.
*/
public class DefaultEmbeddedStreamTranslator implements EmbeddedStreamTranslator {
final List<EmbeddedStreamTranslator> translators;
private static List<EmbeddedStreamTranslator> getDefaultFilters(ServiceLoader loader) {
- List<EmbeddedStreamTranslator> embeddedStreamTranslators
- = loader.loadServiceProviders(EmbeddedStreamTranslator.class);
+ List<EmbeddedStreamTranslator> embeddedStreamTranslators =
+ loader.loadServiceProviders(EmbeddedStreamTranslator.class);
ServiceLoaderUtils.sortLoadedClasses(embeddedStreamTranslators);
return embeddedStreamTranslators;
}
@@ -50,8 +48,9 @@
}
/**
- * This should sniff the stream to determine if it needs to be translated.
- * The translator is responsible for resetting the stream if any bytes have been read.
+ * This should sniff the stream to determine if it needs to be translated. The translator is
+ * responsible for resetting the stream if any bytes have been read.
+ *
* @param inputStream
* @param metadata
* @return
@@ -69,6 +68,7 @@
/**
* This will consume the InputStream and return a new stream of translated bytes.
+ *
* @param inputStream
* @param metadata
* @return
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java
index aa34aa1..4976fe4 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/DocumentSelector.java
@@ -19,24 +19,21 @@
import org.apache.tika.metadata.Metadata;
/**
- * Interface for different document selection strategies for purposes like
- * embedded document extraction by a {@link ContainerExtractor} instance.
- * An implementation of this interface defines some specific selection
- * criteria to be applied against the document metadata passed to the
- * {@link #select(Metadata)} method.
+ * Interface for different document selection strategies for purposes like embedded document
+ * extraction by a {@link ContainerExtractor} instance. An implementation of this interface defines
+ * some specific selection criteria to be applied against the document metadata passed to the {@link
+ * #select(Metadata)} method.
*
* @since Apache Tika 0.8
*/
public interface DocumentSelector {
/**
- * Checks if a document with the given metadata matches the specified
- * selection criteria.
+ * Checks if a document with the given metadata matches the specified selection criteria.
*
* @param metadata document metadata
- * @return <code>true</code> if the document matches the selection criteria,
- * <code>false</code> otherwise
+ * @return <code>true</code> if the document matches the selection criteria, <code>false</code>
+ * otherwise
*/
boolean select(Metadata metadata);
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
index 2ec7df6..1e84731 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedBytesSelector.java
@@ -26,6 +26,7 @@
return true;
}
}
+
EmbeddedBytesSelector ACCEPT_ALL = new AcceptAll();
boolean select(Metadata metadata);
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java
index f7237bd..15acaac 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentByteStoreExtractorFactory.java
@@ -16,18 +16,15 @@
*/
package org.apache.tika.extractor;
-
/**
- * This factory creates EmbeddedDocumentExtractors that require an
- * {@link EmbeddedDocumentBytesHandler} in the
- * {@link org.apache.tika.parser.ParseContext} should extend this.
+ * This factory creates EmbeddedDocumentExtractors that require an {@link
+ * EmbeddedDocumentBytesHandler} in the {@link org.apache.tika.parser.ParseContext} should extend
+ * this.
*
- * This is a shim interface to signal to {@link org.apache.tika.pipes.PipesServer}
- * to use the {@link @RUnpackExtractor} if the user doesn't configure a custom
- * EmbeddedDocumentExtractor.
+ * <p>This is a shim interface to signal to {@link org.apache.tika.pipes.PipesServer} to use the
+ * {@link @RUnpackExtractor} if the user doesn't configure a custom EmbeddedDocumentExtractor.
*
- * TODO: Figure out how to simplify this and allow for emitting of the source document.
+ * <p>TODO: Figure out how to simplify this and allow for emitting of the source document.
*/
-public interface EmbeddedDocumentByteStoreExtractorFactory extends EmbeddedDocumentExtractorFactory {
-
-}
+public interface EmbeddedDocumentByteStoreExtractorFactory
+ extends EmbeddedDocumentExtractorFactory {}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java
index 12357a7..e665a87 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentBytesHandler.java
@@ -20,11 +20,10 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.List;
-
import org.apache.tika.metadata.Metadata;
public interface EmbeddedDocumentBytesHandler extends Closeable {
- //we need metadata for the emitter store...can we get away without it?
+ // we need metadata for the emitter store...can we get away without it?
void add(int id, Metadata metadata, InputStream inputStream) throws IOException;
List<Integer> getIds();
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
index f804208..f4b3cbc 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractor.java
@@ -19,18 +19,17 @@
import java.io.IOException;
import java.io.InputStream;
-
+import org.apache.tika.metadata.Metadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import org.apache.tika.metadata.Metadata;
-
public interface EmbeddedDocumentExtractor {
boolean shouldParseEmbedded(Metadata metadata);
/**
- * Processes the supplied embedded resource, calling the delegating
- * parser with the appropriate details.
+ * Processes the supplied embedded resource, calling the delegating parser with the appropriate
+ * details.
+ *
* @param stream The embedded resource
* @param handler The handler to use
* @param metadata The metadata for the embedded resource
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java
index 4a55052..3c795a8 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentExtractorFactory.java
@@ -18,7 +18,6 @@
package org.apache.tika.extractor;
import java.io.Serializable;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
index d6e2c28..da7ebf6 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedDocumentUtil.java
@@ -16,14 +16,9 @@
*/
package org.apache.tika.extractor;
-
import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.io.TikaInputStream;
@@ -41,21 +36,22 @@
import org.apache.tika.parser.PasswordProvider;
import org.apache.tika.parser.StatefulParser;
import org.apache.tika.utils.ExceptionUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Utility class to handle common issues with embedded documents.
- * <p/>
- * Use statically if all that is needed is getting the EmbeddedDocumentExtractor.
- * Otherwise, instantiate an instance.
- * <p/>
- * Note: This is not thread safe. Make sure to instantiate one per thread.
+ *
+ * <p>Use statically if all that is needed is getting the EmbeddedDocumentExtractor. Otherwise,
+ * instantiate an instance.
+ *
+ * <p>Note: This is not thread safe. Make sure to instantiate one per thread.
*/
public class EmbeddedDocumentUtil implements Serializable {
-
private final ParseContext context;
private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
- //these are lazily initialized and can be null
+ // these are lazily initialized and can be null
private TikaConfig tikaConfig;
private MimeTypes mimeTypes;
private Detector detector;
@@ -66,12 +62,12 @@
}
/**
- * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext.
- * As of Tika 1.15, an AutoDetectParser will automatically be added to parse
- * embedded documents if no Parser.class is specified in the ParseContext.
- * <p/>
- * If you'd prefer not to parse embedded documents, set Parser.class
- * to {@link org.apache.tika.parser.EmptyParser} in the ParseContext.
+ * This offers a uniform way to get an EmbeddedDocumentExtractor from a ParseContext. As of Tika
+ * 1.15, an AutoDetectParser will automatically be added to parse embedded documents if no
+ * Parser.class is specified in the ParseContext.
+ *
+ * <p>If you'd prefer not to parse embedded documents, set Parser.class to {@link
+ * org.apache.tika.parser.EmptyParser} in the ParseContext.
*
* @param context
* @return EmbeddedDocumentExtractor
@@ -81,8 +77,8 @@
if (extractor != null) {
return extractor;
}
- //ensure that an AutoDetectParser is
- //available for parsing embedded docs TIKA-2096
+ // ensure that an AutoDetectParser is
+ // available for parsing embedded docs TIKA-2096
Parser embeddedParser = context.get(Parser.class);
if (embeddedParser == null) {
TikaConfig tikaConfig = context.get(TikaConfig.class);
@@ -98,11 +94,10 @@
}
/**
- * Utility function to get the Parser that was sent in to the
- * ParseContext to handle embedded documents. If it is stateful,
- * unwrap it to get its stateless delegating parser.
- * <p>
- * If there is no Parser in the parser context, this will return null.
+ * Utility function to get the Parser that was sent in to the ParseContext to handle embedded
+ * documents. If it is stateful, unwrap it to get its stateless delegating parser.
+ *
+ * <p>If there is no Parser in the parser context, this will return null.
*
* @param context
* @return
@@ -123,7 +118,7 @@
}
public Detector getDetector() {
- //be as lazy as possible and cache
+ // be as lazy as possible and cache
Detector localDetector = context.get(Detector.class);
if (localDetector != null) {
return localDetector;
@@ -138,7 +133,7 @@
public MimeTypes getMimeTypes() {
MimeTypes localMimeTypes = context.get(MimeTypes.class);
- //be as lazy as possible and cache the mimeTypes
+ // be as lazy as possible and cache the mimeTypes
if (localMimeTypes != null) {
return localMimeTypes;
}
@@ -150,13 +145,13 @@
}
/**
- * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext
- * that was included during initialization, and then creating a new one from
- * via {@link TikaConfig#getDefaultConfig()} if it can't find one in the
- * ParseContext. This caches the default config so that it only has to be created once.
+ * @return Returns a {@link TikaConfig} -- trying to find it first in the ParseContext that was
+ * included during initialization, and then creating a new one from via {@link
+ * TikaConfig#getDefaultConfig()} if it can't find one in the ParseContext. This caches the
+ * default config so that it only has to be created once.
*/
public TikaConfig getTikaConfig() {
- //be as lazy as possible and cache the TikaConfig
+ // be as lazy as possible and cache the TikaConfig
if (tikaConfig == null) {
tikaConfig = context.get(TikaConfig.class);
if (tikaConfig == null) {
@@ -169,7 +164,7 @@
public String getExtension(TikaInputStream is, Metadata metadata) {
String mimeString = metadata.get(Metadata.CONTENT_TYPE);
- //use the buffered mimetypes as default
+ // use the buffered mimetypes as default
MimeTypes localMimeTypes = getMimeTypes();
MimeType mimeType = null;
@@ -178,7 +173,7 @@
try {
mimeType = localMimeTypes.forName(mimeString);
} catch (MimeTypeException e) {
- //swallow
+ // swallow
}
}
if (mimeType == null) {
@@ -188,12 +183,12 @@
detected = true;
is.reset();
} catch (IOException | MimeTypeException e) {
- //swallow
+ // swallow
}
}
if (mimeType != null) {
if (detected) {
- //set or correct the mime type
+ // set or correct the mime type
metadata.set(Metadata.CONTENT_TYPE, mimeType.toString());
}
return mimeType.getExtension();
@@ -219,21 +214,21 @@
return embeddedDocumentExtractor;
}
- public void parseEmbedded(InputStream inputStream, ContentHandler handler, Metadata metadata,
- boolean outputHtml) throws IOException, SAXException {
+ public void parseEmbedded(
+ InputStream inputStream, ContentHandler handler, Metadata metadata, boolean outputHtml)
+ throws IOException, SAXException {
embeddedDocumentExtractor.parseEmbedded(inputStream, handler, metadata, outputHtml);
}
/**
- * Tries to find an existing parser within the ParseContext.
- * It looks inside of CompositeParsers and ParserDecorators.
- * The use case is when a parser needs to parse an internal stream
- * that is _part_ of the document, e.g. rtf body inside an msg.
- * <p/>
- * Can return <code>null</code> if the context contains no parser or
- * the correct parser can't be found.
+ * Tries to find an existing parser within the ParseContext. It looks inside of CompositeParsers
+ * and ParserDecorators. The use case is when a parser needs to parse an internal stream that is
+ * _part_ of the document, e.g. rtf body inside an msg.
*
- * @param clazz parser class to search for
+ * <p>Can return <code>null</code> if the context contains no parser or the correct parser can't
+ * be found.
+ *
+ * @param clazz parser class to search for
* @param context
* @return
*/
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java
index 23d0063..de20ae4 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedResourceHandler.java
@@ -17,21 +17,18 @@
package org.apache.tika.extractor;
import java.io.InputStream;
-
import org.apache.tika.mime.MediaType;
/**
- * Tika container extractor callback interface.
- * To work with a {@link ContainerExtractor}, your code needs
- * to implement this interface.
+ * Tika container extractor callback interface. To work with a {@link ContainerExtractor}, your code
+ * needs to implement this interface.
*/
public interface EmbeddedResourceHandler {
/**
- * Called to process an embedded resource within the container.
- * This will be called once per embedded resource within the
- * container, along with whatever details are available on
- * the embedded resource.
- *
+ * Called to process an embedded resource within the container. This will be called once per
+ * embedded resource within the container, along with whatever details are available on the
+ * embedded resource.
+ *
* @since Apache Tika 0.8
* @param filename The filename of the embedded resource, if known
* @param mediaType The media type of the embedded resource, if known
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
index b2ce05d..4ea2c77 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/EmbeddedStreamTranslator.java
@@ -18,13 +18,11 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.metadata.Metadata;
/**
- * Interface for different filtering of embedded streams.
- * Specifically, unravel OLE streams in tika-server unpack,
- * and/or handle open containers in TikaInputStream
+ * Interface for different filtering of embedded streams. Specifically, unravel OLE streams in
+ * tika-server unpack, and/or handle open containers in TikaInputStream
*
* @since Apache Tika 2.0.0
*/
@@ -32,7 +30,5 @@
boolean shouldTranslate(InputStream inputStream, Metadata metadata) throws IOException;
- InputStream translate(InputStream inputStream,
- Metadata metadata) throws IOException;
-
+ InputStream translate(InputStream inputStream, Metadata metadata) throws IOException;
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
index b2e9cd1..e9bf874 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParserContainerExtractor.java
@@ -20,11 +20,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
@@ -38,13 +33,15 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.StatefulParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
/**
- * An implementation of {@link ContainerExtractor} powered by the regular
- * {@link Parser} API. This allows you to easily extract out all the
- * embedded resources from within container files supported by normal Tika
- * parsers. By default the {@link AutoDetectParser} will be used, to allow
- * extraction from the widest range of containers.
+ * An implementation of {@link ContainerExtractor} powered by the regular {@link Parser} API. This
+ * allows you to easily extract out all the embedded resources from within container files supported
+ * by normal Tika parsers. By default the {@link AutoDetectParser} will be used, to allow extraction
+ * from the widest range of containers.
*/
public class ParserContainerExtractor implements ContainerExtractor {
@@ -60,8 +57,7 @@
}
public ParserContainerExtractor(TikaConfig config) {
- this(new AutoDetectParser(config),
- new DefaultDetector(config.getMimeRepository()));
+ this(new AutoDetectParser(config), new DefaultDetector(config.getMimeRepository()));
}
public ParserContainerExtractor(Parser parser, Detector detector) {
@@ -75,7 +71,8 @@
}
public void extract(
- TikaInputStream stream, ContainerExtractor recurseExtractor,
+ TikaInputStream stream,
+ ContainerExtractor recurseExtractor,
EmbeddedResourceHandler handler)
throws IOException, TikaException {
ParseContext context = new ParseContext();
@@ -93,7 +90,8 @@
private final EmbeddedResourceHandler handler;
- private RecursiveParser(Parser statelessParser,
+ private RecursiveParser(
+ Parser statelessParser,
ContainerExtractor extractor,
EmbeddedResourceHandler handler) {
super(statelessParser);
@@ -106,8 +104,7 @@
}
public void parse(
- InputStream stream, ContentHandler ignored,
- Metadata metadata, ParseContext context)
+ InputStream stream, ContentHandler ignored, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
try {
@@ -118,7 +115,7 @@
MediaType type = detector.detect(tis, metadata);
if (extractor == null) {
- // Let the handler process the embedded resource
+ // Let the handler process the embedded resource
handler.handle(filename, type, tis);
} else {
// Use a temporary file to process the stream twice
@@ -136,7 +133,5 @@
tmp.dispose();
}
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index edcb78f..28f2d86 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -22,12 +22,7 @@
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
@@ -41,10 +36,13 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
/**
- * Helper class for parsers of package archives or other compound document
- * formats that support embedded or attached component documents.
+ * Helper class for parsers of package archives or other compound document formats that support
+ * embedded or attached component documents.
*
* @since Apache Tika 0.8
*/
@@ -106,13 +104,16 @@
newStream.setOpenContainer(container);
}
}
- DELEGATING_PARSER.parse(newStream, new EmbeddedContentHandler(new BodyContentHandler(handler)),
- metadata, context);
+ DELEGATING_PARSER.parse(
+ newStream,
+ new EmbeddedContentHandler(new BodyContentHandler(handler)),
+ metadata,
+ context);
} catch (EncryptedDocumentException ede) {
recordException(ede, context);
} catch (CorruptedFileException e) {
- //necessary to stop the parse to avoid infinite loops
- //on corrupt sqlite3 files
+ // necessary to stop the parse to avoid infinite loops
+ // on corrupt sqlite3 files
throw new IOException(e);
} catch (TikaException e) {
recordException(e, context);
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
index 9136228..f6a3611 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractorFactory.java
@@ -20,8 +20,7 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
-public class ParsingEmbeddedDocumentExtractorFactory
- implements EmbeddedDocumentExtractorFactory {
+public class ParsingEmbeddedDocumentExtractorFactory implements EmbeddedDocumentExtractorFactory {
private boolean writeFileNameToContent = true;
@@ -32,8 +31,7 @@
@Override
public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) {
- ParsingEmbeddedDocumentExtractor ex =
- new ParsingEmbeddedDocumentExtractor(parseContext);
+ ParsingEmbeddedDocumentExtractor ex = new ParsingEmbeddedDocumentExtractor(parseContext);
ex.setWriteFileNameToContent(writeFileNameToContent);
return ex;
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
index 76b297d..7f12f63 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractor.java
@@ -23,14 +23,7 @@
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.AttributesImpl;
-
import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
@@ -42,6 +35,11 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
/**
* Recursive Unpacker and text and metadata extractor.
@@ -65,7 +63,6 @@
this.maxEmbeddedBytesForExtraction = maxEmbeddedBytesForExtraction;
}
-
@Override
public void parseEmbedded(
InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
@@ -94,7 +91,8 @@
newStream.setOpenContainer(container);
}
}
- EmbeddedDocumentBytesHandler bytesHandler = context.get(EmbeddedDocumentBytesHandler.class);
+ EmbeddedDocumentBytesHandler bytesHandler =
+ context.get(EmbeddedDocumentBytesHandler.class);
if (bytesHandler != null) {
parseWithBytes(newStream, handler, metadata);
} else {
@@ -103,8 +101,8 @@
} catch (EncryptedDocumentException ede) {
recordException(ede, context);
} catch (CorruptedFileException e) {
- //necessary to stop the parse to avoid infinite loops
- //on corrupt sqlite3 files
+ // necessary to stop the parse to avoid infinite loops
+ // on corrupt sqlite3 files
throw new IOException(e);
} catch (TikaException e) {
recordException(e, context);
@@ -117,8 +115,8 @@
private void parseWithBytes(TikaInputStream stream, ContentHandler handler, Metadata metadata)
throws TikaException, IOException, SAXException {
- //TODO -- improve the efficiency of this so that we're not
- //literally writing out a file per request
+ // TODO -- improve the efficiency of this so that we're not
+ // literally writing out a file per request
Path p = stream.getPath();
try {
parse(stream, handler, metadata);
@@ -129,15 +127,19 @@
private void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata)
throws TikaException, IOException, SAXException {
- getDelegatingParser().parse(stream,
- new EmbeddedContentHandler(new BodyContentHandler(handler)),
- metadata, context);
+ getDelegatingParser()
+ .parse(
+ stream,
+ new EmbeddedContentHandler(new BodyContentHandler(handler)),
+ metadata,
+ context);
}
private void storeEmbeddedBytes(Path p, Metadata metadata) {
- if (! embeddedBytesSelector.select(metadata)) {
+ if (!embeddedBytesSelector.select(metadata)) {
if (LOGGER.isDebugEnabled()) {
- LOGGER.debug("skipping embedded bytes {} <-> {}",
+ LOGGER.debug(
+ "skipping embedded bytes {} <-> {}",
metadata.get(Metadata.CONTENT_TYPE),
metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
}
@@ -148,8 +150,12 @@
int id = metadata.getInt(TikaCoreProperties.EMBEDDED_ID);
try (InputStream is = Files.newInputStream(p)) {
if (bytesExtracted >= maxEmbeddedBytesForExtraction) {
- throw new IOException("Bytes extracted (" + bytesExtracted +
- ") >= max allowed (" + maxEmbeddedBytesForExtraction + ")");
+ throw new IOException(
+ "Bytes extracted ("
+ + bytesExtracted
+ + ") >= max allowed ("
+ + maxEmbeddedBytesForExtraction
+ + ")");
}
long maxToRead = maxEmbeddedBytesForExtraction - bytesExtracted;
@@ -157,19 +163,23 @@
embeddedDocumentBytesHandler.add(id, metadata, boundedIs);
bytesExtracted += boundedIs.getPos();
if (boundedIs.hasHitBound()) {
- throw new IOException("Bytes extracted (" + bytesExtracted +
- ") >= max allowed (" + maxEmbeddedBytesForExtraction + "). Truncated " +
- "bytes");
+ throw new IOException(
+ "Bytes extracted ("
+ + bytesExtracted
+ + ") >= max allowed ("
+ + maxEmbeddedBytesForExtraction
+ + "). Truncated "
+ + "bytes");
}
}
} catch (IOException e) {
LOGGER.warn("problem writing out embedded bytes", e);
- //info in metadata doesn't actually make it back to the metadata list
- //because we're filtering and cloning the metadata at the end of the parse
- //which happens before we try to copy out the files.
- //TODO fix this
- //metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
- // ExceptionUtils.getStackTrace(e));
+ // info in metadata doesn't actually make it back to the metadata list
+ // because we're filtering and cloning the metadata at the end of the parse
+ // which happens before we try to copy out the files.
+ // TODO fix this
+ // metadata.set(TikaCoreProperties.EMBEDDED_BYTES_EXCEPTION,
+ // ExceptionUtils.getStackTrace(e));
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
index a715ed2..f70ebe0 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/RUnpackExtractorFactory.java
@@ -20,7 +20,6 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
-
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.metadata.Metadata;
@@ -37,6 +36,7 @@
private Set<String> embeddedBytesExcludeEmbeddedResourceTypes = Collections.EMPTY_SET;
private long maxEmbeddedBytesForExtraction = DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION;
+
@Field
public void setWriteFileNameToContent(boolean writeFileNameToContent) {
this.writeFileNameToContent = writeFileNameToContent;
@@ -52,34 +52,32 @@
public void setEmbeddedBytesExcludeMimeTypes(List<String> excludeMimeTypes) {
embeddedBytesExcludeMimeTypes = new HashSet<>();
embeddedBytesExcludeMimeTypes.addAll(excludeMimeTypes);
-
}
@Field
public void setEmbeddedBytesIncludeEmbeddedResourceTypes(List<String> includeAttachmentTypes) {
embeddedBytesIncludeEmbeddedResourceTypes = new HashSet<>();
embeddedBytesIncludeEmbeddedResourceTypes.addAll(includeAttachmentTypes);
-
}
@Field
public void setEmbeddedBytesExcludeEmbeddedResourceTypes(List<String> excludeAttachmentTypes) {
embeddedBytesExcludeEmbeddedResourceTypes = new HashSet<>();
embeddedBytesExcludeEmbeddedResourceTypes.addAll(excludeAttachmentTypes);
-
}
/**
- * Total number of bytes to write out. A good zip bomb may contain petabytes
- * compressed into a few kb. Make sure that you can't fill up a disk!
+ * Total number of bytes to write out. A good zip bomb may contain petabytes compressed into a
+ * few kb. Make sure that you can't fill up a disk!
*
- * This does not include the container file in the count of bytes written out.
- * This only counts the lengths of the embedded files.
+ * <p>This does not include the container file in the count of bytes written out. This only
+ * counts the lengths of the embedded files.
*
* @param maxEmbeddedBytesForExtraction
*/
@Field
- public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction) throws TikaConfigException {
+ public void setMaxEmbeddedBytesForExtraction(long maxEmbeddedBytesForExtraction)
+ throws TikaConfigException {
if (maxEmbeddedBytesForExtraction < 0) {
throw new TikaConfigException("maxEmbeddedBytesForExtraction must be >= 0");
}
@@ -88,24 +86,23 @@
@Override
public EmbeddedDocumentExtractor newInstance(Metadata metadata, ParseContext parseContext) {
- RUnpackExtractor ex =
- new RUnpackExtractor(parseContext,
- maxEmbeddedBytesForExtraction);
+ RUnpackExtractor ex = new RUnpackExtractor(parseContext, maxEmbeddedBytesForExtraction);
ex.setWriteFileNameToContent(writeFileNameToContent);
ex.setEmbeddedBytesSelector(createEmbeddedBytesSelector());
return ex;
}
-
private EmbeddedBytesSelector createEmbeddedBytesSelector() {
- if (embeddedBytesIncludeMimeTypes.size() == 0 &&
- embeddedBytesExcludeMimeTypes.size() == 0 &&
- embeddedBytesIncludeEmbeddedResourceTypes.size() == 0 &&
- embeddedBytesExcludeEmbeddedResourceTypes.size() == 0) {
+ if (embeddedBytesIncludeMimeTypes.size() == 0
+ && embeddedBytesExcludeMimeTypes.size() == 0
+ && embeddedBytesIncludeEmbeddedResourceTypes.size() == 0
+ && embeddedBytesExcludeEmbeddedResourceTypes.size() == 0) {
return EmbeddedBytesSelector.ACCEPT_ALL;
}
- return new BasicEmbeddedBytesSelector(embeddedBytesIncludeMimeTypes,
- embeddedBytesExcludeMimeTypes, embeddedBytesIncludeEmbeddedResourceTypes,
+ return new BasicEmbeddedBytesSelector(
+ embeddedBytesIncludeMimeTypes,
+ embeddedBytesExcludeMimeTypes,
+ embeddedBytesIncludeEmbeddedResourceTypes,
embeddedBytesExcludeEmbeddedResourceTypes);
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/package-info.java b/tika-core/src/main/java/org/apache/tika/extractor/package-info.java
index 3d3e92b..5917177 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * Extraction of component documents.
- */
+/** Extraction of component documents. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.extractor;
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java
index 51b1bee..91707eb 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderProxy.java
@@ -30,15 +30,13 @@
class ClassLoaderProxy extends ClassLoader implements ForkProxy {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -7303109260448540420L;
/**
- * Names of resources that could not be found. Used to avoid repeated
- * lookup of commonly accessed, but often not present, resources like
- * <code>META-INF/services/javax.xml.parsers.SAXParserFactory</code>.
+ * Names of resources that could not be found. Used to avoid repeated lookup of commonly
+ * accessed, but often not present, resources like <code>
+ * META-INF/services/javax.xml.parsers.SAXParserFactory</code>.
*/
private final Set<String> notFound = new HashSet<>();
@@ -149,5 +147,4 @@
return stream.toByteArray();
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java
index 7af85ad..dc7e718 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ClassLoaderResource.java
@@ -32,11 +32,10 @@
}
/**
- * Processes a request for one (code 1) or many (code 2) class loader
- * resources. The requested resources are sent preceded with a boolean
- * <code>true</code> value. If the resource was not found (code 1) or
- * when the last resource has been sent (code 2), a boolean
- * <code>false</code> value is sent instead.
+ * Processes a request for one (code 1) or many (code 2) class loader resources. The requested
+ * resources are sent preceded with a boolean <code>true</code> value. If the resource was not
+ * found (code 1) or when the last resource has been sent (code 2), a boolean <code>false</code>
+ * value is sent instead.
*
* @param name resource name
* @throws IOException if the resource could not be sent
@@ -66,14 +65,12 @@
}
/**
- * Sends the contents of the given input stream to the given output.
- * The stream is sent in chunks of less than 64kB, each preceded by
- * a 16-bit integer value that indicates the length of the following
- * chunk. A zero short value is sent at the end to signify the end of
- * the stream.
- * <p>
- * The stream is guaranteed to be closed by this method, regardless of
- * the way it returns.
+ * Sends the contents of the given input stream to the given output. The stream is sent in
+ * chunks of less than 64kB, each preceded by a 16-bit integer value that indicates the length
+ * of the following chunk. A zero short value is sent at the end to signify the end of the
+ * stream.
+ *
+ * <p>The stream is guaranteed to be closed by this method, regardless of the way it returns.
*
* @param stream the stream to be sent
* @throws IOException if the stream could not be sent
@@ -92,5 +89,4 @@
stream.close();
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
index 371dd05..d9430ed 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerProxy.java
@@ -19,7 +19,6 @@
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
-
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.Locator;
@@ -38,9 +37,7 @@
public static final int PROCESSING_INSTRUCTION = 9;
public static final int SKIPPED_ENTITY = 10;
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 737511106054617524L;
private final int resource;
@@ -79,8 +76,8 @@
}
/**
- * Breaks the string in 21,845 size chunks to not
- * throw UTFDataFormatException at least in Oracle JDK 8.
+ * Breaks the string in 21,845 size chunks to not throw UTFDataFormatException at least in
+ * Oracle JDK 8.
*/
private void writeString(String string) throws IOException {
int max = 65535 / 3;
@@ -195,5 +192,4 @@
sendString(name);
doneSending();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
index f8971b9..6d3e830 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ContentHandlerResource.java
@@ -19,7 +19,6 @@
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
-
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
@@ -60,8 +59,12 @@
if (n >= 0) {
atts = new AttributesImpl();
for (int i = 0; i < n; i++) {
- atts.addAttribute(readString(input), readString(input), readString(input),
- readString(input), readString(input));
+ atts.addAttribute(
+ readString(input),
+ readString(input),
+ readString(input),
+ readString(input),
+ readString(input));
}
}
handler.startElement(uri, localName, qName, atts);
@@ -103,5 +106,4 @@
}
return sb.toString();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java b/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
index f1a4720..a49a53e 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ForkClient.java
@@ -33,14 +33,12 @@
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;
import java.util.zip.ZipEntry;
-
import org.apache.commons.io.IOUtils;
-import org.xml.sax.ContentHandler;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.utils.ProcessUtils;
+import org.xml.sax.ContentHandler;
class ForkClient {
private static final AtomicInteger CLIENT_COUNTER = new AtomicInteger(0);
@@ -57,29 +55,35 @@
private final DataInputStream input;
- //this is used for debugging/smoke testing
+ // this is used for debugging/smoke testing
private final int id = CLIENT_COUNTER.incrementAndGet();
private volatile int filesProcessed = 0;
- public ForkClient(Path tikaDir, ParserFactoryFactory parserFactoryFactory, List<String> java,
- TimeoutLimits timeoutLimits) throws IOException, TikaException {
+ public ForkClient(
+ Path tikaDir,
+ ParserFactoryFactory parserFactoryFactory,
+ List<String> java,
+ TimeoutLimits timeoutLimits)
+ throws IOException, TikaException {
this(tikaDir, parserFactoryFactory, null, java, timeoutLimits);
}
/**
- * @param tikaDir directory containing jars from which to start
- * the child server and load the Parser
- * @param parserFactoryFactory factory to send to forked process to build parser
- * upon arrival
- * @param classLoader class loader to use for non-parser resource
- * (content-handler, etc.)
- * @param java java commandline to use for the commandline server
+ * @param tikaDir directory containing jars from which to start the child server and load the
+ * Parser
+ * @param parserFactoryFactory factory to send to forked process to build parser upon arrival
+ * @param classLoader class loader to use for non-parser resource (content-handler, etc.)
+ * @param java java commandline to use for the commandline server
* @throws IOException
* @throws TikaException
*/
- public ForkClient(Path tikaDir, ParserFactoryFactory parserFactoryFactory,
- ClassLoader classLoader, List<String> java, TimeoutLimits timeoutLimits)
+ public ForkClient(
+ Path tikaDir,
+ ParserFactoryFactory parserFactoryFactory,
+ ClassLoader classLoader,
+ List<String> java,
+ TimeoutLimits timeoutLimits)
throws IOException, TikaException {
jar = null;
loader = null;
@@ -130,9 +134,9 @@
}
}
-
- public ForkClient(ClassLoader loader, Object object, List<String> java,
- TimeoutLimits timeoutLimits) throws IOException, TikaException {
+ public ForkClient(
+ ClassLoader loader, Object object, List<String> java, TimeoutLimits timeoutLimits)
+ throws IOException, TikaException {
boolean ok = false;
try {
this.loader = loader;
@@ -168,8 +172,8 @@
}
/**
- * Creates a temporary jar file that can be used to bootstrap the forked
- * server process. Remember to remove the file when no longer used.
+ * Creates a temporary jar file that can be used to bootstrap the forked server process.
+ * Remember to remove the file when no longer used.
*
* @return the created jar file
* @throws IOException if the bootstrap archive could not be created
@@ -189,9 +193,9 @@
}
/**
- * Fills in the jar file used to bootstrap the forked server process.
- * All the required <code>.class</code> files and a manifest with a
- * <code>Main-Class</code> entry are written into the archive.
+ * Fills in the jar file used to bootstrap the forked server process. All the required <code>
+ * .class</code> files and a manifest with a <code>Main-Class</code> entry are written into the
+ * archive.
*
* @param file file to hold the bootstrap archive
* @throws IOException if the bootstrap archive could not be created
@@ -202,10 +206,17 @@
jar.putNextEntry(new ZipEntry("META-INF/MANIFEST.MF"));
jar.write(manifest.getBytes(UTF_8));
- Class<?>[] bootstrap = {ForkServer.class, ForkObjectInputStream.class, ForkProxy.class,
- ClassLoaderProxy.class, MemoryURLConnection.class, MemoryURLStreamHandler.class,
- MemoryURLStreamHandlerFactory.class, MemoryURLStreamRecord.class,
- TikaException.class};
+ Class<?>[] bootstrap = {
+ ForkServer.class,
+ ForkObjectInputStream.class,
+ ForkProxy.class,
+ ClassLoaderProxy.class,
+ MemoryURLConnection.class,
+ MemoryURLStreamHandler.class,
+ MemoryURLStreamHandlerFactory.class,
+ MemoryURLStreamRecord.class,
+ TikaException.class
+ };
ClassLoader loader = ForkServer.class.getClassLoader();
for (Class<?> klass : bootstrap) {
String path = klass.getName().replace('.', '/') + ".class";
@@ -227,10 +238,11 @@
} else if (type == -1) {
throw new IOException("EOF while waiting for start beacon");
} else {
- //can't do this because of
+ // can't do this because of
// ForkParserIntegrationTest
// #testAttachingADebuggerOnTheForkedParserShouldWork
-// throw new IOException("Unexpected byte while waiting for start beacon: "+type);
+ // throw new IOException("Unexpected byte while waiting for start
+ // beacon: "+type);
}
}
}
@@ -265,10 +277,10 @@
}
/**
- * Serializes the object first into an in-memory buffer and then
- * writes it to the output stream with a preceding size integer.
+ * Serializes the object first into an in-memory buffer and then writes it to the output stream
+ * with a preceding size integer.
*
- * @param object object to be serialized
+ * @param object object to be serialized
* @param resources list of fork resources, used when adding proxies
* @throws IOException if the object could not be serialized
*/
@@ -279,12 +291,14 @@
resources.add(new InputStreamResource((InputStream) object));
object = new InputStreamProxy(n);
} else if (object instanceof RecursiveParserWrapperHandler) {
- resources.add(new RecursiveMetadataContentHandlerResource(
- (RecursiveParserWrapperHandler) object));
- object = new RecursiveMetadataContentHandlerProxy(n,
- ((RecursiveParserWrapperHandler) object).getContentHandlerFactory());
- } else if (object instanceof ContentHandler &&
- !(object instanceof AbstractRecursiveParserWrapperHandler)) {
+ resources.add(
+ new RecursiveMetadataContentHandlerResource(
+ (RecursiveParserWrapperHandler) object));
+ object =
+ new RecursiveMetadataContentHandlerProxy(
+ n, ((RecursiveParserWrapperHandler) object).getContentHandlerFactory());
+ } else if (object instanceof ContentHandler
+ && !(object instanceof AbstractRecursiveParserWrapperHandler)) {
resources.add(new ContentHandlerResource((ContentHandler) object));
object = new ContentHandlerProxy(n);
} else if (object instanceof ClassLoader) {
@@ -296,8 +310,11 @@
ForkObjectInputStream.sendObject(object, output);
} catch (NotSerializableException nse) {
// Build a more friendly error message for this
- throw new TikaException("Unable to serialize " + object.getClass().getSimpleName() +
- " to pass to the Forked Parser", nse);
+ throw new TikaException(
+ "Unable to serialize "
+ + object.getClass().getSimpleName()
+ + " to pass to the Forked Parser",
+ nse);
}
waitForResponse(resources);
@@ -316,10 +333,10 @@
if (process != null) {
process.destroyForcibly();
try {
- //TIKA-1933
+ // TIKA-1933
process.waitFor();
} catch (InterruptedException e) {
- //swallow
+ // swallow
}
}
if (jar != null) {
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java b/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java
index 61e2dae..2e7b6d3 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ForkObjectInputStream.java
@@ -27,28 +27,24 @@
import java.io.ObjectStreamClass;
/**
- * An object input stream that uses a given class loader when deserializing
- * objects.
- * <p>
- * Note that this functionality could easily be implemented as a simple
- * anonymous {@link ObjectInputStream} subclass, but since the
- * functionality is needed during the somewhat complicated bootstrapping
- * of the stdin/out communication channel of a forked server process,
- * it's better if class has a stable name that can be referenced at
- * compile-time by the {@link ForkClient} class.
+ * An object input stream that uses a given class loader when deserializing objects.
+ *
+ * <p>Note that this functionality could easily be implemented as a simple anonymous {@link
+ * ObjectInputStream} subclass, but since the functionality is needed during the somewhat
+ * complicated bootstrapping of the stdin/out communication channel of a forked server process, it's
+ * better if the class has a stable name that can be referenced at compile-time by the {@link
+ * ForkClient} class.
*/
class ForkObjectInputStream extends ObjectInputStream {
- /**
- * The class loader used when deserializing objects.
- */
+ /** The class loader used when deserializing objects. */
private final ClassLoader loader;
/**
- * Creates a new object input stream that uses the given class loader
- * when deserializing objects.
+ * Creates a new object input stream that uses the given class loader when deserializing
+ * objects.
*
- * @param input underlying input stream
+ * @param input underlying input stream
* @param loader class loader used when deserializing objects
* @throws IOException if this stream could not be initiated
*/
@@ -58,8 +54,8 @@
}
/**
- * Serializes the object first into an in-memory buffer and then
- * writes it to the output stream with a preceding size integer.
+ * Serializes the object first into an in-memory buffer and then writes it to the output stream
+ * with a preceding size integer.
*
* @param object object to be serialized
* @param output output stream
@@ -77,13 +73,13 @@
}
/**
- * Deserializes an object from the given stream. The serialized object
- * is expected to be preceded by a size integer, that is used for reading
- * the entire serialization into a memory before deserializing it.
+ * Deserializes an object from the given stream. The serialized object is expected to be
+ * preceded by a size integer, that is used for reading the entire serialization into a memory
+ * before deserializing it.
*
- * @param input input stream from which the serialized object is read
+ * @param input input stream from which the serialized object is read
* @param loader class loader to be used for loading referenced classes
- * @throws IOException if the object could not be deserialized
+ * @throws IOException if the object could not be deserialized
* @throws ClassNotFoundException if a referenced class is not found
*/
public static Object readObject(DataInputStream input, ClassLoader loader)
@@ -108,5 +104,4 @@
protected Class<?> resolveClass(ObjectStreamClass desc) throws ClassNotFoundException {
return Class.forName(desc.getName(), false, loader);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java b/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java
index 84d1156..2b1c9df 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ForkParser.java
@@ -27,10 +27,6 @@
import java.util.List;
import java.util.Queue;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -41,54 +37,46 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.TeeContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
public class ForkParser implements Parser, Closeable {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -4962742892274663950L;
- //these are used by the legacy usage
+ // these are used by the legacy usage
private final ClassLoader loader;
private final Parser parser;
- //these are used when the server builds a parser via a directory
- //of jars, not via legacy bootstrap etc.
+ // these are used when the server builds a parser via a directory
+ // of jars, not via legacy bootstrap etc.
private final Path tikaBin;
private final ParserFactoryFactory parserFactoryFactory;
private final Queue<ForkClient> pool = new LinkedList<>();
- /**
- * Java command line
- */
+
+ /** Java command line */
private List<String> java = Arrays.asList("java", "-Xmx32m", "-Djava.awt.headless=true");
- /**
- * Process pool size
- */
- @Field
- private int poolSize = 5;
+
+ /** Process pool size */
+ @Field private int poolSize = 5;
+
private int currentlyInUse = 0;
- @Field
- private long serverPulseMillis = 1000;
+ @Field private long serverPulseMillis = 1000;
- @Field
- private long serverParseTimeoutMillis = 60000;
+ @Field private long serverParseTimeoutMillis = 60000;
- @Field
- private long serverWaitTimeoutMillis = 60000;
+ @Field private long serverWaitTimeoutMillis = 60000;
- @Field
- private int maxFilesProcessedPerClient = -1;
+ @Field private int maxFilesProcessedPerClient = -1;
/**
- * If you have a directory with, say, tike-app.jar and you want the
- * forked process/server to build a parser
- * and run it from that -- so that you can keep all of those dependencies out of
+ * If you have a directory with, say, tika-app.jar and you want the forked process/server to
+ * build a parser and run it from that -- so that you can keep all of those dependencies out of
* your client code, use this initializer.
*
- * @param tikaBin directory containing the tika-app.jar or similar --
- * full jar including tika-core and all
- * desired parsers and dependencies
+ * @param tikaBin directory containing the tika-app.jar or similar -- full jar including
+ * tika-core and all desired parsers and dependencies
* @param factoryFactory
*/
public ForkParser(Path tikaBin, ParserFactoryFactory factoryFactory) {
@@ -101,16 +89,14 @@
/**
* <b>EXPERT</b>
*
- * @param tikaBin directory containing the tika-app.jar or similar
- * -- full jar including tika-core and all
- * desired parsers and dependencies
- * @param parserFactoryFactory -- the factory to use to generate the parser factory
- * in the forked process/server
- * @param classLoader to use for all classes besides the parser in the
- * forked process/server
+ * @param tikaBin directory containing the tika-app.jar or similar -- full jar including
+ * tika-core and all desired parsers and dependencies
+ * @param parserFactoryFactory -- the factory to use to generate the parser factory in the
+ * forked process/server
+ * @param classLoader to use for all classes besides the parser in the forked process/server
*/
- public ForkParser(Path tikaBin, ParserFactoryFactory parserFactoryFactory,
- ClassLoader classLoader) {
+ public ForkParser(
+ Path tikaBin, ParserFactoryFactory parserFactoryFactory, ClassLoader classLoader) {
parser = null;
loader = classLoader;
this.tikaBin = tikaBin;
@@ -124,8 +110,8 @@
public ForkParser(ClassLoader loader, Parser parser) {
if (parser instanceof ForkParser) {
throw new IllegalArgumentException(
- "The underlying parser of a ForkParser should not be a ForkParser, " +
- "but a specific implementation.");
+ "The underlying parser of a ForkParser should not be a ForkParser, "
+ + "but a specific implementation.");
}
this.tikaBin = null;
this.parserFactoryFactory = null;
@@ -160,13 +146,11 @@
}
/**
- * Sets the command used to start the forked server process.
- * The arguments "-jar" and "/path/to/bootstrap.jar"
- * or "-cp" and "/path/to/tika_bin" are
- * appended to the given command when starting the process.
- * The default setting is {"java", "-Xmx32m"}.
- * <p/>
- * Creates a defensive copy.
+ * Sets the command used to start the forked server process. The arguments "-jar" and
+ * "/path/to/bootstrap.jar" or "-cp" and "/path/to/tika_bin" are appended to the given command
+ * when starting the process. The default setting is {"java", "-Xmx32m"}.
+ *
+ * <p>Creates a defensive copy.
*
* @param java java command line
*/
@@ -176,8 +160,8 @@
/**
* Returns the command used to start the forked server process.
- * <p/>
- * Returned list is unmodifiable.
+ *
+ * <p>Returned list is unmodifiable.
*
* @return java command line args
*/
@@ -190,51 +174,45 @@
}
/**
- * This sends the objects to the server for parsing, and the server via
- * the proxies acts on the handler as if it were updating it directly.
- * <p>
- * If using a {@link org.apache.tika.parser.RecursiveParserWrapper}, there are two options:
- * </p>
- * <p>
- * <ol>
- * <li>Send in a class that extends
- * {@link org.apache.tika.sax.RecursiveParserWrapperHandler},
- * and the server will proxy back the data as best it can[0].</li>
- * <li>Send in a class that extends {@link AbstractRecursiveParserWrapperHandler}
- * and the server will act on the class but not proxy back the data. This
- * can be used, for example, if all you want to do is write to disc, extend
- * {@link AbstractRecursiveParserWrapperHandler} to write to disc when
- * {@link AbstractRecursiveParserWrapperHandler#endDocument(ContentHandler,
- * Metadata)}
- * is called, and the server will take care of the writing via the handler.</li>
- * </ol>
- * </p>
- * <p>
- * <b>NOTE:</b>[0] "the server will proxy back the data as best it can".
- * If the handler implements Serializable and is actually serializable, the
- * server will send it and the
- * {@link Metadata} back upon
- * {@link org.apache.tika.sax.RecursiveParserWrapperHandler#
- * endEmbeddedDocument(ContentHandler, Metadata)}
- * or {@link org.apache.tika.sax.RecursiveParserWrapperHandler#
- * endEmbeddedDocument(ContentHandler, Metadata)}.
- * If the handler does not implement {@link java.io.Serializable} or if there is a
- * {@link java.io.NotSerializableException} thrown during serialization, the server will
- * call {@link ContentHandler#toString()} on the ContentHandler and set that value with the
- * {@link TikaCoreProperties#TIKA_CONTENT} key and then
- * serialize and proxy that data back.
- * </p>
+ * This sends the objects to the server for parsing, and the server via the proxies acts on the
+ * handler as if it were updating it directly.
*
- * @param stream the document stream (input)
- * @param handler handler for the XHTML SAX events (output)
+ * <p>If using a {@link org.apache.tika.parser.RecursiveParserWrapper}, there are two options:
+ *
+ * <p>
+ *
+ * <ol>
+ * <li>Send in a class that extends {@link org.apache.tika.sax.RecursiveParserWrapperHandler},
+ * and the server will proxy back the data as best it can[0].
+ * <li>Send in a class that extends {@link AbstractRecursiveParserWrapperHandler} and the
+ * server will act on the class but not proxy back the data. This can be used, for
+ * example, if all you want to do is write to disc, extend {@link
+ * AbstractRecursiveParserWrapperHandler} to write to disc when {@link
+ * AbstractRecursiveParserWrapperHandler#endDocument(ContentHandler, Metadata)} is called,
+ * and the server will take care of the writing via the handler.
+ * </ol>
+ *
+ * <p><b>NOTE:</b>[0] "the server will proxy back the data as best it can". If the
+ * handler implements Serializable and is actually serializable, the server will send it and the
+ * {@link Metadata} back upon {@link org.apache.tika.sax.RecursiveParserWrapperHandler#
+ * endEmbeddedDocument(ContentHandler, Metadata)} or {@link
+ * org.apache.tika.sax.RecursiveParserWrapperHandler# endDocument(ContentHandler,
+ * Metadata)}. If the handler does not implement {@link java.io.Serializable} or if there is a
+ * {@link java.io.NotSerializableException} thrown during serialization, the server will call
+ * {@link ContentHandler#toString()} on the ContentHandler and set that value with the {@link
+ * TikaCoreProperties#TIKA_CONTENT} key and then serialize and proxy that data back.
+ *
+ * @param stream the document stream (input)
+ * @param handler handler for the XHTML SAX events (output)
* @param metadata document metadata (input and output)
- * @param context parse context
+ * @param context parse context
* @throws IOException
* @throws SAXException
* @throws TikaException
*/
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
if (stream == null) {
throw new NullPointerException("null stream");
}
@@ -245,8 +223,9 @@
ForkClient client = acquireClient();
try {
ContentHandler tee =
- (handler instanceof AbstractRecursiveParserWrapperHandler) ? handler :
- new TeeContentHandler(handler, new MetadataContentHandler(metadata));
+ (handler instanceof AbstractRecursiveParserWrapperHandler)
+ ? handler
+ : new TeeContentHandler(handler, new MetadataContentHandler(metadata));
t = client.call("parse", stream, tee, metadata, context);
alive = true;
@@ -256,10 +235,12 @@
throw te;
} catch (IOException e) {
// Problem occurred on the other side
- throw new TikaException("Failed to communicate with a forked parser process." +
- " The process has most likely crashed due to some error" +
- " like running out of memory. A new process will be" +
- " started for the next parsing request.", e);
+ throw new TikaException(
+ "Failed to communicate with a forked parser process."
+ + " The process has most likely crashed due to some error"
+ + " like running out of memory. A new process will be"
+ + " started for the next parsing request.",
+ e);
} finally {
releaseClient(client, alive);
}
@@ -312,18 +293,23 @@
}
private ForkClient newClient() throws IOException, TikaException {
- TimeoutLimits timeoutLimits = new TimeoutLimits(serverPulseMillis, serverParseTimeoutMillis,
- serverWaitTimeoutMillis);
+ TimeoutLimits timeoutLimits =
+ new TimeoutLimits(
+ serverPulseMillis, serverParseTimeoutMillis, serverWaitTimeoutMillis);
if (loader == null && parser == null && tikaBin != null && parserFactoryFactory != null) {
return new ForkClient(tikaBin, parserFactoryFactory, java, timeoutLimits);
- } else if (loader != null && parser != null && tikaBin == null &&
- parserFactoryFactory == null) {
+ } else if (loader != null
+ && parser != null
+ && tikaBin == null
+ && parserFactoryFactory == null) {
return new ForkClient(loader, parser, java, timeoutLimits);
- } else if (loader != null && parser == null && tikaBin != null &&
- parserFactoryFactory != null) {
+ } else if (loader != null
+ && parser == null
+ && tikaBin != null
+ && parserFactoryFactory != null) {
return new ForkClient(tikaBin, parserFactoryFactory, loader, java, timeoutLimits);
} else {
- //TODO: make this more useful
+ // TODO: make this more useful
throw new IllegalStateException("Unexpected combination of state items");
}
}
@@ -331,8 +317,8 @@
private synchronized void releaseClient(ForkClient client, boolean alive) {
currentlyInUse--;
if (currentlyInUse + pool.size() < poolSize && alive) {
- if (maxFilesProcessedPerClient > 0 &&
- client.getFilesProcessed() >= maxFilesProcessedPerClient) {
+ if (maxFilesProcessedPerClient > 0
+ && client.getFilesProcessed() >= maxFilesProcessedPerClient) {
client.close();
} else {
pool.offer(client);
@@ -344,10 +330,8 @@
}
/**
- * The amount of time in milliseconds that the server
- * should wait before checking to see if the parse has timed out
- * or if the wait has timed out
- * The default is 5 seconds.
+ * The amount of time in milliseconds that the server should wait before checking to see if the
+ * parse has timed out or if the wait has timed out. The default is 5 seconds.
*
* @param serverPulseMillis milliseconds to sleep before checking if there has been any activity
*/
@@ -356,9 +340,8 @@
}
/**
- * The maximum amount of time allowed for the server to try to parse a file.
- * If more than this time elapses, the server shuts down, and the ForkParser
- * throws an exception.
+ * The maximum amount of time allowed for the server to try to parse a file. If more than this
+ * time elapses, the server shuts down, and the ForkParser throws an exception.
*
* @param serverParseTimeoutMillis
*/
@@ -367,9 +350,9 @@
}
/**
- * The maximum amount of time allowed for the server to wait for a new request to parse
- * a file. The server will shutdown after this amount of time, and a new server will have
- * to be started by a new client.
+ * The maximum amount of time allowed for the server to wait for a new request to parse a file.
+ * The server will shut down after this amount of time, and a new server will have to be started
+ * by a new client.
*
* @param serverWaitTimeoutMillis
*/
@@ -378,17 +361,15 @@
}
/**
- * If there is a slowly building memory leak in one of the parsers,
- * it is useful to set a limit on the number of files processed
- * by a server before it is shutdown and restarted. Default value is -1.
+ * If there is a slowly building memory leak in one of the parsers, it is useful to set a limit
+ * on the number of files processed by a server before it is shut down and restarted. Default
+ * value is -1.
*
- * @param maxFilesProcessedPerClient maximum number of files that a server can handle
- * before the parser shuts down a client and creates
- * a new process. If set to -1, the server is never restarted
- * because of the number of files handled.
+ * @param maxFilesProcessedPerClient maximum number of files that a server can handle before the
+ * parser shuts down a client and creates a new process. If set to -1, the server is never
+ * restarted because of the number of files handled.
*/
public void setMaxFilesProcessedPerServer(int maxFilesProcessedPerClient) {
this.maxFilesProcessedPerClient = maxFilesProcessedPerClient;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java b/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java
index b10eac8..01aefc5 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ForkProxy.java
@@ -23,5 +23,4 @@
public interface ForkProxy extends Serializable {
void init(DataInputStream input, DataOutputStream output);
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java b/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java
index 9bbd82b..89dc932 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ForkResource.java
@@ -23,5 +23,4 @@
public interface ForkResource {
Throwable process(DataInputStream input, DataOutputStream output) throws IOException;
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java b/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java
index c3249c1..21002d1 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ForkServer.java
@@ -26,11 +26,9 @@
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.net.URL;
-
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParserFactory;
+import org.xml.sax.SAXException;
class ForkServer implements Runnable {
@@ -52,36 +50,39 @@
public static final byte INIT_LOADER_PARSER = 7;
public static final byte INIT_PARSER_FACTORY_FACTORY_LOADER = 8;
private final Object[] lock = new Object[0];
- /**
- * Input stream for reading from the parent process
- */
+
+ /** Input stream for reading from the parent process */
private final DataInputStream input;
- /**
- * Output stream for writing to the parent process
- */
+
+ /** Output stream for writing to the parent process */
private final DataOutputStream output;
+
private final boolean active = true;
- //milliseconds to sleep before checking to see if there has been any reading/writing
- //If no reading or writing in this time, shutdown the server.
+ // milliseconds to sleep before checking to see if there has been any reading/writing
+ // If no reading or writing in this time, shutdown the server.
private long serverPulseMillis = 5000;
private long serverParserTimeoutMillis = 60000;
private long serverWaitTimeoutMillis = 60000;
- //can't be class Parser because then you'd
- //have to include that in bootstrap jar (legacy mode)
+ // can't be class Parser because then you'd
+ // have to include that in bootstrap jar (legacy mode)
private Object parser;
private ClassLoader classLoader;
private boolean parsing = false;
private long since;
+
/**
- * Sets up a forked server instance using the given stdin/out
- * communication channel.
+ * Sets up a forked server instance using the given stdin/out communication channel.
*
- * @param input input stream for reading from the parent process
+ * @param input input stream for reading from the parent process
* @param output output stream for writing to the parent process
* @throws IOException if the server instance could not be created
*/
- public ForkServer(InputStream input, OutputStream output, long serverPulseMillis,
- long serverParserTimeoutMillis, long serverWaitTimeoutMillis)
+ public ForkServer(
+ InputStream input,
+ OutputStream output,
+ long serverPulseMillis,
+ long serverParserTimeoutMillis,
+ long serverWaitTimeoutMillis)
throws IOException {
this.input = new DataInputStream(input);
this.output = new DataOutputStream(output);
@@ -93,10 +94,9 @@
}
/**
- * Starts a forked server process using the standard input and output
- * streams for communication with the parent process. Any attempts by
- * stray code to read from standard input or write to standard output
- * is redirected to avoid interfering with the communication channel.
+ * Starts a forked server process using the standard input and output streams for communication
+ * with the parent process. Any attempts by stray code to read from standard input or write to
+ * standard output are redirected to avoid interfering with the communication channel.
*
* @param args command line arguments, ignored
* @throws Exception if the server could not be started
@@ -109,7 +109,11 @@
URL.setURLStreamHandlerFactory(new MemoryURLStreamHandlerFactory());
ForkServer server =
- new ForkServer(System.in, System.out, serverPulseMillis, serverParseTimeoutMillis,
+ new ForkServer(
+ System.in,
+ System.out,
+ serverPulseMillis,
+ serverParseTimeoutMillis,
serverWaitTimeoutMillis);
System.setIn(new ByteArrayInputStream(new byte[0]));
System.setOut(System.err);
@@ -128,8 +132,9 @@
long elapsed = System.currentTimeMillis() - since;
if (parsing && elapsed > serverParserTimeoutMillis) {
break;
- } else if (!parsing && serverWaitTimeoutMillis > 0 &&
- elapsed > serverWaitTimeoutMillis) {
+ } else if (!parsing
+ && serverWaitTimeoutMillis > 0
+ && elapsed > serverWaitTimeoutMillis) {
break;
}
}
@@ -137,12 +142,12 @@
}
System.exit(0);
} catch (InterruptedException e) {
- //swallow
+ // swallow
}
}
public void processRequests() {
- //initialize
+ // initialize
try {
initializeParserAndLoader();
} catch (Throwable t) {
@@ -157,7 +162,7 @@
}
return;
}
- //main loop
+ // main loop
try {
while (true) {
int request = input.read();
@@ -192,7 +197,7 @@
switch (configIndex) {
case INIT_PARSER_FACTORY_FACTORY:
if (firstObject instanceof ParserFactoryFactory) {
- //the user has submitted a parser factory, but no class loader
+ // the user has submitted a parser factory, but no class loader
classLoader = ForkServer.class.getClassLoader();
ParserFactory parserFactory = ((ParserFactoryFactory) firstObject).build();
parser = parserFactory.build();
@@ -205,7 +210,7 @@
if (firstObject instanceof ClassLoader) {
classLoader = (ClassLoader) firstObject;
Thread.currentThread().setContextClassLoader(classLoader);
- //parser from parent process
+ // parser from parent process
parser = readObject(classLoader);
} else {
throw new IllegalArgumentException(
@@ -214,7 +219,7 @@
break;
case INIT_PARSER_FACTORY_FACTORY_LOADER:
if (firstObject instanceof ParserFactoryFactory) {
- //the user has submitted a parser factory and a class loader
+ // the user has submitted a parser factory and a class loader
ParserFactory parserFactory = ((ParserFactoryFactory) firstObject).build();
parser = parserFactory.build();
classLoader = (ClassLoader) readObject(ForkServer.class.getClassLoader());
@@ -255,7 +260,6 @@
te.setStackTrace(toSend.getStackTrace());
ForkObjectInputStream.sendObject(te, output);
}
-
}
} finally {
synchronized (lock) {
@@ -281,12 +285,12 @@
}
/**
- * Deserializes an object from the given stream. The serialized object
- * is expected to be preceded by a size integer, that is used for reading
- * the entire serialization into a memory before deserializing it.
+ * Deserializes an object from the given stream. The serialized object is expected to be
+ * preceded by a size integer, which is used for reading the entire serialization into memory
+ * before deserializing it.
*
* @param loader class loader to be used for loading referenced classes
- * @throws IOException if the object could not be deserialized
+ * @throws IOException if the object could not be deserialized
* @throws ClassNotFoundException if a referenced class is not found
*/
private Object readObject(ClassLoader loader) throws IOException, ClassNotFoundException {
diff --git a/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java b/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java
index cca9b74..d6b9ed5 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/InputStreamProxy.java
@@ -23,9 +23,7 @@
class InputStreamProxy extends InputStream implements ForkProxy {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 4350939227765568438L;
private final int resource;
@@ -69,5 +67,4 @@
}
return n;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java b/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java
index 04ba93c..2fd768e 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/InputStreamResource.java
@@ -48,5 +48,4 @@
output.flush();
return null;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java
index 74a1687..8afea7c 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLConnection.java
@@ -31,12 +31,10 @@
}
@Override
- public void connect() {
- }
+ public void connect() {}
@Override
public InputStream getInputStream() {
return new ByteArrayInputStream(data);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java
index bfbb886..23a5270 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandler.java
@@ -31,8 +31,7 @@
private static final AtomicInteger counter = new AtomicInteger();
- private static final List<MemoryURLStreamRecord> records =
- new LinkedList<>();
+ private static final List<MemoryURLStreamRecord> records = new LinkedList<>();
public static URL createURL(byte[] data) {
try {
@@ -64,5 +63,4 @@
}
throw new IOException("Unknown URL: " + u);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java
index 5f3d818..4e07759 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamHandlerFactory.java
@@ -28,5 +28,4 @@
return null;
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java
index 8a72035..52a55e4 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/MemoryURLStreamRecord.java
@@ -23,5 +23,4 @@
public WeakReference<URL> url;
public byte[] data;
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java b/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java
index c1f1f56..1c99900 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/MetadataContentHandler.java
@@ -16,12 +16,11 @@
*/
package org.apache.tika.fork;
+import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.metadata.Metadata;
-
class MetadataContentHandler extends DefaultHandler {
private final Metadata metadata;
@@ -38,5 +37,4 @@
metadata.add(aname, content);
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java b/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java
index 580b1ef..4e04498 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/ParserFactoryFactory.java
@@ -17,23 +17,19 @@
package org.apache.tika.fork;
-
import java.io.Serializable;
import java.lang.reflect.Constructor;
import java.util.Map;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.ParserFactory;
/**
- * Lightweight, easily serializable class that contains enough information
- * to build a {@link ParserFactory}
+ * Lightweight, easily serializable class that contains enough information to build a {@link
+ * ParserFactory}
*/
public class ParserFactoryFactory implements Serializable {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 4710974869988895410L;
private final String className;
@@ -53,5 +49,4 @@
throw new TikaException("Couldn't create factory", e);
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java
index 348c33d..05ecd93 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerProxy.java
@@ -23,22 +23,18 @@
import java.io.NotSerializableException;
import java.io.ObjectOutputStream;
import java.io.Serializable;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * <p>This class calls #toString() on the ContentHandler, inserts it into the Metadata object
- * and serializes the Metadata object.
- * </p>
- * Ideally, this would serialize the ContentHandler and the Metadata object as separate objects,
- * but we can't guarantee that the ContentHandler is Serializable (e.g. the StringWriter in
- * the WriteOutContentHandler).
+ * This class calls #toString() on the ContentHandler, inserts it into the Metadata object and
+ * serializes the Metadata object. Ideally, this would serialize the ContentHandler and the Metadata
+ * object as separate objects, but we can't guarantee that the ContentHandler is Serializable (e.g.
+ * the StringWriter in the WriteOutContentHandler).
*/
class RecursiveMetadataContentHandlerProxy extends RecursiveParserWrapperHandler
implements ForkProxy {
@@ -49,17 +45,15 @@
public static final byte METADATA_ONLY = 4;
public static final byte COMPLETE = 5;
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 737511106054617524L;
private final int resource;
private transient DataOutputStream output;
- public RecursiveMetadataContentHandlerProxy(int resource,
- ContentHandlerFactory contentHandlerFactory) {
+ public RecursiveMetadataContentHandlerProxy(
+ int resource, ContentHandlerFactory contentHandlerFactory) {
super(contentHandlerFactory);
this.resource = resource;
}
@@ -82,8 +76,9 @@
proxyBackToClient(MAIN_DOCUMENT, contentHandler, metadata);
}
- private void proxyBackToClient(int embeddedOrMainDocument, ContentHandler contentHandler,
- Metadata metadata) throws SAXException {
+ private void proxyBackToClient(
+ int embeddedOrMainDocument, ContentHandler contentHandler, Metadata metadata)
+ throws SAXException {
try {
output.write(ForkServer.RESOURCE);
output.writeByte(resource);
@@ -95,7 +90,7 @@
bytes = serialize(contentHandler);
success = true;
} catch (NotSerializableException e) {
- //object lied
+ // object lied
}
if (success) {
@@ -106,9 +101,9 @@
return;
}
}
- //if contenthandler is not allegedly or actually Serializable
- //fall back to adding contentHandler.toString() to the metadata object
- //and send that.
+ // if contenthandler is not allegedly or actually Serializable
+ // fall back to adding contentHandler.toString() to the metadata object
+ // and send that.
metadata.set(TikaCoreProperties.TIKA_CONTENT, contentHandler.toString());
output.writeByte(METADATA_ONLY);
send(metadata);
@@ -132,16 +127,15 @@
}
private byte[] serialize(Object object) throws IOException {
- //can't figure out why I'm getting an IllegalAccessException
- //when I try to use ForkedObjectInputStream, but
- //not when I do this manually ?!
+ // can't figure out why I'm getting an IllegalAccessException
+ // when I try to use ForkedObjectInputStream, but
+ // not when I do this manually ?!
ByteArrayOutputStream bos = new ByteArrayOutputStream();
try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
oos.writeObject(object);
oos.flush();
}
return bos.toByteArray();
-
}
private void doneSending() throws SAXException {
diff --git a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java
index 638e24d..77fcc5a 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/RecursiveMetadataContentHandlerResource.java
@@ -19,14 +19,12 @@
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
class RecursiveMetadataContentHandlerResource implements ForkResource {
@@ -51,14 +49,14 @@
byte handlerAndMetadataOrMetadataOnly = input.readByte();
ContentHandler localContentHandler = DEFAULT_HANDLER;
- if (handlerAndMetadataOrMetadataOnly ==
- RecursiveMetadataContentHandlerProxy.HANDLER_AND_METADATA) {
+ if (handlerAndMetadataOrMetadataOnly
+ == RecursiveMetadataContentHandlerProxy.HANDLER_AND_METADATA) {
localContentHandler = (ContentHandler) readObject(input);
- } else if (handlerAndMetadataOrMetadataOnly !=
- RecursiveMetadataContentHandlerProxy.METADATA_ONLY) {
+ } else if (handlerAndMetadataOrMetadataOnly
+ != RecursiveMetadataContentHandlerProxy.METADATA_ONLY) {
throw new IllegalArgumentException(
- "Expected HANDLER_AND_METADATA or METADATA_ONLY, but got:" +
- handlerAndMetadataOrMetadataOnly);
+ "Expected HANDLER_AND_METADATA or METADATA_ONLY, but got:"
+ + handlerAndMetadataOrMetadataOnly);
}
Metadata metadata = (Metadata) readObject(input);
@@ -82,6 +80,5 @@
} catch (ClassNotFoundException e) {
throw new IOException(e);
}
-
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java b/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java
index 6610437..254783a 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/TimeoutLimits.java
@@ -22,7 +22,6 @@
private final long parseTimeoutMS;
private final long waitTimeoutMS;
-
TimeoutLimits(long pulseMS, long parseTimeoutMS, long waitTimeoutMS) {
this.pulseMS = pulseMS;
this.parseTimeoutMS = parseTimeoutMS;
diff --git a/tika-core/src/main/java/org/apache/tika/fork/package-info.java b/tika-core/src/main/java/org/apache/tika/fork/package-info.java
index 74cdd06..f03be88 100644
--- a/tika-core/src/main/java/org/apache/tika/fork/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/fork/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * Forked parser.
- */
+/** Forked parser. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.fork;
diff --git a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
index 31290cc..5d2a4aa 100644
--- a/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/BoundedInputStream.java
@@ -21,16 +21,14 @@
import java.io.OutputStream;
/**
- * Very slight modification of Commons' BoundedInputStream
- * so that we can figure out if this hit the bound or not.
- * <p>
- * This relies on IOUtils' skip and read to try to fully
- * read/skip inputstream.
+ * Very slight modification of Commons' BoundedInputStream so that we can figure out if this hit the
+ * bound or not.
+ *
+ * <p>This relies on IOUtils' skip and read to try to fully read/skip inputstream.
*/
public class BoundedInputStream extends InputStream {
-
- private final static int EOF = -1;
+ private static final int EOF = -1;
private final long max;
private final InputStream in;
private long pos;
@@ -54,8 +52,7 @@
* Invokes the delegate's <code>read(byte[])</code> method.
*
* @param b the buffer to read the bytes into
- * @return the number of bytes read or -1 if the end of stream or
- * the limit has been reached.
+ * @return the number of bytes read or -1 if the end of stream or the limit has been reached.
* @throws IOException if an I/O error occurs
*/
@Override
@@ -65,14 +62,13 @@
/**
* Invokes the delegate's <code>read(byte[], int, int)</code> method.
- * <p>
- * This does not have the same guarantees as IOUtil's readFully()...be careful.
*
- * @param b the buffer to read the bytes into
+ * <p>This does not have the same guarantees as IOUtil's readFully()...be careful.
+ *
+ * @param b the buffer to read the bytes into
* @param off The start offset
* @param len The number of bytes to read
- * @return the number of bytes read or -1 if the end of stream or
- * the limit has been reached.
+ * @return the number of bytes read or -1 if the end of stream or the limit has been reached.
* @throws IOException if an I/O error occurs
*/
@Override
@@ -92,9 +88,8 @@
}
/**
- * Invokes the delegate's <code>skip(long)</code> method.
- * As with InputStream generally, this does not guarantee reading n bytes.
- * Use IOUtils' skipFully for that functionality.
+ * Invokes the delegate's <code>skip(long)</code> method. As with InputStream generally, this
+ * does not guarantee reading n bytes. Use IOUtils' skipFully for that functionality.
*
* @param n the number of bytes to skip
* @return the actual number of bytes skipped
diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
index 242dd8c..14d859b 100644
--- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
@@ -18,16 +18,15 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.exception.TikaException;
/**
* General Endian Related Utilties.
- * <p>
- * This class provides static utility methods for input/output operations
- * on numbers in Big and Little Endian formats.
- * <p>
- * Origin of code: Based on the version in POI
+ *
+ * <p>This class provides static utility methods for input/output operations on numbers in Big and
+ * Little Endian formats.
+ *
+ * <p>Origin of code: Based on the version in POI
*/
public class EndianUtils {
private static final int LONG_SIZE = 8;
@@ -37,7 +36,7 @@
*
* @param stream the InputStream from which the short is to be read
* @return the short (16-bit) value
- * @throws IOException will be propagated back to the caller
+ * @throws IOException will be propagated back to the caller
* @throws BufferUnderrunException if the stream cannot provide enough bytes
*/
public static short readShortLE(InputStream stream)
@@ -50,7 +49,7 @@
*
* @param stream the InputStream from which the short is to be read
* @return the short (16-bit) value
- * @throws IOException will be propagated back to the caller
+ * @throws IOException will be propagated back to the caller
* @throws BufferUnderrunException if the stream cannot provide enough bytes
*/
public static short readShortBE(InputStream stream)
@@ -81,7 +80,7 @@
*
* @param stream the InputStream from which the int is to be read
* @return the int (32-bit) value
- * @throws IOException will be propagated back to the caller
+ * @throws IOException will be propagated back to the caller
* @throws BufferUnderrunException if the stream cannot provide enough bytes
*/
public static long readUIntLE(InputStream stream) throws IOException, BufferUnderrunException {
@@ -100,7 +99,7 @@
*
* @param stream the InputStream from which the int is to be read
* @return the int (32-bit) value
- * @throws IOException will be propagated back to the caller
+ * @throws IOException will be propagated back to the caller
* @throws BufferUnderrunException if the stream cannot provide enough bytes
*/
public static long readUIntBE(InputStream stream) throws IOException, BufferUnderrunException {
@@ -119,7 +118,7 @@
*
* @param stream the InputStream from which the int is to be read
* @return the int (32-bit) value
- * @throws IOException will be propagated back to the caller
+ * @throws IOException will be propagated back to the caller
* @throws BufferUnderrunException if the stream cannot provide enough bytes
*/
public static int readIntLE(InputStream stream) throws IOException, BufferUnderrunException {
@@ -138,7 +137,7 @@
*
* @param stream the InputStream from which the int is to be read
* @return the int (32-bit) value
- * @throws IOException will be propagated back to the caller
+ * @throws IOException will be propagated back to the caller
* @throws BufferUnderrunException if the stream cannot provide enough bytes
*/
public static int readIntBE(InputStream stream) throws IOException, BufferUnderrunException {
@@ -157,7 +156,7 @@
*
* @param stream the InputStream from which the int is to be read
* @return the int (32-bit) value
- * @throws IOException will be propagated back to the caller
+ * @throws IOException will be propagated back to the caller
* @throws BufferUnderrunException if the stream cannot provide enough bytes
*/
public static int readIntME(InputStream stream) throws IOException, BufferUnderrunException {
@@ -176,7 +175,7 @@
*
* @param stream the InputStream from which the long is to be read
* @return the long (64-bit) value
- * @throws IOException will be propagated back to the caller
+ * @throws IOException will be propagated back to the caller
* @throws BufferUnderrunException if the stream cannot provide enough bytes
*/
public static long readLongLE(InputStream stream) throws IOException, BufferUnderrunException {
@@ -192,9 +191,15 @@
throw new BufferUnderrunException();
}
- return ((long) ch8 << 56) + ((long) ch7 << 48) + ((long) ch6 << 40) + ((long) ch5 << 32) +
- ((long) ch4 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
- (ch3 << 16) + (ch2 << 8) + (ch1);
+ return ((long) ch8 << 56)
+ + ((long) ch7 << 48)
+ + ((long) ch6 << 40)
+ + ((long) ch5 << 32)
+ + ((long) ch4 << 24)
+ + // cast to long to preserve bit 31 (sign bit for ints)
+ (ch3 << 16)
+ + (ch2 << 8)
+ + (ch1);
}
/**
@@ -202,7 +207,7 @@
*
* @param stream the InputStream from which the long is to be read
* @return the long (64-bit) value
- * @throws IOException will be propagated back to the caller
+ * @throws IOException will be propagated back to the caller
* @throws BufferUnderrunException if the stream cannot provide enough bytes
*/
public static long readLongBE(InputStream stream) throws IOException, BufferUnderrunException {
@@ -218,14 +223,20 @@
throw new BufferUnderrunException();
}
- return ((long) ch1 << 56) + ((long) ch2 << 48) + ((long) ch3 << 40) + ((long) ch4 << 32) +
- ((long) ch5 << 24) + // cast to long to preserve bit 31 (sign bit for ints)
- (ch6 << 16) + (ch7 << 8) + (ch8);
+ return ((long) ch1 << 56)
+ + ((long) ch2 << 48)
+ + ((long) ch3 << 40)
+ + ((long) ch4 << 32)
+ + ((long) ch5 << 24)
+ + // cast to long to preserve bit 31 (sign bit for ints)
+ (ch6 << 16)
+ + (ch7 << 8)
+ + (ch8);
}
/**
- * Gets the integer value that is stored in UTF-8 like fashion, in Big Endian
- * but with the high bit on each number indicating if it continues or not
+ * Gets the integer value that is stored in UTF-8 like fashion, in Big Endian but with the high
+ * bit on each number indicating if it continues or not
*/
public static long readUE7(InputStream stream) throws IOException {
int i;
@@ -262,7 +273,7 @@
/**
* Get a LE short value from a byte array
*
- * @param data the byte array
+ * @param data the byte array
* @param offset a starting offset into the byte array
* @return the short (16-bit) value
*/
@@ -283,7 +294,7 @@
/**
* Get a LE unsigned short value from a byte array
*
- * @param data the byte array
+ * @param data the byte array
* @param offset a starting offset into the byte array
* @return the unsigned short (16-bit) value in an integer
*/
@@ -306,7 +317,7 @@
/**
* Get a BE short value from a byte array
*
- * @param data the byte array
+ * @param data the byte array
* @param offset a starting offset into the byte array
* @return the short (16-bit) value
*/
@@ -327,7 +338,7 @@
/**
* Get a BE unsigned short value from a byte array
*
- * @param data the byte array
+ * @param data the byte array
* @param offset a starting offset into the byte array
* @return the unsigned short (16-bit) value in an integer
*/
@@ -350,7 +361,7 @@
/**
* Get a LE int value from a byte array
*
- * @param data the byte array
+ * @param data the byte array
* @param offset a starting offset into the byte array
* @return the int (32-bit) value
*/
@@ -376,7 +387,7 @@
/**
* Get a BE int value from a byte array
*
- * @param data the byte array
+ * @param data the byte array
* @param offset a starting offset into the byte array
* @return the int (32-bit) value
*/
@@ -402,7 +413,7 @@
/**
* Get a LE unsigned int value from a byte array
*
- * @param data the byte array
+ * @param data the byte array
* @param offset a starting offset into the byte array
* @return the unsigned int (32-bit) value in a long
*/
@@ -424,7 +435,7 @@
/**
* Get a BE unsigned int value from a byte array
*
- * @param data the byte array
+ * @param data the byte array
* @param offset a starting offset into the byte array
* @return the unsigned int (32-bit) value in a long
*/
@@ -436,7 +447,7 @@
/**
* Get a LE long value from a byte array
*
- * @param data the byte array
+ * @param data the byte array
* @param offset a starting offset into the byte array
* @return the long (64-bit) value
*/
@@ -451,8 +462,7 @@
}
/**
- * Convert an 'unsigned' byte to an integer. ie, don't carry across the
- * sign.
+ * Convert an 'unsigned' byte to an integer. i.e., don't carry across the sign.
*
* @param b Description of the Parameter
* @return Description of the Return Value
@@ -464,7 +474,7 @@
/**
* get the unsigned value of a byte.
*
- * @param data the byte array.
+ * @param data the byte array.
* @param offset a starting offset into the byte array.
* @return the unsigned value of the byte as a 16 bit short
*/
@@ -472,7 +482,6 @@
return (short) (data[offset] & 0xFF);
}
-
public static class BufferUnderrunException extends TikaException {
private static final long serialVersionUID = 8358288231138076276L;
diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
index 17bc9e9..fccf0c9 100644
--- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
@@ -18,23 +18,18 @@
import java.util.HashSet;
import java.util.Locale;
-
import org.apache.tika.utils.StringUtils;
-
public class FilenameUtils {
+ /** Reserved characters */
+ public static final char[] RESERVED_FILENAME_CHARACTERS = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E,
+ 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D,
+ 0x1E, 0x1F, '?', ':', '*', '<', '>', '|'
+ };
- /**
- * Reserved characters
- */
- public final static char[] RESERVED_FILENAME_CHARACTERS =
- {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
- 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A,
- 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, '?', ':', '*', '<', '>', '|'};
-
- private final static HashSet<Character> RESERVED = new HashSet<>(38);
-
+ private static final HashSet<Character> RESERVED = new HashSet<>(38);
static {
for (char reservedFilenameCharacter : RESERVED_FILENAME_CHARACTERS) {
@@ -42,13 +37,12 @@
}
}
-
/**
- * Scans the given file name for reserved characters on different OSs and
- * file systems and returns a sanitized version of the name with the
- * reserved chars replaced by their hexadecimal value.
- * <p>
- * For example <code>why?.zip</code> will be converted into <code>why%3F.zip</code>
+ * Scans the given file name for reserved characters on different OSs and file systems and
+ * returns a sanitized version of the name with the reserved chars replaced by their hexadecimal
+ * value.
+ *
+ * <p>For example <code>why?.zip</code> will be converted into <code>why%3F.zip</code>
*
* @param name the file name to be normalized - NOT NULL
* @return the normalized file name
@@ -63,7 +57,8 @@
for (char c : name.toCharArray()) {
if (RESERVED.contains(c)) {
- sb.append('%').append((c < 16) ? "0" : "")
+ sb.append('%')
+ .append((c < 16) ? "0" : "")
.append(Integer.toHexString(c).toUpperCase(Locale.ROOT));
} else {
sb.append(c);
@@ -74,20 +69,17 @@
}
/**
- * This is a duplication of the algorithm and functionality
- * available in commons io FilenameUtils. If Java's File were
- * able handle Windows file paths correctly in linux,
- * we wouldn't need this.
- * <p>
- * The goal of this is to get a filename from a path.
- * The package parsers and some other embedded doc
- * extractors could put anything into TikaCoreProperties.RESOURCE_NAME_KEY.
- * <p>
- * If a careless client used that filename as if it were a
- * filename and not a path when writing embedded files,
- * bad things could happen. Consider: "../../../my_ppt.ppt".
- * <p>
- * Consider using this in combination with {@link #normalize(String)}.
+ * This is a duplication of the algorithm and functionality available in commons io
+ * FilenameUtils. If Java's File were able to handle Windows file paths correctly in Linux, we
+ * wouldn't need this.
+ *
+ * <p>The goal of this is to get a filename from a path. The package parsers and some other
+ * embedded doc extractors could put anything into TikaCoreProperties.RESOURCE_NAME_KEY.
+ *
+ * <p>If a careless client used that filename as if it were a filename and not a path when
+ * writing embedded files, bad things could happen. Consider: "../../../my_ppt.ppt".
+ *
+ * <p>Consider using this in combination with {@link #normalize(String)}.
*
* @param path path to strip
* @return empty string or a filename, never null
@@ -99,8 +91,8 @@
}
int unix = path.lastIndexOf("/");
int windows = path.lastIndexOf("\\");
- //some macintosh file names are stored with : as the delimiter
- //also necessary to properly handle C:somefilename
+ // some macintosh file names are stored with : as the delimiter
+ // also necessary to properly handle C:somefilename
int colon = path.lastIndexOf(":");
String cand = path.substring(Math.max(colon, Math.max(unix, windows)) + 1);
if (cand.equals("..") || cand.equals(".")) {
@@ -111,13 +103,14 @@
/**
* This includes the period, e.g. ".pdf"
+ *
* @param path
* @return the suffix or an empty string if one could not be found
*/
public static String getSuffixFromPath(String path) {
String n = getName(path);
int i = n.lastIndexOf(".");
- //arbitrarily sets max extension length
+ // arbitrarily sets max extension length
if (i > -1 && n.length() - i < 6) {
return n.substring(i);
}
diff --git a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
index 247705b..f96935e 100644
--- a/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/IOUtils.java
@@ -36,7 +36,8 @@
*/
long remain = toSkip;
while (remain > 0) {
- // See https://issues.apache.org/jira/browse/IO-203 for why we use read() rather than delegating to skip()
+ // See https://issues.apache.org/jira/browse/IO-203 for why we use read() rather than
+ // delegating to skip()
final long n = input.read(buffer, 0, (int) Math.min(remain, buffer.length));
if (n < 0) { // EOF
break;
diff --git a/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java b/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java
index 17e416a..6ca0e30 100644
--- a/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/io/InputStreamFactory.java
@@ -20,14 +20,13 @@
import java.io.InputStream;
/**
- * <p>A factory which returns a fresh {@link InputStream} for the <em>same</em>
- * resource each time.</p>
- * <p>This is typically desired where it is easier / quicker / simpler to
- * fetch a fresh {@link InputStream} to re-read a given resource, rather
- * than do any kind of buffering.</p>
- * <p>It is typically used with {@link TikaInputStream#get(InputStreamFactory)}
- * when combined with a Parser which needs to read the resource's stream
- * multiple times when processing.</p>
+ * A factory which returns a fresh {@link InputStream} for the <em>same</em> resource each time.
+ *
+ * <p>This is typically desired where it is easier / quicker / simpler to fetch a fresh {@link
+ * InputStream} to re-read a given resource, rather than do any kind of buffering.
+ *
+ * <p>It is typically used with {@link TikaInputStream#get(InputStreamFactory)} when combined with a
+ * Parser which needs to read the resource's stream multiple times when processing.
*/
public interface InputStreamFactory {
InputStream getInputStream() throws IOException;
diff --git a/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java b/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java
index 32e671e..146d830 100644
--- a/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/LookaheadInputStream.java
@@ -20,24 +20,23 @@
import java.io.InputStream;
/**
- * Stream wrapper that make it easy to read up to n bytes ahead from
- * a stream that supports the mark feature. This class insulates the
- * underlying stream from things like possible mark(), reset() and close()
- * calls by external components that might otherwise invalidate the marked
- * state of a stream.
- * <p>
- * The recommended usage pattern of this class is:
+ * Stream wrapper that makes it easy to read up to n bytes ahead from a stream that supports the mark
+ * feature. This class insulates the underlying stream from things like possible mark(), reset() and
+ * close() calls by external components that might otherwise invalidate the marked state of a
+ * stream.
+ *
+ * <p>The recommended usage pattern of this class is:
+ *
* <pre>
* try (InputStream lookahead = new LookaheadInputStream(stream, n)) {
* processStream(lookahead);
* }
* </pre>
- * <p>
- * This usage pattern guarantees that only up to n bytes from the original
- * stream can ever be read, and that the stream will have been marked and
- * then reset to its original state once the above code block exits. No
- * code in the fictional processStream() method can affect the the state of
- * the original stream.
+ *
+ * <p>This usage pattern guarantees that only up to n bytes from the original stream can ever be
+ * read, and that the stream will have been marked and then reset to its original state once the
+ * above code block exits. No code in the fictional processStream() method can affect the state
+ * of the original stream.
*
* @since Apache Tika 0.10
*/
@@ -52,14 +51,13 @@
private int mark = 0;
/**
- * Creates a lookahead wrapper for the given input stream.
- * The given input stream should support the mark feature,
- * as otherwise the state of that stream will be undefined
- * after the lookahead wrapper has been closed. As a special
- * case a <code>null</code> stream is treated as an empty stream.
+ * Creates a lookahead wrapper for the given input stream. The given input stream should support
+ * the mark feature, as otherwise the state of that stream will be undefined after the lookahead
+ * wrapper has been closed. As a special case a <code>null</code> stream is treated as an empty
+ * stream.
*
* @param stream input stream, can be <code>null</code>
- * @param n maximum number of bytes to look ahead
+ * @param n maximum number of bytes to look ahead
*/
public LookaheadInputStream(InputStream stream, int n) {
this.stream = stream;
@@ -138,5 +136,4 @@
public synchronized void reset() {
position = mark;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/io/TailStream.java b/tika-core/src/main/java/org/apache/tika/io/TailStream.java
index a1621c2..5f335b3 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TailStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TailStream.java
@@ -21,66 +21,46 @@
import java.io.InputStream;
/**
- * <p>
- * A specialized input stream implementation which records the last portion read
- * from an underlying stream.
- * </p>
- * <p>
- * This stream implementation is useful to deal with information which is known
- * to be located at the end of a stream (e.g. ID3 v1 tags). While reading bytes
- * from the underlying stream, a given number of bytes is kept in an internal
- * buffer. This buffer can then be queried after the whole stream was read. It
- * contains the last bytes read from the original input stream.
- * </p>
+ * A specialized input stream implementation which records the last portion read from an underlying
+ * stream.
*
- * @param in the underlying input stream
+ * <p>This stream implementation is useful to deal with information which is known to be located at
+ * the end of a stream (e.g. ID3 v1 tags). While reading bytes from the underlying stream, a given
+ * number of bytes is kept in an internal buffer. This buffer can then be queried after the whole
+ * stream was read. It contains the last bytes read from the original input stream.
+ *
+ * @param in the underlying input stream
* @param tailSize the size of the tail buffer
*/
public class TailStream extends FilterInputStream {
- /**
- * Constant for the default skip buffer size.
- */
+ /** Constant for the default skip buffer size. */
private static final int SKIP_SIZE = 4096;
- /**
- * The buffer in which the tail data is stored.
- */
+ /** The buffer in which the tail data is stored. */
private final byte[] tailBuffer;
- /**
- * The size of the internal tail buffer.
- */
+ /** The size of the internal tail buffer. */
private final int tailSize;
- /**
- * A copy of the internal tail buffer used for mark() operations.
- */
+ /** A copy of the internal tail buffer used for mark() operations. */
private byte[] markBuffer;
- /**
- * The number of bytes that have been read so far.
- */
+ /** The number of bytes that have been read so far. */
private long bytesRead;
- /**
- * The number of bytes read at the last mark() operation.
- */
+ /** The number of bytes read at the last mark() operation. */
private long markBytesRead;
- /**
- * The current index into the tail buffer.
- */
+ /** The current index into the tail buffer. */
private int currentIndex;
- /**
- * A copy of the current index used for mark() operations.
- */
+ /** A copy of the current index used for mark() operations. */
private int markIndex;
/**
* Creates a new instance of {@code TailStream}.
*
- * @param in the underlying input stream
+ * @param in the underlying input stream
* @param size the size of the tail buffer
*/
public TailStream(InputStream in, int size) {
@@ -89,10 +69,7 @@
tailBuffer = new byte[size];
}
- /**
- * {@inheritDoc} This implementation adds the read byte to the internal tail
- * buffer.
- */
+ /** {@inheritDoc} This implementation adds the read byte to the internal tail buffer. */
@Override
public int read() throws IOException {
int c = super.read();
@@ -103,9 +80,8 @@
}
/**
- * {@inheritDoc} This implementation delegates to the underlying stream and
- * then adds the correct portion of the read buffer to the internal tail
- * buffer.
+ * {@inheritDoc} This implementation delegates to the underlying stream and then adds the
+ * correct portion of the read buffer to the internal tail buffer.
*/
@Override
public int read(byte[] buf) throws IOException {
@@ -117,9 +93,8 @@
}
/**
- * {@inheritDoc} This implementation delegates to the underlying stream and
- * then adds the correct portion of the read buffer to the internal tail
- * buffer.
+ * {@inheritDoc} This implementation delegates to the underlying stream and then adds the
+ * correct portion of the read buffer to the internal tail buffer.
*/
@Override
public int read(byte[] buf, int ofs, int length) throws IOException {
@@ -131,8 +106,8 @@
}
/**
- * {@inheritDoc} This implementation delegates to the {@code read()} method
- * to ensure that the tail buffer is also filled if data is skipped.
+ * {@inheritDoc} This implementation delegates to the {@code read()} method to ensure that the
+ * tail buffer is also filled if data is skipped.
*/
@Override
public long skip(long n) throws IOException {
@@ -153,9 +128,8 @@
}
/**
- * {@inheritDoc} This implementation saves the internal state including the
- * content of the tail buffer so that it can be restored when ''reset()'' is
- * called later.
+ * {@inheritDoc} This implementation saves the internal state including the content of the tail
+ * buffer so that it can be restored when ''reset()'' is called later.
*/
@Override
public void mark(int limit) {
@@ -166,9 +140,9 @@
}
/**
- * {@inheritDoc} This implementation restores this stream's state to the
- * state when ''mark()'' was called the last time. If ''mark()'' has not
- * been called before, this method has no effect.
+ * {@inheritDoc} This implementation restores this stream's state to the state when ''mark()''
+ * was called the last time. If ''mark()'' has not been called before, this method has no
+ * effect.
*/
@Override
public void reset() {
@@ -180,10 +154,9 @@
}
/**
- * Returns an array with the last data read from the underlying stream. If
- * the underlying stream contained more data than the ''tailSize''
- * constructor argument, the returned array has a length of ''tailSize''.
- * Otherwise, its length equals the number of bytes read.
+ * Returns an array with the last data read from the underlying stream. If the underlying stream
+ * contained more data than the ''tailSize'' constructor argument, the returned array has a
+ * length of ''tailSize''. Otherwise, its length equals the number of bytes read.
*
* @return an array with the last data read from the underlying stream
*/
@@ -211,8 +184,8 @@
/**
* Adds the content of the given buffer to the internal tail buffer.
*
- * @param buf the buffer
- * @param ofs the start offset in the buffer
+ * @param buf the buffer
+ * @param ofs the start offset in the buffer
* @param length the number of bytes to be copied
*/
private void appendBuf(byte[] buf, int ofs, int length) {
@@ -226,12 +199,12 @@
}
/**
- * Replaces the content of the internal tail buffer by the last portion of
- * the given buffer. This method is called if a buffer was read from the
- * underlying stream whose length is larger than the tail buffer.
+ * Replaces the content of the internal tail buffer by the last portion of the given buffer.
+ * This method is called if a buffer was read from the underlying stream whose length is larger
+ * than the tail buffer.
*
- * @param buf the buffer
- * @param ofs the start offset in the buffer
+ * @param buf the buffer
+ * @param ofs the start offset in the buffer
* @param length the number of bytes to be copied
*/
private void replaceTailBuffer(byte[] buf, int ofs, int length) {
@@ -240,13 +213,12 @@
}
/**
- * Copies the given buffer into the internal tail buffer at the current
- * position. This method is called if a buffer is read from the underlying
- * stream whose length is smaller than the tail buffer. In this case the
- * tail buffer is only partly overwritten.
+ * Copies the given buffer into the internal tail buffer at the current position. This method is
+ * called if a buffer is read from the underlying stream whose length is smaller than the tail
+ * buffer. In this case the tail buffer is only partly overwritten.
*
- * @param buf the buffer
- * @param ofs the start offset in the buffer
+ * @param buf the buffer
+ * @param ofs the start offset in the buffer
* @param length the number of bytes to be copied
*/
private void copyToTailBuffer(byte[] buf, int ofs, int length) {
diff --git a/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java b/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java
index c1565ab..284ba3d 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TemporaryResources.java
@@ -22,20 +22,18 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.LinkedList;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.utils.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * Utility class for tracking and ultimately closing or otherwise disposing
- * a collection of temporary resources.
- * <p>
- * Note that this class is not thread-safe.
+ * Utility class for tracking and ultimately closing or otherwise disposing a collection of
+ * temporary resources.
+ *
+ * <p>Note that this class is not thread-safe.
*
* @since Apache Tika 0.10
*/
@@ -43,33 +41,27 @@
private static final Logger LOG = LoggerFactory.getLogger(TemporaryResources.class);
- /**
- * Tracked resources in LIFO order.
- */
+ /** Tracked resources in LIFO order. */
private final LinkedList<Closeable> resources = new LinkedList<>();
- /**
- * Directory for temporary files, <code>null</code> for the system default.
- */
+ /** Directory for temporary files, <code>null</code> for the system default. */
private Path tempFileDir = null;
/**
- * Sets the directory to be used for the temporary files created by
- * the {@link #createTempFile(String)} method.
+ * Sets the directory to be used for the temporary files created by the {@link
+ * #createTempFile(String)} method.
*
- * @param tempFileDir temporary file directory,
- * or <code>null</code> for the system default
+ * @param tempFileDir temporary file directory, or <code>null</code> for the system default
*/
public void setTemporaryFileDirectory(Path tempFileDir) {
this.tempFileDir = tempFileDir;
}
/**
- * Sets the directory to be used for the temporary files created by
- * the {@link #createTempFile(String)} method.
+ * Sets the directory to be used for the temporary files created by the {@link
+ * #createTempFile(String)} method.
*
- * @param tempFileDir temporary file directory,
- * or <code>null</code> for the system default
+ * @param tempFileDir temporary file directory, or <code>null</code> for the system default
* @see #setTemporaryFileDirectory(Path)
*/
public void setTemporaryFileDirectory(File tempFileDir) {
@@ -77,8 +69,9 @@
}
/**
- * Creates a temporary file that will automatically be deleted when
- * the {@link #close()} method is called, returning its path.
+ * Creates a temporary file that will automatically be deleted when the {@link #close()} method
+ * is called, returning its path.
+ *
* @param suffix -- the suffix of the file if known, starting with "." as in ".pdf"
* @return Path to created temporary file that will be deleted after closing
* @throws IOException
@@ -86,17 +79,20 @@
public Path createTempFile(String suffix) throws IOException {
String actualSuffix = StringUtils.isBlank(suffix) ? ".tmp" : suffix;
- final Path path = tempFileDir == null ? Files.createTempFile("apache-tika-", actualSuffix) :
- Files.createTempFile(tempFileDir, "apache-tika-", actualSuffix);
- addResource(() -> {
- try {
- Files.delete(path);
- } catch (IOException e) {
- // delete when exit if current delete fail
- LOG.warn("delete tmp file fail, will delete it on exit");
- path.toFile().deleteOnExit();
- }
- });
+ final Path path =
+ tempFileDir == null
+ ? Files.createTempFile("apache-tika-", actualSuffix)
+ : Files.createTempFile(tempFileDir, "apache-tika-", actualSuffix);
+ addResource(
+ () -> {
+ try {
+ Files.delete(path);
+ } catch (IOException e) {
+ // delete when exit if current delete fail
+ LOG.warn("delete tmp file fail, will delete it on exit");
+ path.toFile().deleteOnExit();
+ }
+ });
return path;
}
@@ -105,8 +101,8 @@
}
/**
- * Creates a temporary file that will automatically be deleted when
- * the {@link #close()} method is called, returning its path.
+ * Creates a temporary file that will automatically be deleted when the {@link #close()} method
+ * is called, returning its path.
*
* @return Path to created temporary file that will be deleted after closing
* @throws IOException
@@ -118,9 +114,10 @@
}
return createTempFile(FilenameUtils.getSuffixFromPath(resourceName));
}
+
/**
- * Creates and returns a temporary file that will automatically be
- * deleted when the {@link #close()} method is called.
+ * Creates and returns a temporary file that will automatically be deleted when the {@link
+ * #close()} method is called.
*
* @return Created temporary file that'll be deleted after closing
* @throws IOException
@@ -131,8 +128,8 @@
}
/**
- * Adds a new resource to the set of tracked resources that will all be
- * closed when the {@link #close()} method is called.
+ * Adds a new resource to the set of tracked resources that will all be closed when the {@link
+ * #close()} method is called.
*
* @param resource resource to be tracked
*/
@@ -141,8 +138,8 @@
}
/**
- * Returns the latest of the tracked resources that implements or
- * extends the given interface or class.
+ * Returns the latest of the tracked resources that implements or extends the given interface or
+ * class.
*
* @param klass interface or class
* @return matching resource, or <code>null</code> if not found
@@ -158,15 +155,13 @@
}
/**
- * Closes all tracked resources. The resources are closed in reverse order
- * from how they were added.
- * <p>
- * Any suppressed exceptions from managed resources are collected and
- * then added to the first thrown exception, which is re-thrown once
- * all the resources have been closed.
+ * Closes all tracked resources. The resources are closed in reverse order from how they were
+ * added.
*
- * @throws IOException if one or more of the tracked resources
- * could not be closed
+ * <p>Any suppressed exceptions from managed resources are collected and then added to the first
+ * thrown exception, which is re-thrown once all the resources have been closed.
+ *
+ * @throws IOException if one or more of the tracked resources could not be closed
*/
public void close() throws IOException {
// Release all resources and keep track of any exceptions
@@ -191,12 +186,10 @@
}
/**
- * Calls the {@link #close()} method and wraps the potential
- * {@link IOException} into a {@link TikaException} for convenience
- * when used within Tika.
+ * Calls the {@link #close()} method and wraps the potential {@link IOException} into a {@link
+ * TikaException} for convenience when used within Tika.
*
- * @throws TikaException if one or more of the tracked resources
- * could not be closed
+ * @throws TikaException if one or more of the tracked resources could not be closed
*/
public void dispose() throws TikaException {
try {
@@ -205,5 +198,4 @@
throw new TikaException("Failed to close temporary resources", e);
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index a70ade4..b121ad4 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -35,97 +35,88 @@
import java.nio.file.Paths;
import java.sql.Blob;
import java.sql.SQLException;
-
import org.apache.commons.io.input.TaggedInputStream;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.Parser;
import org.apache.tika.utils.StringUtils;
/**
- * Input stream with extended capabilities. The purpose of this class is
- * to allow files and other resources and information to be associated with
- * the {@link InputStream} instance passed through the
- * {@link org.apache.tika.parser.Parser} interface and other similar APIs.
- * <p>
- * TikaInputStream instances can be created using the various static
- * <code>get()</code> factory methods. Most of these methods take an optional
- * {@link Metadata} argument that is then filled with the available input
- * metadata from the given resource. The created TikaInputStream instance
- * keeps track of the original resource used to create it, while behaving
- * otherwise just like a normal, buffered {@link InputStream}.
- * A TikaInputStream instance is also guaranteed to support the
- * {@link #mark(int)} feature.
- * <p>
- * Code that wants to access the underlying file or other resources
- * associated with a TikaInputStream should first use the
- * {@link #get(InputStream)} factory method to cast or wrap a given
- * {@link InputStream} into a TikaInputStream instance.
- * <p>
- * TikaInputStream includes a few safety features to protect against parsers
- * that may fail to check for an EOF or may incorrectly rely on the unreliable
- * value returned from {@link FileInputStream#skip}. These parser failures
- * can lead to infinite loops. We strongly encourage the use of
- * TikaInputStream.
+ * Input stream with extended capabilities. The purpose of this class is to allow files and other
+ * resources and information to be associated with the {@link InputStream} instance passed through
+ * the {@link org.apache.tika.parser.Parser} interface and other similar APIs.
+ *
+ * <p>TikaInputStream instances can be created using the various static <code>get()</code> factory
+ * methods. Most of these methods take an optional {@link Metadata} argument that is then filled
+ * with the available input metadata from the given resource. The created TikaInputStream instance
+ * keeps track of the original resource used to create it, while behaving otherwise just like a
+ * normal, buffered {@link InputStream}. A TikaInputStream instance is also guaranteed to support
+ * the {@link #mark(int)} feature.
+ *
+ * <p>Code that wants to access the underlying file or other resources associated with a
+ * TikaInputStream should first use the {@link #get(InputStream)} factory method to cast or wrap a
+ * given {@link InputStream} into a TikaInputStream instance.
+ *
+ * <p>TikaInputStream includes a few safety features to protect against parsers that may fail to
+ * check for an EOF or may incorrectly rely on the unreliable value returned from {@link
+ * FileInputStream#skip}. These parser failures can lead to infinite loops. We strongly encourage
+ * the use of TikaInputStream.
*
* @since Apache Tika 0.8
*/
public class TikaInputStream extends TaggedInputStream {
private static final int MAX_CONSECUTIVE_EOFS = 1000;
+
/**
- * Blob size threshold that limits the largest BLOB size to be
- * buffered fully in memory by the {@link #get(Blob, Metadata)}
- * method.
+ * Blob size threshold that limits the largest BLOB size to be buffered fully in memory by the
+ * {@link #get(Blob, Metadata)} method.
*/
private static final int BLOB_SIZE_THRESHOLD = 1024 * 1024;
- /**
- * Tracker of temporary resources.
- */
+
+ /** Tracker of temporary resources. */
private final TemporaryResources tmp;
+
/**
- * The Factory that can create fresh {@link InputStream}s for
- * the resource this reads for, eg when needing to re-read.
+ * The Factory that can create fresh {@link InputStream}s for the resource this reads for, eg
+ * when needing to re-read.
*/
private InputStreamFactory streamFactory;
+
/**
- * The path to the file that contains the contents of this stream.
- * This is either the original file passed to the
- * {@link #TikaInputStream(Path)} constructor or a temporary file created
- * by a call to the {@link #getPath()} method. If neither has been called,
- * then the value is <code>null</code>.
+ * The path to the file that contains the contents of this stream. This is either the original
+ * file passed to the {@link #TikaInputStream(Path)} constructor or a temporary file created by
+ * a call to the {@link #getPath()} method. If neither has been called, then the value is <code>
+ * null</code>.
*/
private Path path;
- /**
- * Total length of the stream, or -1 if unknown.
- */
+
+ /** Total length of the stream, or -1 if unknown. */
private long length;
- /**
- * Current read position within this stream.
- */
+
+ /** Current read position within this stream. */
private long position = 0;
- /**
- * Marked position, or -1 if there is no current mark.
- */
+
+ /** Marked position, or -1 if there is no current mark. */
private long mark = -1;
+
/**
- * A opened container, such as a POIFS FileSystem
- * for an OLE2 document, or a Zip file for a
- * zip based (eg ooxml, odf) document.
+ * A opened container, such as a POIFS FileSystem for an OLE2 document, or a Zip file for a zip
+ * based (eg ooxml, odf) document.
*/
private Object openContainer;
+
private int consecutiveEOFs = 0;
private byte[] skipBuffer;
- //suffix of the file if known. This is used to create temp files
- //with the right suffixes. This should include the initial . as in ".doc"
+ // suffix of the file if known. This is used to create temp files
+ // with the right suffixes. This should include the initial . as in ".doc"
private String suffix = null;
/**
- * Creates a TikaInputStream instance. This private constructor is used
- * by the static factory methods based on the available information.
+ * Creates a TikaInputStream instance. This private constructor is used by the static factory
+ * methods based on the available information.
*
* @param path the path to the file that contains the stream
* @throws IOException if an I/O error occurs
@@ -147,8 +138,8 @@
}
/**
- * Creates a TikaInputStream instance. This private constructor is used
- * by the static factory methods based on the available information.
+ * Creates a TikaInputStream instance. This private constructor is used by the static factory
+ * methods based on the available information.
*
* @param file the file that contains the stream
* @throws FileNotFoundException if the file does not exist
@@ -161,22 +152,21 @@
this.tmp = new TemporaryResources();
this.length = file.length();
this.suffix = FilenameUtils.getSuffixFromPath(path.getFileName().toString());
-
}
/**
- * Creates a TikaInputStream instance. This private constructor is used
- * by the static factory methods based on the available information.
- * <p>
- * The given stream needs to be included in the given temporary resource
- * collection if the caller wants it also to get closed when the
- * {@link #close()} method is invoked.
+ * Creates a TikaInputStream instance. This private constructor is used by the static factory
+ * methods based on the available information.
+ *
+ * <p>The given stream needs to be included in the given temporary resource collection if the
+ * caller wants it also to get closed when the {@link #close()} method is invoked.
*
* @param stream <em>buffered</em> stream (must support the mark feature)
- * @param tmp tracker for temporary resources associated with this stream
+ * @param tmp tracker for temporary resources associated with this stream
* @param length total length of the stream, or -1 if unknown
*/
- private TikaInputStream(InputStream stream, TemporaryResources tmp, long length, String suffix) {
+ private TikaInputStream(
+ InputStream stream, TemporaryResources tmp, long length, String suffix) {
super(stream);
this.path = null;
this.tmp = tmp;
@@ -185,46 +175,44 @@
}
/**
- * Checks whether the given stream is a TikaInputStream instance.
- * The given stream can be <code>null</code>, in which case the return
- * value is <code>false</code>.
+ * Checks whether the given stream is a TikaInputStream instance. The given stream can be <code>
+ * null</code>, in which case the return value is <code>false</code>.
*
* @param stream input stream, possibly <code>null</code>
- * @return <code>true</code> if the stream is a TikaInputStream instance,
- * <code>false</code> otherwise
+ * @return <code>true</code> if the stream is a TikaInputStream instance, <code>false</code>
+ * otherwise
*/
public static boolean isTikaInputStream(InputStream stream) {
return stream instanceof TikaInputStream;
}
/**
- * Casts or wraps the given stream to a TikaInputStream instance.
- * This method can be used to access the functionality of this class
- * even when given just a normal input stream instance.
- * <p>
- * The given temporary file provider is used for any temporary files,
- * and should be disposed when the returned stream is no longer used.
- * <p>
- * Use this method instead of the {@link #get(InputStream)} alternative
- * when you <em>don't</em> explicitly close the returned stream. The
- * recommended access pattern is:
+ * Casts or wraps the given stream to a TikaInputStream instance. This method can be used to
+ * access the functionality of this class even when given just a normal input stream instance.
+ *
+ * <p>The given temporary file provider is used for any temporary files, and should be disposed
+ * when the returned stream is no longer used.
+ *
+ * <p>Use this method instead of the {@link #get(InputStream)} alternative when you
+ * <em>don't</em> explicitly close the returned stream. The recommended access pattern is:
+ *
* <pre>
* try (TemporaryResources tmp = new TemporaryResources()) {
* TikaInputStream stream = TikaInputStream.get(..., tmp);
* // process stream but don't close it
* }
* </pre>
- * <p>
- * The given stream instance will <em>not</em> be closed when the
- * {@link TemporaryResources#close()} method is called by the
- * try-with-resources statement. The caller is expected to explicitly
- * close the original stream when it's no longer used.
+ *
+ * <p>The given stream instance will <em>not</em> be closed when the {@link
+ * TemporaryResources#close()} method is called by the try-with-resources statement. The caller
+ * is expected to explicitly close the original stream when it's no longer used.
*
* @param stream normal input stream
* @return a TikaInputStream instance
* @since Apache Tika 0.10
*/
- public static TikaInputStream get(InputStream stream, TemporaryResources tmp, Metadata metadata) {
+ public static TikaInputStream get(
+ InputStream stream, TemporaryResources tmp, Metadata metadata) {
if (stream == null) {
throw new NullPointerException("The Stream must not be null");
}
@@ -241,23 +229,22 @@
}
/**
- * Casts or wraps the given stream to a TikaInputStream instance.
- * This method can be used to access the functionality of this class
- * even when given just a normal input stream instance.
- * <p>
- * Use this method instead of the
- * {@link #get(InputStream, TemporaryResources, Metadata)} alternative when you
- * <em>do</em> explicitly close the returned stream. The recommended
- * access pattern is:
+ * Casts or wraps the given stream to a TikaInputStream instance. This method can be used to
+ * access the functionality of this class even when given just a normal input stream instance.
+ *
+ * <p>Use this method instead of the {@link #get(InputStream, TemporaryResources, Metadata)}
+ * alternative when you <em>do</em> explicitly close the returned stream. The recommended access
+ * pattern is:
+ *
* <pre>
* try (TikaInputStream stream = TikaInputStream.get(...)) {
* // process stream
* }
* </pre>
- * <p>
- * The given stream instance will be closed along with any other resources
- * associated with the returned TikaInputStream instance when the
- * {@link #close()} method is called by the try-with-resources statement.
+ *
+ * <p>The given stream instance will be closed along with any other resources associated with
+ * the returned TikaInputStream instance when the {@link #close()} method is called by the
+ * try-with-resources statement.
*
* @param stream normal input stream
* @return a TikaInputStream instance
@@ -267,8 +254,8 @@
}
/**
- * Returns the given stream casts to a TikaInputStream, or
- * <code>null</code> if the stream is not a TikaInputStream.
+ * Returns the given stream casts to a TikaInputStream, or <code>null</code> if the stream is
+ * not a TikaInputStream.
*
* @param stream normal input stream
* @return a TikaInputStream instance
@@ -284,9 +271,9 @@
/**
* Creates a TikaInputStream from the given array of bytes.
- * <p>
- * Note that you must always explicitly close the returned stream as in
- * some cases it may end up writing the given data to a temporary file.
+ *
+ * <p>Note that you must always explicitly close the returned stream as in some cases it may end
+ * up writing the given data to a temporary file.
*
* @param data input data
* @return a TikaInputStream instance
@@ -296,28 +283,31 @@
}
/**
- * Creates a TikaInputStream from the given array of bytes. The length of
- * the array is stored as input metadata in the given metadata instance.
- * <p>
- * Note that you must always explicitly close the returned stream as in
- * some cases it may end up writing the given data to a temporary file.
+ * Creates a TikaInputStream from the given array of bytes. The length of the array is stored as
+ * input metadata in the given metadata instance.
*
- * @param data input data
+ * <p>Note that you must always explicitly close the returned stream as in some cases it may end
+ * up writing the given data to a temporary file.
+ *
+ * @param data input data
* @param metadata metadata instance
* @return a TikaInputStream instance
* @throws IOException
*/
public static TikaInputStream get(byte[] data, Metadata metadata) {
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
- return new TikaInputStream(new UnsynchronizedByteArrayInputStream(data), new TemporaryResources(),
- data.length, getExtension(metadata));
+ return new TikaInputStream(
+ new UnsynchronizedByteArrayInputStream(data),
+ new TemporaryResources(),
+ data.length,
+ getExtension(metadata));
}
/**
* Creates a TikaInputStream from the file at the given path.
- * <p>
- * Note that you must always explicitly close the returned stream to
- * prevent leaking open file handles.
+ *
+ * <p>Note that you must always explicitly close the returned stream to prevent leaking open
+ * file handles.
*
* @param path input file
* @return a TikaInputStream instance
@@ -328,16 +318,16 @@
}
/**
- * Creates a TikaInputStream from the file at the given path. The file name
- * and length are stored as input metadata in the given metadata instance.
- * <p>
- * If there's an {@link TikaCoreProperties#RESOURCE_NAME_KEY} in the
- * metadata object, this will not overwrite that value with the path's name.
- * <p>
- * Note that you must always explicitly close the returned stream to
- * prevent leaking open file handles.
+ * Creates a TikaInputStream from the file at the given path. The file name and length are
+ * stored as input metadata in the given metadata instance.
*
- * @param path input file
+ * <p>If there's an {@link TikaCoreProperties#RESOURCE_NAME_KEY} in the metadata object, this
+ * will not overwrite that value with the path's name.
+ *
+ * <p>Note that you must always explicitly close the returned stream to prevent leaking open
+ * file handles.
+ *
+ * @param path input file
* @param metadata metadata instance
* @return a TikaInputStream instance
* @throws IOException if an I/O error occurs
@@ -362,15 +352,15 @@
/**
* Creates a TikaInputStream from the given file.
- * <p>
- * Note that you must always explicitly close the returned stream to
- * prevent leaking open file handles.
+ *
+ * <p>Note that you must always explicitly close the returned stream to prevent leaking open
+ * file handles.
*
* @param file input file
* @return a TikaInputStream instance
* @throws FileNotFoundException if the file does not exist
- * @deprecated use {@link #get(Path)}. In Tika 2.0, this will be removed
- * or modified to throw an IOException.
+ * @deprecated use {@link #get(Path)}. In Tika 2.0, this will be removed or modified to throw an
+ * IOException.
*/
@Deprecated
public static TikaInputStream get(File file) throws FileNotFoundException {
@@ -378,19 +368,18 @@
}
/**
- * Creates a TikaInputStream from the given file. The file name and
- * length are stored as input metadata in the given metadata instance.
- * <p>
- * Note that you must always explicitly close the returned stream to
- * prevent leaking open file handles.
+ * Creates a TikaInputStream from the given file. The file name and length are stored as input
+ * metadata in the given metadata instance.
*
- * @param file input file
+ * <p>Note that you must always explicitly close the returned stream to prevent leaking open
+ * file handles.
+ *
+ * @param file input file
* @param metadata metadata instance
* @return a TikaInputStream instance
- * @throws FileNotFoundException if the file does not exist
- * or cannot be opened for reading
- * @deprecated use {@link #get(Path, Metadata)}. In Tika 2.0,
- * this will be removed or modified to throw an IOException.
+ * @throws FileNotFoundException if the file does not exist or cannot be opened for reading
+ * @deprecated use {@link #get(Path, Metadata)}. In Tika 2.0, this will be removed or modified
+ * to throw an IOException.
*/
@Deprecated
public static TikaInputStream get(File file, Metadata metadata) throws FileNotFoundException {
@@ -402,24 +391,24 @@
}
/**
- * Creates a TikaInputStream from a Factory which can create
- * fresh {@link InputStream}s for the same resource multiple times.
- * <p>This is typically desired when working with {@link Parser}s that
- * need to re-read the stream multiple times, where other forms
- * of buffering (eg File) are slower than just getting a fresh
- * new stream each time.
+ * Creates a TikaInputStream from a Factory which can create fresh {@link InputStream}s for the
+ * same resource multiple times.
+ *
+ * <p>This is typically desired when working with {@link Parser}s that need to re-read the
+ * stream multiple times, where other forms of buffering (eg File) are slower than just getting
+ * a fresh new stream each time.
*/
public static TikaInputStream get(InputStreamFactory factory) throws IOException {
return get(factory, new TemporaryResources());
}
/**
- * Creates a TikaInputStream from a Factory which can create
- * fresh {@link InputStream}s for the same resource multiple times.
- * <p>This is typically desired when working with {@link Parser}s that
- * need to re-read the stream multiple times, where other forms
- * of buffering (eg File) are slower than just getting a fresh
- * new stream each time.
+ * Creates a TikaInputStream from a Factory which can create fresh {@link InputStream}s for the
+ * same resource multiple times.
+ *
+ * <p>This is typically desired when working with {@link Parser}s that need to re-read the
+ * stream multiple times, where other forms of buffering (eg File) are slower than just getting
+ * a fresh new stream each time.
*/
public static TikaInputStream get(InputStreamFactory factory, TemporaryResources tmp)
throws IOException {
@@ -430,11 +419,10 @@
/**
* Creates a TikaInputStream from the given database BLOB.
- * <p>
- * Note that the result set containing the BLOB may need to be kept open
- * until the returned TikaInputStream has been processed and closed.
- * You must also always explicitly close the returned stream as in
- * some cases it may end up writing the blob data to a temporary file.
+ *
+ * <p>Note that the result set containing the BLOB may need to be kept open until the returned
+ * TikaInputStream has been processed and closed. You must also always explicitly close the
+ * returned stream as in some cases it may end up writing the blob data to a temporary file.
*
* @param blob database BLOB
* @return a TikaInputStream instance
@@ -445,16 +433,14 @@
}
/**
- * Creates a TikaInputStream from the given database BLOB. The BLOB
- * length (if available) is stored as input metadata in the given
- * metadata instance.
- * <p>
- * Note that the result set containing the BLOB may need to be kept open
- * until the returned TikaInputStream has been processed and closed.
- * You must also always explicitly close the returned stream as in
- * some cases it may end up writing the blob data to a temporary file.
+ * Creates a TikaInputStream from the given database BLOB. The BLOB length (if available) is
+ * stored as input metadata in the given metadata instance.
*
- * @param blob database BLOB
+ * <p>Note that the result set containing the BLOB may need to be kept open until the returned
+ * TikaInputStream has been processed and closed. You must also always explicitly close the
+ * returned stream as in some cases it may end up writing the blob data to a temporary file.
+ *
+ * @param blob database BLOB
* @param metadata metadata instance
* @return a TikaInputStream instance
* @throws SQLException if BLOB data can not be accessed
@@ -474,8 +460,10 @@
// the offset in Blob.getBytes() starts at 1
return get(blob.getBytes(1, (int) length), metadata);
} else {
- return new TikaInputStream(new BufferedInputStream(blob.getBinaryStream()),
- new TemporaryResources(), length,
+ return new TikaInputStream(
+ new BufferedInputStream(blob.getBinaryStream()),
+ new TemporaryResources(),
+ length,
getExtension(metadata));
}
}
@@ -490,9 +478,9 @@
/**
* Creates a TikaInputStream from the resource at the given URI.
- * <p>
- * Note that you must always explicitly close the returned stream as in
- * some cases it may end up writing the resource to a temporary file.
+ *
+ * <p>Note that you must always explicitly close the returned stream as in some cases it may end
+ * up writing the resource to a temporary file.
*
* @param uri resource URI
* @return a TikaInputStream instance
@@ -503,13 +491,13 @@
}
/**
- * Creates a TikaInputStream from the resource at the given URI. The
- * available input metadata is stored in the given metadata instance.
- * <p>
- * Note that you must always explicitly close the returned stream as in
- * some cases it may end up writing the resource to a temporary file.
+ * Creates a TikaInputStream from the resource at the given URI. The available input metadata is
+ * stored in the given metadata instance.
*
- * @param uri resource URI
+ * <p>Note that you must always explicitly close the returned stream as in some cases it may end
+ * up writing the resource to a temporary file.
+ *
+ * @param uri resource URI
* @param metadata metadata instance
* @return a TikaInputStream instance
* @throws IOException if the resource can not be accessed
@@ -528,9 +516,9 @@
/**
* Creates a TikaInputStream from the resource at the given URL.
- * <p>
- * Note that you must always explicitly close the returned stream as in
- * some cases it may end up writing the resource to a temporary file.
+ *
+ * <p>Note that you must always explicitly close the returned stream as in some cases it may end
+ * up writing the resource to a temporary file.
*
* @param url resource URL
* @return a TikaInputStream instance
@@ -541,13 +529,13 @@
}
/**
- * Creates a TikaInputStream from the resource at the given URL. The
- * available input metadata is stored in the given metadata instance.
- * <p>
- * Note that you must always explicitly close the returned stream as in
- * some cases it may end up writing the resource to a temporary file.
+ * Creates a TikaInputStream from the resource at the given URL. The available input metadata is
+ * stored in the given metadata instance.
*
- * @param url resource URL
+ * <p>Note that you must always explicitly close the returned stream as in some cases it may end
+ * up writing the resource to a temporary file.
+ *
+ * @param url resource URL
* @param metadata metadata instance
* @return a TikaInputStream instance
* @throws IOException if the resource can not be accessed
@@ -588,15 +576,17 @@
metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
}
- return new TikaInputStream(new BufferedInputStream(connection.getInputStream()),
- new TemporaryResources(), length, getExtension(metadata));
+ return new TikaInputStream(
+ new BufferedInputStream(connection.getInputStream()),
+ new TemporaryResources(),
+ length,
+ getExtension(metadata));
}
/**
- * Fills the given buffer with upcoming bytes from this stream without
- * advancing the current stream position. The buffer is filled up unless
- * the end of stream is encountered before that. This method will block
- * if not enough bytes are immediately available.
+ * Fills the given buffer with upcoming bytes from this stream without advancing the current
+ * stream position. The buffer is filled up unless the end of stream is encountered before that.
+ * This method will block if not enough bytes are immediately available.
*
* @param buffer byte buffer
* @return number of bytes written to the buffer
@@ -623,9 +613,8 @@
}
/**
- * Returns the open container object if any, such as a
- * POIFS FileSystem in the event of an OLE2 document
- * being detected and processed by the OLE2 detector.
+ * Returns the open container object if any, such as a POIFS FileSystem in the event of an OLE2
+ * document being detected and processed by the OLE2 detector.
*
* @return Open Container for this stream, or <code>null</code> if none
*/
@@ -634,10 +623,8 @@
}
/**
- * Stores the open container object against
- * the stream, eg after a Zip contents
- * detector has loaded the file to decide
- * what it contains.
+ * Stores the open container object against the stream, eg after a Zip contents detector has
+ * loaded the file to decide what it contains.
*/
public void setOpenContainer(Object container) {
openContainer = container;
@@ -647,7 +634,6 @@
}
/**
- *
* @param closeable
*/
public void addCloseableResource(Closeable closeable) {
@@ -659,8 +645,8 @@
}
/**
- * If the Stream was created from an {@link InputStreamFactory},
- * return that, otherwise <code>null</code>.
+ * If the Stream was created from an {@link InputStreamFactory}, return that, otherwise <code>
+ * null</code>.
*/
public InputStreamFactory getInputStreamFactory() {
return streamFactory;
@@ -670,12 +656,10 @@
return path != null;
}
-
/**
- * If the user created this TikaInputStream with a file,
- * the original file will be returned. If not, the entire stream
- * will be spooled to a temporary file which will be deleted
- * upon the close of this TikaInputStream
+ * If the user created this TikaInputStream with a file, the original file will be returned. If
+ * not, the entire stream will be spooled to a temporary file which will be deleted upon the
+ * close of this TikaInputStream
*
* @return
* @throws IOException
@@ -685,11 +669,11 @@
}
/**
- * @param maxBytes if this is less than 0 and if an underlying file doesn't already exist,
- * the full file will be spooled to disk
- * @return the original path used in the initialization of this TikaInputStream,
- * a temporary file if the stream was shorter than <code>maxBytes</code>, or <code>null</code>
- * if the underlying stream was longer than maxBytes.
+ * @param maxBytes if this is less than 0 and if an underlying file doesn't already exist, the
+ * full file will be spooled to disk
+ * @return the original path used in the initialization of this TikaInputStream, a temporary
+ * file if the stream was shorter than <code>maxBytes</code>, or <code>null</code> if the
+ * underlying stream was longer than maxBytes.
* @throws IOException
*/
public Path getPath(int maxBytes) throws IOException {
@@ -701,10 +685,10 @@
if (maxBytes > -1) {
this.mark(maxBytes);
try (BoundedInputStream boundedInputStream =
- new BoundedInputStream(maxBytes, this)) {
+ new BoundedInputStream(maxBytes, this)) {
Files.copy(boundedInputStream, tmpFile, REPLACE_EXISTING);
if (boundedInputStream.hasHitBound()) {
- //tmpFile will be cleaned up when this TikaInputStream is closed
+ // tmpFile will be cleaned up when this TikaInputStream is closed
return null;
}
} finally {
@@ -714,7 +698,7 @@
// Spool the entire stream into a temporary file
Files.copy(this, tmpFile, REPLACE_EXISTING);
}
- //successful so far, set tis' path to tmpFile
+ // successful so far, set tis' path to tmpFile
path = tmpFile;
// Create a new input stream and make sure it'll get closed
@@ -726,12 +710,13 @@
// close() method is called. The closing of the new stream
// is already being handled as noted above.
final InputStream oldStream = in;
- in = new BufferedInputStream(newStream) {
- @Override
- public void close() throws IOException {
- oldStream.close();
- }
- };
+ in =
+ new BufferedInputStream(newStream) {
+ @Override
+ public void close() throws IOException {
+ oldStream.close();
+ }
+ };
// Update length to file size. Update position, mark
length = Files.size(path);
@@ -760,11 +745,10 @@
}
/**
- * Returns the length (in bytes) of this stream. Note that if the length
- * was not available when this stream was instantiated, then this method
- * will use the {@link #getPath()} method to buffer the entire stream to
- * a temporary file in order to calculate the stream length. This case
- * will only work if the stream has not yet been consumed.
+ * Returns the length (in bytes) of this stream. Note that if the length was not available when
+ * this stream was instantiated, then this method will use the {@link #getPath()} method to
+ * buffer the entire stream to a temporary file in order to calculate the stream length. This
+ * case will only work if the stream has not yet been consumed.
*
* @return stream length
* @throws IOException if the length can not be determined
@@ -786,19 +770,19 @@
}
/**
- * This relies on {@link IOUtils#skip(InputStream, long, byte[])} to ensure
- * that the alleged bytes skipped were actually skipped.
+ * This relies on {@link IOUtils#skip(InputStream, long, byte[])} to ensure that the alleged
+ * bytes skipped were actually skipped.
*
* @param ln the number of bytes to skip
* @return the number of bytes skipped
- * @throws IOException if the number of bytes requested to be skipped does not match the
- * number of bytes skipped or if there's an IOException during the read.
+ * @throws IOException if the number of bytes requested to be skipped does not match the number
+ * of bytes skipped or if there's an IOException during the read.
*/
@Override
public long skip(long ln) throws IOException {
- //On TIKA-3092, we found that using the static byte array buffer
- //caused problems with multithreading with the FlateInputStream
- //from a POIFS document stream
+ // On TIKA-3092, we found that using the static byte array buffer
+ // caused problems with multithreading with the FlateInputStream
+ // from a POIFS document stream
if (skipBuffer == null) {
skipBuffer = new byte[4096];
}
@@ -847,9 +831,10 @@
} else {
consecutiveEOFs++;
if (consecutiveEOFs > MAX_CONSECUTIVE_EOFS) {
- throw new IOException("Read too many -1 (EOFs); there could be an infinite loop." +
- "If you think your file is not corrupt, please open an issue on Tika's " +
- "JIRA");
+ throw new IOException(
+ "Read too many -1 (EOFs); there could be an infinite loop."
+ + "If you think your file is not corrupt, please open an issue on Tika's "
+ + "JIRA");
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/io/package-info.java b/tika-core/src/main/java/org/apache/tika/io/package-info.java
index 36c7274..daef464 100644
--- a/tika-core/src/main/java/org/apache/tika/io/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/io/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * IO utilities.
- */
+/** IO utilities. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.io;
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java
index e5b520f..00081aa 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageConfidence.java
@@ -17,6 +17,8 @@
package org.apache.tika.language.detect;
public enum LanguageConfidence {
-
- HIGH, MEDIUM, LOW, NONE // Special value when no language is detected
+ HIGH,
+ MEDIUM,
+ LOW,
+ NONE // Special value when no language is detected
}
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
index 722ded3..2166cf1 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
@@ -20,7 +20,6 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
-
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.utils.CompareUtils;
@@ -49,8 +48,8 @@
private static final ServiceLoader DEFAULT_SERVICE_LOADER = new ServiceLoader();
- //if a user calls detect on a huge string, break it into this size
- //and add sequentially until hasEnoughText() is true
+ // if a user calls detect on a huge string, break it into this size
+ // and add sequentially until hasEnoughText() is true
private static final int BUFFER_LENGTH = 4096;
// True if text is expected to be a mix of languages, and thus higher-resolution
@@ -100,20 +99,18 @@
}
/**
- * Load (or re-load) all available language models. This must
- * be called after any settings that would impact the models
- * being loaded (e.g. mixed language/short text), but
- * before any of the document processing routines (below)
- * are called. Note that it only needs to be called once.
+ * Load (or re-load) all available language models. This must be called after any settings that
+ * would impact the models being loaded (e.g. mixed language/short text), but before any of the
+ * document processing routines (below) are called. Note that it only needs to be called once.
*
* @return this
*/
public abstract LanguageDetector loadModels() throws IOException;
/**
- * Load (or re-load) the models specified in <languages>. These use the
- * ISO 639-1 names, with an optional "-<country code>" for more
- * specific specification (e.g. "zh-CN" for Chinese in China).
+ * Load (or re-load) the models specified in <languages>. These use the ISO 639-1 names, with an
+ * optional "-<country code>" for more specific specification (e.g. "zh-CN" for Chinese in
+ * China).
*
* @param languages list of target languages.
* @return this
@@ -121,8 +118,7 @@
public abstract LanguageDetector loadModels(Set<String> languages) throws IOException;
/**
- * Provide information about whether a model exists for a specific
- * language.
+ * Provide information about whether a model exists for a specific language.
*
* @param language ISO 639-1 name for language
* @return true if a model for this language exists.
@@ -130,13 +126,14 @@
public abstract boolean hasModel(String language);
/**
- * Set the a-priori probabilities for these languages. The provided map uses the language
- * as the key, and the probability (0.0 > probability < 1.0) of text being in that language.
- * Note that if the probabilities don't sum to 1.0, these values will be normalized.
- * <p>
- * If hasModel() returns false for any of the languages, an IllegalArgumentException is thrown.
- * <p>
- * Use of these probabilities is detector-specific, and thus might not impact the results at
+ * Set the a-priori probabilities for these languages. The provided map uses the language as the
+ * key, and the probability (0.0 < probability < 1.0) of text being in that language. Note that
+ * if the probabilities don't sum to 1.0, these values will be normalized.
+ *
+ * <p>If hasModel() returns false for any of the languages, an IllegalArgumentException is
+ * thrown.
+ *
+ * <p>Use of these probabilities is detector-specific, and thus might not impact the results at
* all. As such, these should be viewed as a hint.
*
* @param languageProbabilities Map from language to probability
@@ -149,26 +146,22 @@
// The routines below are called when processing a document
// ============================================================
- /**
- * Reset statistics about the current document being processed
- */
+ /** Reset statistics about the current document being processed */
public abstract void reset();
/**
- * Add statistics about this text for the current document. Note
- * that we assume an implicit word break exists before/after
- * each of these runs of text.
+ * Add statistics about this text for the current document. Note that we assume an implicit word
+ * break exists before/after each of these runs of text.
*
* @param cbuf Character buffer
- * @param off Offset into cbuf to first character in the run of text
- * @param len Number of characters in the run of text.
+ * @param off Offset into cbuf to first character in the run of text
+ * @param len Number of characters in the run of text.
*/
public abstract void addText(char[] cbuf, int off, int len);
/**
- * Add <text> to the statistics being accumulated for the current
- * document. Note that this is a default implementation for adding
- * a string (not optimized)
+ * Add <text> to the statistics being accumulated for the current document. Note that this is a
+ * default implementation for adding a string (not optimized)
*
* @param text Characters to add to current statistics.
*/
@@ -180,24 +173,22 @@
return;
}
int start = 0;
- while (! hasEnoughText() && start < len) {
+ while (!hasEnoughText() && start < len) {
int end = Math.min(start + BUFFER_LENGTH, len);
char[] chars = text.subSequence(start, end).toString().toCharArray();
addText(chars, 0, chars.length);
start += BUFFER_LENGTH;
}
-
}
-
/**
- * Tell the caller whether more text is required for the current document
- * before the language can be reliably detected.
- * <p>
- * Implementations can override this to do early termination of stats
- * collection, which can improve performance with longer documents.
- * <p>
- * Note that detect() can be called even when this returns false
+ * Tell the caller whether more text is required for the current document before the language
+ * can be reliably detected.
+ *
+ * <p>Implementations can override this to do early termination of stats collection, which can
+ * improve performance with longer documents.
+ *
+ * <p>Note that detect() can be called even when this returns false
*
* @return true if we have enough text for reliable detection.
*/
@@ -208,9 +199,9 @@
/**
* Detect languages based on previously submitted text (via addText calls).
*
- * @return list of all possible languages with at least medium confidence,
- * sorted by confidence from highest to lowest. There will always
- * be at least one result, which might have a confidence of NONE.
+ * @return list of all possible languages with at least medium confidence, sorted by confidence
+ * from highest to lowest. There will always be at least one result, which might have a
+ * confidence of NONE.
*/
public abstract List<LanguageResult> detectAll();
@@ -223,8 +214,8 @@
* Utility wrapper that detects the language of a given chunk of text.
*
* @param text String to add to current statistics.
- * @return list of all possible languages with at least medium confidence,
- * sorted by confidence from highest to lowest.
+ * @return list of all possible languages with at least medium confidence, sorted by confidence
+ * from highest to lowest.
*/
public List<LanguageResult> detectAll(String text) {
reset();
@@ -237,5 +228,4 @@
addText(text);
return detect();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java
index af3e1bd..3ce358b 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageHandler.java
@@ -17,12 +17,10 @@
package org.apache.tika.language.detect;
import java.io.IOException;
-
import org.apache.tika.sax.WriteOutContentHandler;
/**
- * SAX content handler that updates a language detector based on all the
- * received character content.
+ * SAX content handler that updates a language detector based on all the received character content.
*
* @since Apache Tika 0.10
*/
@@ -45,9 +43,8 @@
}
/**
- * Returns the language detector used by this content handler.
- * Note that the returned detector gets updated whenever new SAX events
- * are received by this content handler.
+ * Returns the language detector used by this content handler. Note that the returned detector
+ * gets updated whenever new SAX events are received by this content handler.
*
* @return language detector
*/
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java
index ed52640..6e6db1a 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageNames.java
@@ -20,18 +20,20 @@
/**
* Support for language tags (as defined by https://tools.ietf.org/html/bcp47)
- * <p>
- * See https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes for a list of
- * three character language codes.
- * <p>
- * TODO change to LanguageTag, and use these vs. strings everywhere in the
- * language detector API?
+ *
+ * <p>See https://en.wikipedia.org/wiki/List_of_ISO_639-3_codes for a list of three character
+ * language codes.
+ *
+ * <p>TODO change to LanguageTag, and use these vs. strings everywhere in the language detector API?
*/
public class LanguageNames {
public static String makeName(String language, String script, String region) {
Locale locale =
- new Locale.Builder().setLanguage(language).setScript(script).setRegion(region)
+ new Locale.Builder()
+ .setLanguage(language)
+ .setScript(script)
+ .setRegion(region)
.build();
return locale.toLanguageTag();
}
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
index dada5fd..e231e48 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageResult.java
@@ -67,16 +67,11 @@
}
/**
- * Return true if the target language matches the detected language. We consider
- * it a match if, for the precision requested or detected, it matches. This means:
- * <p>
- * target | detected | match?
- * zh | en | false
- * zh | zh | true
- * zh | zh-CN | true
- * zh-CN | zh | true
- * zh-CN | zh-TW | false
- * zh-CN | zh-cn | true (case-insensitive)
+ * Return true if the target language matches the detected language. We consider it a match if,
+ * for the precision requested or detected, it matches. This means:
+ *
+ * <p>Matches: zh/zh, zh/zh-CN, zh-CN/zh, and zh-CN/zh-cn (case-insensitive). Non-matches:
+ * zh/en and zh-CN/zh-TW.
*
* @param language
* @return
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java
index 92cd630..539c861 100644
--- a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java
+++ b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageWriter.java
@@ -34,9 +34,8 @@
}
/**
- * Returns the language detector used by this writer. Note that
- * the returned language detector gets updated whenever new characters
- * are written.
+ * Returns the language detector used by this writer. Note that the returned language detector
+ * gets updated whenever new characters are written.
*
* @return language detector
*/
@@ -58,19 +57,13 @@
detector.addText(cbuf, off, len);
}
- /**
- * Ignored.
- */
+ /** Ignored. */
@Override
- public void close() throws IOException {
- }
+ public void close() throws IOException {}
- /**
- * Ignored.
- */
+ /** Ignored. */
@Override
- public void flush() {
- }
+ public void flush() {}
public void reset() {
detector.reset();
diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java b/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
index 11b45d5..31e65d4 100644
--- a/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
+++ b/tika-core/src/main/java/org/apache/tika/language/translate/DefaultTranslator.java
@@ -19,20 +19,18 @@
import java.io.IOException;
import java.util.List;
-
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.exception.TikaException;
import org.apache.tika.utils.CompareUtils;
/**
- * A translator which picks the first available {@link Translator}
- * implementations available through the
- * {@link javax.imageio.spi.ServiceRegistry service provider mechanism}.
+ * A translator which picks the first available {@link Translator} implementations available through
+ * the {@link javax.imageio.spi.ServiceRegistry service provider mechanism}.
*
* @since Apache Tika 1.6
*/
public class DefaultTranslator implements Translator {
- private transient final ServiceLoader loader;
+ private final transient ServiceLoader loader;
public DefaultTranslator(ServiceLoader loader) {
this.loader = loader;
@@ -43,8 +41,8 @@
}
/**
- * Finds all statically loadable translators and sort the list by name,
- * rather than discovery order.
+ * Finds all statically loadable translators and sort the list by name, rather than discovery
+ * order.
*
* @param loader service loader
* @return ordered list of statically loadable translators
@@ -55,9 +53,7 @@
return translators;
}
- /**
- * Returns the first available translator, or null if none are
- */
+ /** Returns the first available translator, or null if none are available. */
private static Translator getFirstAvailable(ServiceLoader loader) {
for (Translator t : getDefaultTranslators(loader)) {
if (t.isAvailable()) {
@@ -67,9 +63,7 @@
return null;
}
- /**
- * Translate, using the first available service-loaded translator
- */
+ /** Translate, using the first available service-loaded translator */
public String translate(String text, String sourceLanguage, String targetLanguage)
throws TikaException, IOException {
Translator t = getFirstAvailable(loader);
@@ -79,9 +73,7 @@
throw new TikaException("No translators currently available");
}
- /**
- * Translate, using the first available service-loaded translator
- */
+ /** Translate, using the first available service-loaded translator */
public String translate(String text, String targetLanguage) throws TikaException, IOException {
Translator t = getFirstAvailable(loader);
if (t != null) {
@@ -90,16 +82,12 @@
throw new TikaException("No translators currently available");
}
- /**
- * Returns all available translators
- */
+ /** Returns all available translators */
public List<Translator> getTranslators() {
return getDefaultTranslators(loader);
}
- /**
- * Returns the current translator
- */
+ /** Returns the current translator */
public Translator getTranslator() {
return getFirstAvailable(loader);
}
diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java b/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java
index 9324af2..10a1270 100644
--- a/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java
+++ b/tika-core/src/main/java/org/apache/tika/language/translate/EmptyTranslator.java
@@ -17,9 +17,8 @@
package org.apache.tika.language.translate;
/**
- * Dummy translator that always declines to give any text. Useful as a
- * sentinel translator for when none others are available.
- * for unknown document types.
+ * Dummy translator that always declines to give any text. Useful as a sentinel translator for when
+ * no others are available, e.g. for unknown document types.
*/
public class EmptyTranslator implements Translator {
public String translate(String text, String sourceLanguage, String targetLanguage) {
diff --git a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
index 563e6c4..4905d72 100644
--- a/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
+++ b/tika-core/src/main/java/org/apache/tika/language/translate/Translator.java
@@ -17,7 +17,6 @@
package org.apache.tika.language.translate;
import java.io.IOException;
-
import org.apache.tika.exception.TikaException;
/**
@@ -29,11 +28,11 @@
/**
* Translate text between given languages.
*
- * @param text The text to translate.
+ * @param text The text to translate.
* @param sourceLanguage The input text language (for example, "en").
* @param targetLanguage The desired language to translate to (for example, "fr").
* @return The translation result. If translation is unavailable, returns the same text back.
- * @throws TikaException When there is an error translating.
+ * @throws TikaException When there is an error translating.
* @throws java.io.IOException
* @since Tika 1.6
*/
@@ -41,13 +40,13 @@
throws TikaException, IOException;
/**
- * Translate text to the given language
- * This method attempts to auto-detect the source language of the text.
+ * Translate text to the given language. This method attempts to auto-detect the source language
+ * of the text.
*
- * @param text The text to translate.
+ * @param text The text to translate.
* @param targetLanguage The desired language to translate to (for example, "hi").
* @return The translation result. If translation is unavailable, returns the same text back.
- * @throws TikaException When there is an error translating.
+ * @throws TikaException When there is an error translating.
* @throws java.io.IOException
* @since Tika 1.6
*/
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java b/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java
index db689f9..aa51e08 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/AccessPermissions.java
@@ -18,56 +18,36 @@
package org.apache.tika.metadata;
/**
- * Until we can find a common standard, we'll use these options. They
- * were mostly derived from PDFBox's AccessPermission, but some can
- * apply to other document formats, especially CAN_MODIFY and FILL_IN_FORM.
+ * Until we can find a common standard, we'll use these options. They were mostly derived from
+ * PDFBox's AccessPermission, but some can apply to other document formats, especially CAN_MODIFY
+ * and FILL_IN_FORM.
*/
public interface AccessPermissions {
- String PREFIX =
- "access_permission" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
+ String PREFIX = "access_permission" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
- /**
- * Can any modifications be made to the document
- */
+ /** Can any modifications be made to the document */
Property CAN_MODIFY = Property.externalTextBag(PREFIX + "can_modify");
- /**
- * Should content be extracted, generally.
- */
+ /** Should content be extracted, generally. */
Property EXTRACT_CONTENT = Property.externalText(PREFIX + "extract_content");
- /**
- * Should content be extracted for the purposes
- * of accessibility.
- */
+ /** Should content be extracted for the purposes of accessibility. */
Property EXTRACT_FOR_ACCESSIBILITY =
Property.externalText(PREFIX + "extract_for_accessibility");
- /**
- * Can the user insert/rotate/delete pages.
- */
+ /** Can the user insert/rotate/delete pages. */
Property ASSEMBLE_DOCUMENT = Property.externalText(PREFIX + "assemble_document");
-
- /**
- * Can the user fill in a form
- */
+ /** Can the user fill in a form */
Property FILL_IN_FORM = Property.externalText(PREFIX + "fill_in_form");
- /**
- * Can the user modify annotations
- */
+ /** Can the user modify annotations */
Property CAN_MODIFY_ANNOTATIONS = Property.externalText(PREFIX + "modify_annotations");
- /**
- * Can the user print the document
- */
+ /** Can the user print the document */
Property CAN_PRINT = Property.externalText(PREFIX + "can_print");
- /**
- * Can the user print an image-degraded version of the document.
- */
+ /** Can the user print an image-degraded version of the document. */
Property CAN_PRINT_FAITHFUL = Property.externalText(PREFIX + "can_print_faithful");
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java b/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java
index 9ad1632..5c9772d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/ClimateForcast.java
@@ -17,8 +17,8 @@
package org.apache.tika.metadata;
/**
- * Met keys from NCAR CCSM files in the <a
- * href="http://cf-pcmdi.llnl.gov/">Climate Forecast Convention</a>.
+ * Met keys from NCAR CCSM files in the <a href="http://cf-pcmdi.llnl.gov/">Climate Forecast
+ * Convention</a>.
*/
public interface ClimateForcast {
@@ -51,5 +51,4 @@
String COMMENT = "comment";
String MODEL_NAME_ENGLISH = "model_name_english";
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java b/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java
index 122a1fc..1a57611 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/CreativeCommons.java
@@ -28,5 +28,4 @@
String LICENSE_LOCATION = "License-Location";
String WORK_TYPE = "Work-Type";
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
index 23750c3..33c8beb 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/DublinCore.java
@@ -29,162 +29,158 @@
String PREFIX_DC_TERMS = "dcterms";
/**
- * Typically, Format may include the media-type or dimensions of the
- * resource. Format may be used to determine the software, hardware or
- * other equipment needed to display or operate the resource. Examples
- * of dimensions include size and duration. Recommended best practice is
- * to select a value from a controlled vocabulary (for example, the list
- * of Internet Media Types [MIME] defining computer media formats).
+ * Typically, Format may include the media-type or dimensions of the resource. Format may be
+ * used to determine the software, hardware or other equipment needed to display or operate the
+ * resource. Examples of dimensions include size and duration. Recommended best practice is to
+ * select a value from a controlled vocabulary (for example, the list of Internet Media Types
+ * [MIME] defining computer media formats).
*/
- Property FORMAT = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "format");
+ Property FORMAT =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "format");
/**
- * Recommended best practice is to identify the resource by means of
- * a string or number conforming to a formal identification system.
- * Example formal identification systems include the Uniform Resource
- * Identifier (URI) (including the Uniform Resource Locator (URL)),
- * the Digital Object Identifier (DOI) and the International Standard
- * Book Number (ISBN).
+ * Recommended best practice is to identify the resource by means of a string or number
+ * conforming to a formal identification system. Example formal identification systems include
+ * the Uniform Resource Identifier (URI) (including the Uniform Resource Locator (URL)), the
+ * Digital Object Identifier (DOI) and the International Standard Book Number (ISBN).
*/
- Property IDENTIFIER = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "identifier");
+ Property IDENTIFIER =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "identifier");
+
+ /** Date on which the resource was changed. */
+ Property MODIFIED =
+ Property.internalDate(
+ PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "modified");
/**
- * Date on which the resource was changed.
+ * An entity responsible for making contributions to the content of the resource. Examples of a
+ * Contributor include a person, an organisation, or a service. Typically, the name of a
+ * Contributor should be used to indicate the entity.
*/
- Property MODIFIED = Property.internalDate(
- PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "modified");
+ Property CONTRIBUTOR =
+ Property.internalTextBag(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contributor");
/**
- * An entity responsible for making contributions to the content of the
- * resource. Examples of a Contributor include a person, an organisation,
- * or a service. Typically, the name of a Contributor should be used to
+ * The extent or scope of the content of the resource. Coverage will typically include spatial
+ * location (a place name or geographic coordinates), temporal period (a period label, date, or
+ * date range) or jurisdiction (such as a named administrative entity). Recommended best
+ * practice is to select a value from a controlled vocabulary (for example, the Thesaurus of
+ * Geographic Names [TGN]) and that, where appropriate, named places or time periods be used in
+ * preference to numeric identifiers such as sets of coordinates or date ranges.
+ */
+ Property COVERAGE =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "coverage");
+
+ /**
+ * An entity primarily responsible for making the content of the resource. Examples of a Creator
+ * include a person, an organisation, or a service. Typically, the name of a Creator should be
+ * used to indicate the entity.
+ */
+ Property CREATOR =
+ Property.internalTextBag(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creator");
+
+ /** Date of creation of the resource. */
+ Property CREATED =
+ Property.internalDate(
+ PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "created");
+
+ /**
+ * A date associated with an event in the life cycle of the resource. Typically, Date will be
+ * associated with the creation or availability of the resource. Recommended best practice for
+ * encoding the date value is defined in a profile of ISO 8601 [W3CDTF] and follows the
+ * YYYY-MM-DD format.
+ */
+ Property DATE =
+ Property.internalDate(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "date");
+
+ /**
+ * An account of the content of the resource. Description may include but is not limited to: an
+ * abstract, table of contents, reference to a graphical representation of content or a
+ * free-text account of the content.
+ */
+ Property DESCRIPTION =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "description");
+
+ /**
+ * A language of the intellectual content of the resource. Recommended best practice is to use
+ * RFC 3066 [RFC3066], which, in conjunction with ISO 639 [ISO639], defines two- and
+ * three-letter primary language tags with optional subtags. Examples include "en" or "eng" for
+ * English, "akk" for Akkadian, and "en-GB" for English used in the United Kingdom.
+ */
+ Property LANGUAGE =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "language");
+
+ /**
+ * An entity responsible for making the resource available. Examples of a Publisher include a
+ * person, an organisation, or a service. Typically, the name of a Publisher should be used to
* indicate the entity.
*/
- Property CONTRIBUTOR = Property.internalTextBag(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contributor");
+ Property PUBLISHER =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "publisher");
/**
- * The extent or scope of the content of the resource. Coverage will
- * typically include spatial location (a place name or geographic
- * coordinates), temporal period (a period label, date, or date range)
- * or jurisdiction (such as a named administrative entity). Recommended
- * best practice is to select a value from a controlled vocabulary (for
- * example, the Thesaurus of Geographic Names [TGN]) and that, where
- * appropriate, named places or time periods be used in preference to
- * numeric identifiers such as sets of coordinates or date ranges.
+ * A reference to a related resource. Recommended best practice is to reference the resource by
+ * means of a string or number conforming to a formal identification system.
*/
- Property COVERAGE = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "coverage");
+ Property RELATION =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "relation");
/**
- * An entity primarily responsible for making the content of the resource.
- * Examples of a Creator include a person, an organisation, or a service.
- * Typically, the name of a Creator should be used to indicate the entity.
+ * Information about rights held in and over the resource. Typically, a Rights element will
+ * contain a rights management statement for the resource, or reference a service providing such
+ * information. Rights information often encompasses Intellectual Property Rights (IPR),
+ * Copyright, and various Property Rights. If the Rights element is absent, no assumptions can
+ * be made about the status of these and other rights with respect to the resource.
*/
- Property CREATOR = Property.internalTextBag(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creator");
+ Property RIGHTS =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "rights");
/**
- * Date of creation of the resource.
- */
- Property CREATED = Property.internalDate(
- PREFIX_DC_TERMS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "created");
-
- /**
- * A date associated with an event in the life cycle of the resource.
- * Typically, Date will be associated with the creation or availability of
- * the resource. Recommended best practice for encoding the date value is
- * defined in a profile of ISO 8601 [W3CDTF] and follows the YYYY-MM-DD
- * format.
- */
- Property DATE = Property.internalDate(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "date");
-
- /**
- * An account of the content of the resource. Description may include
- * but is not limited to: an abstract, table of contents, reference to
- * a graphical representation of content or a free-text account of
- * the content.
- */
- Property DESCRIPTION = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "description");
-
- /**
- * A language of the intellectual content of the resource. Recommended
- * best practice is to use RFC 3066 [RFC3066], which, in conjunction
- * with ISO 639 [ISO639], defines two- and three-letter primary language
- * tags with optional subtags. Examples include "en" or "eng" for English,
- * "akk" for Akkadian, and "en-GB" for English used in the United Kingdom.
- */
- Property LANGUAGE = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "language");
-
- /**
- * An entity responsible for making the resource available. Examples of
- * a Publisher include a person, an organisation, or a service. Typically,
- * the name of a Publisher should be used to indicate the entity.
- */
- Property PUBLISHER = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "publisher");
-
- /**
- * A reference to a related resource. Recommended best practice is to
- * reference the resource by means of a string or number conforming to
- * a formal identification system.
- */
- Property RELATION = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "relation");
-
- /**
- * Information about rights held in and over the resource. Typically,
- * a Rights element will contain a rights management statement for
- * the resource, or reference a service providing such information.
- * Rights information often encompasses Intellectual Property Rights
- * (IPR), Copyright, and various Property Rights. If the Rights element
- * is absent, no assumptions can be made about the status of these and
- * other rights with respect to the resource.
- */
- Property RIGHTS = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "rights");
-
- /**
- * A reference to a resource from which the present resource is derived.
- * The present resource may be derived from the Source resource in whole
- * or in part. Recommended best practice is to reference the resource by
- * means of a string or number conforming to a formal identification
+ * A reference to a resource from which the present resource is derived. The present resource
+ * may be derived from the Source resource in whole or in part. Recommended best practice is to
+ * reference the resource by means of a string or number conforming to a formal identification
* system.
*/
- Property SOURCE = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "source");
+ Property SOURCE =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "source");
/**
- * The topic of the content of the resource. Typically, a Subject will
- * be expressed as keywords, key phrases or classification codes that
- * describe a topic of the resource. Recommended best practice is to
- * select a value from a controlled vocabulary or formal classification
- * scheme.
+ * The topic of the content of the resource. Typically, a Subject will be expressed as keywords,
+ * key phrases or classification codes that describe a topic of the resource. Recommended best
+ * practice is to select a value from a controlled vocabulary or formal classification scheme.
*/
- Property SUBJECT = Property.internalTextBag(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject");
+ Property SUBJECT =
+ Property.internalTextBag(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject");
/**
- * A name given to the resource. Typically, a Title will be a name by
- * which the resource is formally known.
+ * A name given to the resource. Typically, a Title will be a name by which the resource is
+ * formally known.
*/
- Property TITLE = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "title");
+ Property TITLE =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "title");
/**
- * The nature or genre of the content of the resource. Type includes terms
- * describing general categories, functions, genres, or aggregation levels
- * for content. Recommended best practice is to select a value from a
- * controlled vocabulary (for example, the DCMI Type Vocabulary
- * [DCMITYPE]). To describe the physical or digital manifestation of
- * the resource, use the Format element.
+ * The nature or genre of the content of the resource. Type includes terms describing general
+ * categories, functions, genres, or aggregation levels for content. Recommended best practice
+ * is to select a value from a controlled vocabulary (for example, the DCMI Type Vocabulary
+ * [DCMITYPE]). To describe the physical or digital manifestation of the resource, use the
+ * Format element.
*/
- Property TYPE = Property.internalText(
- PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "type");
-
+ Property TYPE =
+ Property.internalText(
+ PREFIX_DC + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "type");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Epub.java b/tika-core/src/main/java/org/apache/tika/metadata/Epub.java
index c6e3c3c..b967e04 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Epub.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Epub.java
@@ -26,11 +26,12 @@
String EPUB_PREFIX = "epub" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
/**
- * This is set to "pre-paginated" if any itemref on the spine or the
- * metadata has a "pre-paginated" value, "reflowable" otherwise.
+ * This is set to "pre-paginated" if any itemref on the spine or the metadata has a
+ * "pre-paginated" value, "reflowable" otherwise.
*/
- Property RENDITION_LAYOUT = Property.externalClosedChoise(EPUB_PREFIX + "rendition:layout",
- "pre-paginated", "reflowable");
+ Property RENDITION_LAYOUT =
+ Property.externalClosedChoise(
+ EPUB_PREFIX + "rendition:layout", "pre-paginated", "reflowable");
Property VERSION = Property.externalText(EPUB_PREFIX + "version");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java b/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java
index 8636969..6bd97c7 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/ExternalProcess.java
@@ -20,60 +20,65 @@
String PREFIX_EXTERNAL_META = "external-process";
- /**
- * STD_OUT
- */
- Property STD_OUT = Property.externalText(
- PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stdout");
+ /** STD_OUT */
+ Property STD_OUT =
+ Property.externalText(
+ PREFIX_EXTERNAL_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "stdout");
+
+ /** STD_ERR */
+ Property STD_ERR =
+ Property.externalText(
+ PREFIX_EXTERNAL_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "stderr");
+
+ /** Whether or not stdout was truncated */
+ Property STD_OUT_IS_TRUNCATED =
+ Property.externalBoolean(
+ PREFIX_EXTERNAL_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "stdout-truncated");
+
+ /** Whether or not stderr was truncated */
+ Property STD_ERR_IS_TRUNCATED =
+ Property.externalBoolean(
+ PREFIX_EXTERNAL_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "stderr-truncated");
/**
- * STD_ERR
+ * Stdout length whether or not it was truncated. If it was truncated, what would its length
+ * have been; if it wasn't, what is its length.
*/
- Property STD_ERR = Property.externalText(
- PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "stderr");
-
+ Property STD_OUT_LENGTH =
+ Property.externalReal(
+ PREFIX_EXTERNAL_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "stdout-length");
/**
- * Whether or not stdout was truncated
+ * Stderr length whether or not it was truncated. If it was truncated, what would its length
+ * have been; if it wasn't, what is its length.
*/
- Property STD_OUT_IS_TRUNCATED = Property.externalBoolean(
- PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "stdout-truncated");
+ Property STD_ERR_LENGTH =
+ Property.externalReal(
+ PREFIX_EXTERNAL_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "stderr-length");
- /**
- * Whether or not stderr was truncated
- */
- Property STD_ERR_IS_TRUNCATED = Property.externalBoolean(
- PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "stderr-truncated");
+ /** Exit value of the sub process */
+ Property EXIT_VALUE =
+ Property.externalInteger(
+ PREFIX_EXTERNAL_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "exit-value");
- /**
- * Stdout length whether or not it was truncated. If it was truncated,
- * what would its length have been; if it wasn't, what is its length.
- */
- Property STD_OUT_LENGTH = Property.externalReal(
- PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "stdout-length");
-
- /**
- * Stderr length whether or not it was truncated. If it was truncated,
- * what would its length have been; if it wasn't, what is its length.
- */
- Property STD_ERR_LENGTH = Property.externalReal(
- PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "stderr-length");
-
- /**
- * Exit value of the sub process
- */
- Property EXIT_VALUE = Property.externalInteger(
- PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "exit-value");
-
- /**
- * Was the process timed out
- */
- Property IS_TIMEOUT = Property.externalBoolean(
- PREFIX_EXTERNAL_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "timeout");
-
+ /** Was the process timed out */
+ Property IS_TIMEOUT =
+ Property.externalBoolean(
+ PREFIX_EXTERNAL_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "timeout");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java b/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java
index 87afab7..1ddda08 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/FileSystem.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.metadata;
-/**
- * A collection of metadata elements for file system level metadata
- */
+/** A collection of metadata elements for file system level metadata */
public interface FileSystem {
final String PREFIX = "fs:";
@@ -26,5 +24,4 @@
Property CREATED = Property.externalDate(PREFIX + "created");
Property MODIFIED = Property.externalDate(PREFIX + "modified");
Property ACCESSED = Property.externalDate(PREFIX + "accessed");
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Font.java b/tika-core/src/main/java/org/apache/tika/metadata/Font.java
index 706e199..8e20bd9 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Font.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Font.java
@@ -1,28 +1,26 @@
package org.apache.tika.metadata; /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
public interface Font {
String PREFIX_FONT_META = "font";
- /**
- * Basic name of a font used in a file
- */
- Property FONT_NAME = Property.internalTextBag(
- PREFIX_FONT_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "name");
-
+ /** Basic name of a font used in a file */
+ Property FONT_NAME =
+ Property.internalTextBag(
+ PREFIX_FONT_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "name");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java b/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java
index 3c4006f..df6f427 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Geographic.java
@@ -17,29 +17,20 @@
package org.apache.tika.metadata;
/**
- * Geographic schema. This is a collection of
- * {@link Property property definition} constants for geographic
- * information, as defined in the W3C Geo Vocabularies.
+ * Geographic schema. This is a collection of {@link Property property definition} constants for
+ * geographic information, as defined in the W3C Geo Vocabularies.
*
- * @see <a href="http://www.w3.org/2003/01/geo/"
- * >W3C Basic Geo Vocabulary</a>
+ * @see <a href="http://www.w3.org/2003/01/geo/">W3C Basic Geo Vocabulary</a>
* @since Apache Tika 0.8
*/
public interface Geographic {
- /**
- * The WGS84 Latitude of the Point
- */
+ /** The WGS84 Latitude of the Point */
Property LATITUDE = Property.internalReal("geo:lat");
- /**
- * The WGS84 Longitude of the Point
- */
+ /** The WGS84 Longitude of the Point */
Property LONGITUDE = Property.internalReal("geo:long");
- /**
- * The WGS84 Altitude of the Point
- */
+ /** The WGS84 Altitude of the Point */
Property ALTITUDE = Property.internalReal("geo:alt");
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/HTML.java b/tika-core/src/main/java/org/apache/tika/metadata/HTML.java
index 3e37cf6..0c8533d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/HTML.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/HTML.java
@@ -1,29 +1,28 @@
package org.apache.tika.metadata; /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
public interface HTML {
String PREFIX_HTML_META = "html_meta";
-
/**
- * If a script element contains a src value, this value
- * is set in the embedded document's metadata
+ * If a script element contains a src value, this value is set in the embedded document's
+ * metadata
*/
- Property SCRIPT_SOURCE = Property.internalText(
- PREFIX_HTML_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "scriptSrc");
-
+ Property SCRIPT_SOURCE =
+ Property.internalText(
+ PREFIX_HTML_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "scriptSrc");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java b/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java
index 937f365..b6ed1bd 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/HttpHeaders.java
@@ -19,8 +19,8 @@
/**
* A collection of HTTP header names.
*
- * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol --
- * HTTP/1.1 (RFC 2616)</a>
+ * @see <a href="http://rfc-ref.org/RFC-TEXTS/2616/">Hypertext Transfer Protocol -- HTTP/1.1 (RFC
+ * 2616)</a>
*/
public interface HttpHeaders {
@@ -39,5 +39,4 @@
String CONTENT_TYPE = "Content-Type";
String LOCATION = "Location";
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java b/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java
index f5fa644..0dc4073 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/IPTC.java
@@ -22,12 +22,13 @@
/**
* IPTC photo metadata schema.
- * <p>
- * A collection of
- * {@link Property property definition} constants for the photo metadata
+ *
+ * <p>A collection of {@link Property property definition} constants for the photo metadata
* properties defined in the IPTC standard.
*
- * @see <a href="http://www.iptc.org/std/photometadata/specification/IPTC-PhotoMetadata-201007_1.pdf">IPTC Photo Metadata</a>
+ * @see <a
+ * href="http://www.iptc.org/std/photometadata/specification/IPTC-PhotoMetadata-201007_1.pdf">IPTC
+ * Photo Metadata</a>
* @since Apache Tika 1.1
*/
public interface IPTC {
@@ -41,69 +42,66 @@
String PREFIX_PLUS = "plus";
/**
- * Name of the city the content is focussing on -- either the place shown
- * in visual media or referenced by text or audio media. This element is at
- * the third level of a top-down geographical hierarchy.
- * <p>
- * This is a detail of a location with blurred semantics as it does not
- * clearly indicate whether it is the location in the image or the location
- * the photo was taken - which can be different. Two more concise properties
- * are available in IPTC Extension with Location Created and Location Shown
- * in the Image.
- * <p>
- * Maps to this IIM property: 2:90 City
+ * Name of the city the content is focussing on -- either the place shown in visual media or
+ * referenced by text or audio media. This element is at the third level of a top-down
+ * geographical hierarchy.
+ *
+ * <p>This is a detail of a location with blurred semantics as it does not clearly indicate
+ * whether it is the location in the image or the location the photo was taken - which can be
+ * different. Two more concise properties are available in IPTC Extension with Location Created
+ * and Location Shown in the Image.
+ *
+ * <p>Maps to this IIM property: 2:90 City
*
* @see Photoshop#CITY
*/
Property CITY = Photoshop.CITY;
/**
- * Full name of the country the content is focussing on -- either the
- * country shown in visual media or referenced in text or audio media. This
- * element is at the top/first level of a top- down geographical hierarchy.
- * The full name should be expressed as a verbal name and not as a code, a
- * code should go to the element "CountryCode"
- * <p>
- * This is a detail of a location with blurred semantics as it does not
- * clearly indicate whether it is the location in the image or the location
- * the photo was taken - which can be different. Two more concise properties
- * are available in IPTC Extension with Location Created and Location Shown
- * in the Image.
- * <p>
- * Maps to this IIM property: 2:101 Country/Primary Location Name
+ * Full name of the country the content is focussing on -- either the country shown in visual
+ * media or referenced in text or audio media. This element is at the top/first level of a top-
+ * down geographical hierarchy. The full name should be expressed as a verbal name and not as a
+ * code, a code should go to the element "CountryCode"
+ *
+ * <p>This is a detail of a location with blurred semantics as it does not clearly indicate
+ * whether it is the location in the image or the location the photo was taken - which can be
+ * different. Two more concise properties are available in IPTC Extension with Location Created
+ * and Location Shown in the Image.
+ *
+ * <p>Maps to this IIM property: 2:101 Country/Primary Location Name
*
* @see Photoshop#COUNTRY
*/
Property COUNTRY = Photoshop.COUNTRY;
/**
- * Code of the country the content is focussing on -- either the country
- * shown in visual media or referenced in text or audio media. This element
- * is at the top/first level of a top-down geographical hierarchy. The code
- * should be taken from ISO 3166 two or three letter code. The full name of
- * a country should go to the "Country" element.
- * <p>
- * This is a detail of a location with blurred semantics as it does not
- * clearly indicate whether it is the location in the image or the location
- * the photo was taken - which can be different. Two more concise properties
- * are available in IPTC Extension with Location Created and Location Shown
- * in the Image.
- * <p>
- * Maps to this IIM property: 2:100 Country/Primary Location Code
+ * Code of the country the content is focussing on -- either the country shown in visual media
+ * or referenced in text or audio media. This element is at the top/first level of a top-down
+ * geographical hierarchy. The code should be taken from ISO 3166 two or three letter code. The
+ * full name of a country should go to the "Country" element.
+ *
+ * <p>This is a detail of a location with blurred semantics as it does not clearly indicate
+ * whether it is the location in the image or the location the photo was taken - which can be
+ * different. Two more concise properties are available in IPTC Extension with Location Created
+ * and Location Shown in the Image.
+ *
+ * <p>Maps to this IIM property: 2:100 Country/Primary Location Code
*/
- Property COUNTRY_CODE = Property.internalText(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CountryCode");
+ Property COUNTRY_CODE =
+ Property.internalText(
+ PREFIX_IPTC_CORE
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "CountryCode");
/**
- * A textual description, including captions, of the item's content,
- * particularly used where the object is not text.
- * <p>
- * Note: the XMP property (dc:description) which stores the value of this
- * IPTC Core property is of type Lang Alt. Hence any software agent dealing
- * with this property must abide to the processing rules for
- * Lang Alt value type as specified by the XMP specifications.
- * <p>
- * Maps to this IIM property: 2:120 Caption/Abstract
+ * A textual description, including captions, of the item's content, particularly used where the
+ * object is not text.
+ *
+ * <p>Note: the XMP property (dc:description) which stores the value of this IPTC Core property
+ * is of type Lang Alt. Hence any software agent dealing with this property must abide to the
+ * processing rules for Lang Alt value type as specified by the XMP specifications.
+ *
+ * <p>Maps to this IIM property: 2:120 Caption/Abstract
*
* @see DublinCore#DESCRIPTION
*/
@@ -111,264 +109,246 @@
/**
* A brief synopsis of the caption. Headline is not the same as Title.
- * <p>
- * Maps to this IIM property: 2:105 Headline
+ *
+ * <p>Maps to this IIM property: 2:105 Headline
*
* @see Photoshop#HEADLINE
*/
Property HEADLINE = Photoshop.HEADLINE;
/**
- * Describes the nature, intellectual, artistic or journalistic
- * characteristic of a item, not specifically its content.
- * <p>
- * The IPTC recognizes that the corresponding IPTC Genre NewsCodes needs
- * photo specific extension to be better usable with this field (as of the
- * release of this standard in the year 2008).
- * <p>
- * Maps to this IIM property: 2:04 Object Attribute Reference
+ * Describes the nature, intellectual, artistic or journalistic characteristic of an item, not
+ * specifically its content.
+ *
+ * <p>The IPTC recognizes that the corresponding IPTC Genre NewsCodes needs photo specific
+ * extension to be better usable with this field (as of the release of this standard in the year
+ * 2008).
+ *
+ * <p>Maps to this IIM property: 2:04 Object Attribute Reference
*/
- Property INTELLECTUAL_GENRE = Property.internalText(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "IntellectualGenre");
+ Property INTELLECTUAL_GENRE =
+ Property.internalText(
+ PREFIX_IPTC_CORE
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "IntellectualGenre");
/**
- * Keywords to express the subject of the content. Keywords may be free
- * text and don't have to be taken from a controlled vocabulary. Codes from
- * the controlled vocabulary IPTC Subject NewsCodes must go to the
- * "Subject Code" field.
- * <p>
- * Single values of this field should not be restricted to single words
- * but must allow for phrases as well.
- * <p>
- * Maps to this IIM property: 2:25 Keywords
+ * Keywords to express the subject of the content. Keywords may be free text and don't have to
+ * be taken from a controlled vocabulary. Codes from the controlled vocabulary IPTC Subject
+ * NewsCodes must go to the "Subject Code" field.
+ *
+ * <p>Single values of this field should not be restricted to single words but must allow for
+ * phrases as well.
+ *
+ * <p>Maps to this IIM property: 2:25 Keywords
*
* @see DublinCore#SUBJECT
*/
Property KEYWORDS = DublinCore.SUBJECT;
/**
- * Name of the subregion of a country -- either called province or state or
- * anything else -- the content is focussing on -- either the subregion
- * shown in visual media or referenced by text or audio media. This element
- * is at the second level of a top-down geographical hierarchy.
- * <p>
- * This is a detail of a location with blurred semantics as it does not
- * clearly indicate whether it is the location in the image or the location
- * the photo was taken - which can be different. Two more concise properties
- * are available in IPTC Extension with Location Created and Location Shown
- * in the Image.
- * <p>
- * Maps to this IIM property: 2:95 Province/State
+ * Name of the subregion of a country -- either called province or state or anything else -- the
+ * content is focussing on -- either the subregion shown in visual media or referenced by text
+ * or audio media. This element is at the second level of a top-down geographical hierarchy.
+ *
+ * <p>This is a detail of a location with blurred semantics as it does not clearly indicate
+ * whether it is the location in the image or the location the photo was taken - which can be
+ * different. Two more concise properties are available in IPTC Extension with Location Created
+ * and Location Shown in the Image.
+ *
+ * <p>Maps to this IIM property: 2:95 Province/State
*
* @see Photoshop#STATE
*/
Property PROVINCE_OR_STATE = Photoshop.STATE;
/**
- * Describes the scene of a news content. Specifies one or more terms
- * from the IPTC "Scene-NewsCodes". Each Scene is represented as a string of
- * 6 digits in an unordered list.
- * <p>
- * Note: Only Scene values from this IPTC taxonomy should be used here. More
- * about the IPTC Scene-NewsCodes at www.newscodes.org.
+ * Describes the scene of a news content. Specifies one or more terms from the IPTC
+ * "Scene-NewsCodes". Each Scene is represented as a string of 6 digits in an unordered list.
+ *
+ * <p>Note: Only Scene values from this IPTC taxonomy should be used here. More about the IPTC
+ * Scene-NewsCodes at www.newscodes.org.
*/
- Property SCENE_CODE = Property.internalTextBag(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Scene");
+ Property SCENE_CODE =
+ Property.internalTextBag(
+ PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Scene");
/**
- * Specifies one or more Subjects from the IPTC Subject-NewsCodes taxonomy
- * to categorise the content. Each Subject is represented as a string of 8
- * digits in an unordered list.
- * <p>
- * Note: Only Subjects from a controlled vocabulary should be used here,
- * free text has to be put into the Keyword element. More about
- * IPTC Subject-NewsCodes at www.newscodes.org.
+ * Specifies one or more Subjects from the IPTC Subject-NewsCodes taxonomy to categorise the
+ * content. Each Subject is represented as a string of 8 digits in an unordered list.
+ *
+ * <p>Note: Only Subjects from a controlled vocabulary should be used here, free text has to be
+ * put into the Keyword element. More about IPTC Subject-NewsCodes at www.newscodes.org.
*/
- Property SUBJECT_CODE = Property.internalTextBag(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "SubjectCode");
+ Property SUBJECT_CODE =
+ Property.internalTextBag(
+ PREFIX_IPTC_CORE
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "SubjectCode");
/**
- * Name of a sublocation the content is focussing on -- either the
- * location shown in visual media or referenced by text or audio media. This
- * location name could either be the name of a sublocation to a city or the
- * name of a well known location or (natural) monument outside a city. In
- * the sense of a sublocation to a city this element is at the fourth level
- * of a top-down geographical hierarchy.
- * <p>
- * This is a detail of a location with blurred semantics as it does not
- * clearly indicate whether it is the location in the image or the location
- * the photo was taken - which can be different. Two more concise properties
- * are available in IPTC Extension with Location Created and Location Shown
- * in the Image.
- * <p>
- * Maps to this IIM property: 2:92 Sublocation
+ * Name of a sublocation the content is focussing on -- either the location shown in visual
+ * media or referenced by text or audio media. This location name could either be the name of a
+ * sublocation to a city or the name of a well known location or (natural) monument outside a
+ * city. In the sense of a sublocation to a city this element is at the fourth level of a
+ * top-down geographical hierarchy.
+ *
+ * <p>This is a detail of a location with blurred semantics as it does not clearly indicate
+ * whether it is the location in the image or the location the photo was taken - which can be
+ * different. Two more concise properties are available in IPTC Extension with Location Created
+ * and Location Shown in the Image.
+ *
+ * <p>Maps to this IIM property: 2:92 Sublocation
*/
- Property SUBLOCATION = Property.internalText(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Location");
+ Property SUBLOCATION =
+ Property.internalText(
+ PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Location");
/**
- * Designates the date and optionally the time the intellectual content was
- * created rather than the date of the creation of the physical
- * representation.
- * <p>
- * If a software system requires explicit time values and no time is given
- * by the Date Created property the software system should default the time
- * to 00:00:00. If the software system does not require an explicit time
- * value the time part should be left empty as it is.
- * <p>
- * Note 1: Any content of the IIM dataset 2:60, Time Created, should be
- * merged to this element.
- * Note 2: Implementers are encouraged to provide
- * the creation date and time from the EXIF data of a digital
- * camera to the user for entering this date for the first time.
- * <p>
- * Maps to this IIM property: 2:55 Date Created
+ * Designates the date and optionally the time the intellectual content was created rather than
+ * the date of the creation of the physical representation.
+ *
+ * <p>If a software system requires explicit time values and no time is given by the Date
+ * Created property the software system should default the time to 00:00:00. If the software
+ * system does not require an explicit time value the time part should be left empty as it is.
+ *
+ * <p>Note 1: Any content of the IIM dataset 2:60, Time Created, should be merged to this
+ * element. Note 2: Implementers are encouraged to provide the creation date and time from the
+ * EXIF data of a digital camera to the user for entering this date for the first time.
+ *
+ * <p>Maps to this IIM property: 2:55 Date Created
*
* @see Photoshop#DATE_CREATED
*/
Property DATE_CREATED = Photoshop.DATE_CREATED;
/**
- * Identifier or the name of the person involved in writing, editing or
- * correcting the description of the content.
- * <p>
- * Maps to this IIM property: 2:122 Writer/Editor
+ * Identifier or the name of the person involved in writing, editing or correcting the
+ * description of the content.
+ *
+ * <p>Maps to this IIM property: 2:122 Writer/Editor
*
* @see Photoshop#CAPTION_WRITER
*/
Property DESCRIPTION_WRITER = Photoshop.CAPTION_WRITER;
/**
- * Any of a number of instructions from the provider or creator to the
- * receiver of the item.
- * <p>
- * Maps to this IIM property: 2:40 Special Instruction
+ * Any of a number of instructions from the provider or creator to the receiver of the item.
+ *
+ * <p>Maps to this IIM property: 2:40 Special Instruction
*
* @see Photoshop#INSTRUCTIONS
*/
Property INSTRUCTIONS = Photoshop.INSTRUCTIONS;
/**
- * Number or identifier for the purpose of improved workflow handling. This
- * is a user created identifier related to the job for which the item is
- * supplied.
- * <p>
- * Note: As this identifier references a job of the receiver's workflow it
- * must first be issued by the receiver, then transmitted to the creator or
- * provider of the news object and finally added by the creator
- * to this field.
- * <p>
- * Maps to this IIM property: 2:103 Original Transmission Reference
+ * Number or identifier for the purpose of improved workflow handling. This is a user created
+ * identifier related to the job for which the item is supplied.
+ *
+ * <p>Note: As this identifier references a job of the receiver's workflow it must first be
+ * issued by the receiver, then transmitted to the creator or provider of the news object and
+ * finally added by the creator to this field.
+ *
+ * <p>Maps to this IIM property: 2:103 Original Transmission Reference
*
* @see Photoshop#TRANSMISSION_REFERENCE
*/
Property JOB_ID = Photoshop.TRANSMISSION_REFERENCE;
/**
- * A shorthand reference for the item. Title provides a short human readable
- * name which can be a text and/or numeric reference. It is not the same as
- * Headline.
- * <p>
- * Many use the Title field to store the filename of the image, though the
- * field may be used in many ways. Formal identifiers are provided by the
- * Digital Image Id, or the Registry Entry property of the IPTC Extension.
- * <p>
- * Note 1: This element aligns with the use of Dublin Core's "Title"
- * element.
- * Note 2: the XMP property (dc:title) which stores the value of
- * this IPTC Core property is of type Lang Alt. Hence any software agent
- * dealing with this property must abide to the processing rules for Lang
- * Alt value type as specified by the XMP specifications.
- * <p>
- * Maps to this IIM property: 2:05 Object Name
+ * A shorthand reference for the item. Title provides a short human readable name which can be a
+ * text and/or numeric reference. It is not the same as Headline.
+ *
+ * <p>Many use the Title field to store the filename of the image, though the field may be used
+ * in many ways. Formal identifiers are provided by the Digital Image Id, or the Registry Entry
+ * property of the IPTC Extension.
+ *
+ * <p>Note 1: This element aligns with the use of Dublin Core's "Title" element. Note 2: the XMP
+ * property (dc:title) which stores the value of this IPTC Core property is of type Lang Alt.
+ * Hence any software agent dealing with this property must abide to the processing rules for
+ * Lang Alt value type as specified by the XMP specifications.
+ *
+ * <p>Maps to this IIM property: 2:05 Object Name
*
* @see DublinCore#TITLE
*/
Property TITLE = DublinCore.TITLE;
/**
- * Contains any necessary copyright notice for claiming the intellectual
- * property for this item and should identify the current owner of the
- * copyright for the item. Other entities like the creator of the item may
- * be added in the corresponding field. Notes on usage rights should be
+ * Contains any necessary copyright notice for claiming the intellectual property for this item
+ * and should identify the current owner of the copyright for the item. Other entities like the
+ * creator of the item may be added in the corresponding field. Notes on usage rights should be
* provided in "Rights usage terms".
- * <p>
- * Copyright ownership can be expressed in a more controlled way using the
- * PLUS fields "Copyright Owner", "Copyright Owner ID",
- * "Copyright Owner Name" of the IPTC Extension. It is the user's
- * responsibility to keep the values of the four fields in sync.
- * <p>
- * Note: the XMP property (dc:rights) which stores the value of this IPTC
- * Core property is of type Lang Alt. Hence any software agent dealing with
- * this property must abide to the processing rules for Lang Alt
- * value type as specified by the XMP specifications.
- * <p>
- * Maps to this IIM property: 2:116 Copyright Notice
+ *
+ * <p>Copyright ownership can be expressed in a more controlled way using the PLUS fields
+ * "Copyright Owner", "Copyright Owner ID", "Copyright Owner Name" of the IPTC Extension. It is
+ * the user's responsibility to keep the values of the four fields in sync.
+ *
+ * <p>Note: the XMP property (dc:rights) which stores the value of this IPTC Core property is of
+ * type Lang Alt. Hence any software agent dealing with this property must abide by the
+ * processing rules for Lang Alt value type as specified by the XMP specifications.
+ *
+ * <p>Maps to this IIM property: 2:116 Copyright Notice
*
* @see DublinCore#RIGHTS
*/
Property COPYRIGHT_NOTICE = DublinCore.RIGHTS;
/**
- * Contains the name of the person who created the content of this item, a
- * photographer for photos, a graphic artist for graphics, or a writer for
- * textual news, but in cases where the photographer should not be
- * identified the name of a company or organisation may be appropriate.
- * <p>
- * The creator can be expressed in a more controlled way using the
- * "Image Creator" of PLUS in the IPTC Extension additionally. It is the
- * user's responsibility to keep the values of the IPTC Core and the PLUS
- * fields in sync.
- * <p>
- * Maps to this IIM property: 2:80 By-line
+ * Contains the name of the person who created the content of this item, a photographer for
+ * photos, a graphic artist for graphics, or a writer for textual news, but in cases where the
+ * photographer should not be identified the name of a company or organisation may be
+ * appropriate.
+ *
+ * <p>The creator can be expressed in a more controlled way using the "Image Creator" of PLUS in
+ * the IPTC Extension additionally. It is the user's responsibility to keep the values of the
+ * IPTC Core and the PLUS fields in sync.
+ *
+ * <p>Maps to this IIM property: 2:80 By-line
*
* @see DublinCore#CREATOR
*/
Property CREATOR = DublinCore.CREATOR;
/**
- * The creator's contact information provides all necessary information to
- * get in contact with the creator of this item and comprises a set of
- * sub-properties for proper addressing.
- * <p>
- * The IPTC Extension Licensor fields should be used instead of these
- * Creator's Contact Info fields if you are using IPTC Extension fields. If
- * the creator is also the licensor his or her contact information should be
- * provided in the Licensor fields.
- * <p>
- * Note 1 to user interface implementers: All sub-properties of "Creator's
- * contact information" should be shown as group on the form.
- * Note 2: the
- * CreatorContactInfo sub-properties' naming aligns with the vCard
- * specification RFC 2426.
+ * The creator's contact information provides all necessary information to get in contact with
+ * the creator of this item and comprises a set of sub-properties for proper addressing.
+ *
+ * <p>The IPTC Extension Licensor fields should be used instead of these Creator's Contact Info
+ * fields if you are using IPTC Extension fields. If the creator is also the licensor his or her
+ * contact information should be provided in the Licensor fields.
+ *
+ * <p>Note 1 to user interface implementers: All sub-properties of "Creator's contact
+ * information" should be shown as group on the form. Note 2: the CreatorContactInfo
+ * sub-properties' naming aligns with the vCard specification RFC 2426.
*/
- Property CREATORS_CONTACT_INFO = Property.internalText(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "CreatorContactInfo");
+ Property CREATORS_CONTACT_INFO =
+ Property.internalText(
+ PREFIX_IPTC_CORE
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "CreatorContactInfo");
/**
- * Contains the job title of the person who created the content of this
- * item. As this is sort of a qualifier the Creator element has to be filled
- * in as mandatory prerequisite for using Creator's Jobtitle.
- * <p>
- * Maps to this IIM property: 2:85 By-line Title
+ * Contains the job title of the person who created the content of this item. As this is sort of
+ * a qualifier the Creator element has to be filled in as mandatory prerequisite for using
+ * Creator's Jobtitle.
+ *
+ * <p>Maps to this IIM property: 2:85 By-line Title
*
* @see Photoshop#AUTHORS_POSITION
*/
Property CREATORS_JOB_TITLE = Photoshop.AUTHORS_POSITION;
/**
- * The credit to person(s) and/or organisation(s) required by the supplier
- * of the item to be used when published. This is a free-text field.
- * <p>
- * Note 1: For more formal identifications of the creator or the owner of
- * the copyrights of this image other rights properties may be used.
- * Note 2:
- * This property was named "Credit" by the IIM metadata, then it was renamed
- * to "Provider" in IPTC Core 1.0. In IPTC Core 1.1. it has been renamed to
- * "Credit Line" as the field is used for this purpose by many users.
- * <p>
- * Maps to this IIM property: 2:110 Credit
+ * The credit to person(s) and/or organisation(s) required by the supplier of the item to be
+ * used when published. This is a free-text field.
+ *
+ * <p>Note 1: For more formal identifications of the creator or the owner of the copyrights of
+ * this image other rights properties may be used. Note 2: This property was named "Credit" by
+ * the IIM metadata, then it was renamed to "Provider" in IPTC Core 1.0. In IPTC Core 1.1 it
+ * has been renamed to "Credit Line" as the field is used for this purpose by many users.
+ *
+ * <p>Maps to this IIM property: 2:110 Credit
*
* @see Photoshop#CREDIT
*/
@@ -376,900 +356,1048 @@
/**
* The licensing parameters of the item expressed in free-text.
- * <p>
- * The PLUS fields of the IPTC Extension can be used in parallel to express
- * the licensed usage in more controlled terms.
+ *
+ * <p>The PLUS fields of the IPTC Extension can be used in parallel to express the licensed
+ * usage in more controlled terms.
*/
Property RIGHTS_USAGE_TERMS = XMPRights.USAGE_TERMS;
/**
- * Identifies the original owner of the copyright for the intellectual
- * content of the item. This could be an agency, a member of an agency or an
- * individual. Source could be different from Creator and from the entities
- * in the CopyrightNotice.
- * <p>
- * The original owner can never change. For that reason the content of this
- * property should never be changed or deleted after the information is
- * entered following the news object's initial creation.
- * <p>
- * Maps to this IIM property: 2:115 Source
+ * Identifies the original owner of the copyright for the intellectual content of the item. This
+ * could be an agency, a member of an agency or an individual. Source could be different from
+ * Creator and from the entities in the CopyrightNotice.
+ *
+ * <p>The original owner can never change. For that reason the content of this property should
+ * never be changed or deleted after the information is entered following the news object's
+ * initial creation.
+ *
+ * <p>Maps to this IIM property: 2:115 Source
*
* @see Photoshop#SOURCE
*/
Property SOURCE = Photoshop.SOURCE;
/**
- * The contact information address part. Comprises an optional company name
- * and all required information to locate the building or postbox to which
- * mail should be sent. To that end, the address is a multiline field.
- * <p>
- * Note 1: to user interface implementers: This field should be part of a
- * "Contact information" group on the form.
- * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426.
+ * The contact information address part. Comprises an optional company name and all required
+ * information to locate the building or postbox to which mail should be sent. To that end, the
+ * address is a multiline field.
+ *
+ * <p>Note 1: to user interface implementers: This field should be part of a "Contact
+ * information" group on the form. Note 2: the ContactInfo naming aligns with the vCard
+ * specification RFC 2426.
*/
- Property CONTACT_INFO_ADDRESS = Property.internalTextBag(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrExtadr");
+ Property CONTACT_INFO_ADDRESS =
+ Property.internalTextBag(
+ PREFIX_IPTC_CORE
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "CiAdrExtadr");
/**
* The contact information city part.
- * <p>
- * Note 1: to user interface implementers: This field should be part of a
- * "Contact information" group on the form.
- * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426.
+ *
+ * <p>Note 1: to user interface implementers: This field should be part of a "Contact
+ * information" group on the form. Note 2: the ContactInfo naming aligns with the vCard
+ * specification RFC 2426.
*/
- Property CONTACT_INFO_CITY = Property.internalText(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCity");
+ Property CONTACT_INFO_CITY =
+ Property.internalText(
+ PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCity");
/**
* The contact information country part.
- * <p>
- * Note 1: to user interface implementers: This field should be part of a
- * "Contact information" group on the form.
- * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426.
+ *
+ * <p>Note 1: to user interface implementers: This field should be part of a "Contact
+ * information" group on the form. Note 2: the ContactInfo naming aligns with the vCard
+ * specification RFC 2426.
*/
- Property CONTACT_INFO_COUNTRY = Property.internalText(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCtry");
+ Property CONTACT_INFO_COUNTRY =
+ Property.internalText(
+ PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrCtry");
/**
* The contact information email address part.
- * <p>
- * Multiple email addresses can be given. May have to be separated by a
- * comma in the user interface.
- * <p>
- * Note 1: to user interface implementers: This field should be part of a
- * "Contact information" group on the form.
- * Note 2 to user interface
- * implementers: provide sufficient space to fill in multiple e-mail
- * addresses.
- * Note 3: the ContactInfo naming aligns with the vCard
- * specification RFC 2426.
+ *
+ * <p>Multiple email addresses can be given. May have to be separated by a comma in the user
+ * interface.
+ *
+ * <p>Note 1: to user interface implementers: This field should be part of a "Contact
+ * information" group on the form. Note 2 to user interface implementers: provide sufficient
+ * space to fill in multiple e-mail addresses. Note 3: the ContactInfo naming aligns with the
+ * vCard specification RFC 2426.
*/
- Property CONTACT_INFO_EMAIL = Property.internalTextBag(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiEmailWork");
+ Property CONTACT_INFO_EMAIL =
+ Property.internalTextBag(
+ PREFIX_IPTC_CORE
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "CiEmailWork");
/**
* The contact information phone number part.
- * <p>
- * Multiple numbers can be given. May have to be separated by a
- * comma in the user interface.
- * <p>
- * Note 1: to user interface implementers: This field should be part of a
- * "Contact information" group on the form.
- * Note 2 to user interface
- * implementers: provide sufficient space to fill in multiple international
- * numbers.
- * Note 3: the ContactInfo naming aligns with the vCard
- * specification RFC 2426.
+ *
+ * <p>Multiple numbers can be given. May have to be separated by a comma in the user interface.
+ *
+ * <p>Note 1: to user interface implementers: This field should be part of a "Contact
+ * information" group on the form. Note 2 to user interface implementers: provide sufficient
+ * space to fill in multiple international numbers. Note 3: the ContactInfo naming aligns with
+ * the vCard specification RFC 2426.
*/
- Property CONTACT_INFO_PHONE = Property.internalTextBag(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiTelWork");
+ Property CONTACT_INFO_PHONE =
+ Property.internalTextBag(
+ PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiTelWork");
/**
* The contact information part denoting the local postal code.
- * <p>
- * Note 1: to user interface implementers: This field should be part of a
- * "Contact information" group on the form.
- * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426.
+ *
+ * <p>Note 1: to user interface implementers: This field should be part of a "Contact
+ * information" group on the form. Note 2: the ContactInfo naming aligns with the vCard
+ * specification RFC 2426.
*/
- Property CONTACT_INFO_POSTAL_CODE = Property.internalText(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrPcode");
+ Property CONTACT_INFO_POSTAL_CODE =
+ Property.internalText(
+ PREFIX_IPTC_CORE
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "CiAdrPcode");
/**
* The contact information part denoting regional information such as state or province.
- * <p>
- * Note 1: to user interface implementers: This field should be part of a
- * "Contact information" group on the form.
- * Note 2: the ContactInfo naming aligns with the vCard specification RFC 2426.
+ *
+ * <p>Note 1: to user interface implementers: This field should be part of a "Contact
+ * information" group on the form. Note 2: the ContactInfo naming aligns with the vCard
+ * specification RFC 2426.
*/
- Property CONTACT_INFO_STATE_PROVINCE = Property.internalText(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiAdrRegion");
+ Property CONTACT_INFO_STATE_PROVINCE =
+ Property.internalText(
+ PREFIX_IPTC_CORE
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "CiAdrRegion");
/**
* The contact information web address part. Multiple addresses can be given, separated by a
* comma.
- * <p>
- * Note 1: to user interface implementers: This field should be part of a
- * "Contact information" group on the form.
- * Note 2 to user interface
- * implementers: provide sufficient space to fill in multiple URLs.
- * Note 3: the ContactInfo naming aligns with the vCard
+ *
+ * <p>Note 1: to user interface implementers: This field should be part of a "Contact
+ * information" group on the form. Note 2 to user interface implementers: provide sufficient
+ * space to fill in multiple URLs. Note 3: the ContactInfo naming aligns with the vCard
* specification RFC 2426.
*/
- Property CONTACT_INFO_WEB_URL = Property.internalTextBag(
- PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiUrlWork");
+ Property CONTACT_INFO_WEB_URL =
+ Property.internalTextBag(
+ PREFIX_IPTC_CORE + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CiUrlWork");
/**
- * As this metadata element pertains to distribution management, it was not
- * adopted. However, this data is still synchronised with the XMP property
- * [photoshop:Urgency], and hence, available for future use, but outside the
- * IPTC Core.
+ * As this metadata element pertains to distribution management, it was not adopted. However,
+ * this data is still synchronised with the XMP property [photoshop:Urgency], and hence,
+ * available for future use, but outside the IPTC Core.
*
* @deprecated
*/
- @Deprecated
- Property URGENCY = Photoshop.URGENCY;
+ @Deprecated Property URGENCY = Photoshop.URGENCY;
/**
- * As this metadata element was earmarked as deprecated already for IIM 4.1,
- * it was not adopted. However, this data is still synchronised with the XMP
- * property [photoshop:Category], and hence available for future use - but
- * outside the IPTC Core. For migrating from Category codes to Subject Codes
- * please read the Guideline for mapping Category Codes to Subject NewsCodes
+ * As this metadata element was earmarked as deprecated already for IIM 4.1, it was not adopted.
+ * However, this data is still synchronised with the XMP property [photoshop:Category], and
+ * hence available for future use - but outside the IPTC Core. For migrating from Category codes
+ * to Subject Codes please read the Guideline for mapping Category Codes to Subject NewsCodes
* section below.
*
* @deprecated
*/
- @Deprecated
- Property CATEGORY = Photoshop.CATEGORY;
+ @Deprecated Property CATEGORY = Photoshop.CATEGORY;
/**
- * As this metadata element was earmarked as deprecated already for IIM 4.1,
- * it was not adopted. However, this data is still synchronised with the XMP
- * property [photoshop:SupplementalCategories], and hence available for
- * future use - but outside the IPTC Core.
+ * As this metadata element was earmarked as deprecated already for IIM 4.1, it was not adopted.
+ * However, this data is still synchronised with the XMP property
+ * [photoshop:SupplementalCategories], and hence available for future use - but outside the IPTC
+ * Core.
*
* @deprecated
*/
- @Deprecated
- Property SUPPLEMENTAL_CATEGORIES = Photoshop.SUPPLEMENTAL_CATEGORIES;
+ @Deprecated Property SUPPLEMENTAL_CATEGORIES = Photoshop.SUPPLEMENTAL_CATEGORIES;
/**
- * Information about the ethnicity and other facets of the model(s) in a
- * model-released image.
- * <p>
- * Use the Model Age field for the age of model(s).
+ * Information about the ethnicity and other facets of the model(s) in a model-released image.
+ *
+ * <p>Use the Model Age field for the age of model(s).
*/
- Property ADDITIONAL_MODEL_INFO = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AddlModelInfo");
+ Property ADDITIONAL_MODEL_INFO =
+ Property.internalText(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "AddlModelInfo");
+
+ /** A set of metadata about artwork or an object in the item */
+ Property ARTWORK_OR_OBJECT =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "ArtworkOrObject");
+
+ /** A set of metadata about artwork or an object in the item */
+ Property ORGANISATION_CODE =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "OrganisationInImageCode");
/**
- * A set of metadata about artwork or an object in the item
+ * A term to describe the content of the image by a value from a Controlled Vocabulary.
+ *
+ * <p>This property is part of the Photo Metadata 2008 specifications, but should not be released
+ * to the public on the standard Adobe Custom Panels for IPTC metadata or other user interfaces
+ * unless agreed by the IPTC.
*/
- Property ARTWORK_OR_OBJECT = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ArtworkOrObject");
+ Property CONTROLLED_VOCABULARY_TERM =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CVterm");
/**
- * A set of metadata about artwork or an object in the item
+ * A location the content of the item is about. For photos that is a location shown in the
+ * image.
+ *
+ * <p>If the location the image was taken in is different from this location the property
+ * Location Created should be used too.
*/
- Property ORGANISATION_CODE = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "OrganisationInImageCode");
+ Property LOCATION_SHOWN =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationShown");
/**
- * A term to describe the content of the image by a value from a Controlled
- * Vocabulary.
- * <p>
- * This property is part of the Photo Metadata 2008 specifications, but
- * should not released to the public on the standard Adobe Custom Panels for
- * IPTC metadata or other user interfaces unless agreed by the IPTC.
+ * Age of the human model(s) at the time this image was taken in a model released image.
+ *
+ * <p>The user should be aware of any legal implications of providing ages for young models.
+ * Ages below 18 years should not be included.
*/
- Property CONTROLLED_VOCABULARY_TERM = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CVterm");
-
- /**
- * A location the content of the item is about. For photos that is a
- * location shown in the image.
- * <p>
- * If the location the image was taken in is different from this location
- * the property Location Created should be used too.
- */
- Property LOCATION_SHOWN = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShown");
-
- /**
- * Age of the human model(s) at the time this image was taken in a model
- * released image.
- * <p>
- * The user should be aware of any legal implications of providing ages for
- * young models. Ages below 18 years should not be included.
- */
- Property MODEL_AGE = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelAge");
+ Property MODEL_AGE =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelAge");
/**
* Name of the organisation or company which is featured in the content.
- * <p>
- * May be supplemented by values from a controlled vocabulary in the
- * Organisation Code field.
+ *
+ * <p>May be supplemented by values from a controlled vocabulary in the Organisation Code field.
*/
- Property ORGANISATION_NAME = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "OrganisationInImageName");
+ Property ORGANISATION_NAME =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "OrganisationInImageName");
/**
- * Name of a person the content of the item is about. For photos that is a
- * person shown in the image.
+ * Name of a person the content of the item is about. For photos that is a person shown in the
+ * image.
*/
- Property PERSON = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PersonInImage");
+ Property PERSON =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "PersonInImage");
/**
- * Globally unique identifier for the item. It is created and applied by the
- * creator of the item at the time of its creation . This value shall not be
- * changed after that time.
- * <p>
- * The identifier will probably be generated by the technical means of an
- * imaging device or software and should be applied to the digital image
- * file as early as possible in its life cycle. This identifier does not
- * identify any pictured content, particularly in case of a scan of non-
- * digital images, only this digital representation.
- * <p>
- * Any algorithm to create this identifier has to comply with the technical
- * requirements to create a globally unique id. Any device creating digital
- * images - e.g. still image cameras, video cameras, scanners - should
- * create such an identifer right at the time of the creation of the digital
- * data and add the id to the set of metadata without compromising
- * performance. It is recommended that this image identifier allows
- * identifying the device by which the image data and the GUID were created.
- * IPTC's basic requirements for unique ids are:
- * - It must be globally unique. Algorithms for this purpose exist.
- * - It should identify the camera body.
- * - It should identify each individual photo from this camera body.
- * - It should identify the date and time of the creation of the picture.
- * - It should be secured against tampering.
- * This field should be implemented in a way to prove it has not been changed since its
- * value has been applied. If the identifier has been created by the imaging device
- * its type and brand can be found in the Exif/technical metadata.
+ * Globally unique identifier for the item. It is created and applied by the creator of the item
+ * at the time of its creation. This value shall not be changed after that time.
+ *
+ * <p>The identifier will probably be generated by the technical means of an imaging device or
+ * software and should be applied to the digital image file as early as possible in its life
+ * cycle. This identifier does not identify any pictured content, particularly in case of a scan
+ * of non-digital images, only this digital representation.
+ *
+ * <p>Any algorithm to create this identifier has to comply with the technical requirements to
+ * create a globally unique id. Any device creating digital images - e.g. still image cameras,
+ * video cameras, scanners - should create such an identifier right at the time of the creation
+ * of the digital data and add the id to the set of metadata without compromising performance.
+ * It is recommended that this image identifier allows identifying the device by which the image
+ * data and the GUID were created. IPTC's basic requirements for unique ids are: - It must be
+ * globally unique. Algorithms for this purpose exist. - It should identify the camera body. -
+ * It should identify each individual photo from this camera body. - It should identify the date
+ * and time of the creation of the picture. - It should be secured against tampering. This field
+ * should be implemented in a way to prove it has not been changed since its value has been
+ * applied. If the identifier has been created by the imaging device its type and brand can be
+ * found in the Exif/technical metadata.
*/
- Property DIGITAL_IMAGE_GUID = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DigImageGUID");
+ Property DIGITAL_IMAGE_GUID =
+ Property.internalText(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "DigImageGUID");
/**
* The type of the source digital file.
- * <p>
- * The IPTC recommends not to implement this property any longer.
+ *
+ * <p>The IPTC recommends not to implement this property any longer.
*
* @deprecated
*/
@Deprecated
- Property DIGITAL_SOURCE_FILE_TYPE = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "DigitalSourcefileType");
+ Property DIGITAL_SOURCE_FILE_TYPE =
+ Property.internalText(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "DigitalSourcefileType");
- /**
- * The type of the source of this digital image
- */
- Property DIGITAL_SOURCE_TYPE = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DigitalSourceType");
+ /** The type of the source of this digital image */
+ Property DIGITAL_SOURCE_TYPE =
+ Property.internalText(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "DigitalSourceType");
/**
* Names or describes the specific event the content relates to.
- * <p>
- * Examples are: a press conference, dedication ceremony, etc. If this is a
- * sub-event of a larger event both can be provided by the field: e.g. XXXIX
- * Olympic Summer Games (Beijing): opening ceremony. Unplanned events could
- * be named by this property too.
+ *
+ * <p>Examples are: a press conference, dedication ceremony, etc. If this is a sub-event of a
+ * larger event both can be provided by the field: e.g. XXXIX Olympic Summer Games (Beijing):
+ * opening ceremony. Unplanned events could be named by this property too.
*/
- Property EVENT = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Event");
+ Property EVENT =
+ Property.internalText(
+ PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Event");
/**
- * Both a Registry Item Id and a Registry Organisation Id to record any
- * registration of this item with a registry.
- * <p>
- * Typically an id from a registry is negotiated and applied after the
- * creation of the digital image.
- * <p>
- * Any user interface implementation must show both sub-properties - Item Id
- * and Organisation Id - as corresponding values. Further an input to both
- * fields should be made mandatory.
+ * Both a Registry Item Id and a Registry Organisation Id to record any registration of this
+ * item with a registry.
+ *
+ * <p>Typically an id from a registry is negotiated and applied after the creation of the
+ * digital image.
+ *
+ * <p>Any user interface implementation must show both sub-properties - Item Id and Organisation
+ * Id - as corresponding values. Further an input to both fields should be made mandatory.
*/
- Property IMAGE_REGISTRY_ENTRY = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegistryId");
+ Property IMAGE_REGISTRY_ENTRY =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegistryId");
/**
- * Identifies the most recent supplier of the item, who is not necessarily
- * its owner or creator.
- * <p>
- * For identifying the supplier either a well known and/or registered
- * company name or a URL of the company's web site may be used. This
- * property succeeds the Provider property of IPTC Core 1.0 by its semantics
- * as that Provider was renamed to Credit Line.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * Identifies the most recent supplier of the item, who is not necessarily its owner or creator.
+ *
+ * <p>For identifying the supplier either a well known and/or registered company name or a URL
+ * of the company's web site may be used. This property succeeds the Provider property of IPTC
+ * Core 1.0 by its semantics as that Provider was renamed to Credit Line.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property IMAGE_SUPPLIER = Property.internalText(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplier");
+ Property IMAGE_SUPPLIER =
+ Property.internalText(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplier");
+
/**
* @deprecated use {@link IPTC#IMAGE_SUPPLIER_ID}
*/
@Deprecated
String IMAGE_SUPPLIER_ID_WRONG_CASE =
PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierId";
+
/**
- * Identifies the most recent supplier of the item, who is not necessarily
- * its owner or creator.
- * <p>
- * For identifying the supplier either a well known and/or registered
- * company name or a URL of the company's web site may be used. This
- * property succeeds the Provider property of IPTC Core 1.0 by its semantics
- * as that Provider was renamed to Credit Line.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * Identifies the most recent supplier of the item, who is not necessarily its owner or creator.
+ *
+ * <p>For identifying the supplier either a well known and/or registered company name or a URL
+ * of the company's web site may be used. This property succeeds the Provider property of IPTC
+ * Core 1.0 by its semantics as that Provider was renamed to Credit Line.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property IMAGE_SUPPLIER_ID = Property.composite(Property.internalText(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierID"),
- new Property[]{Property.internalText(IPTC.IMAGE_SUPPLIER_ID_WRONG_CASE)});
+ Property IMAGE_SUPPLIER_ID =
+ Property.composite(
+ Property.internalText(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "ImageSupplierID"),
+ new Property[] {Property.internalText(IPTC.IMAGE_SUPPLIER_ID_WRONG_CASE)});
+
/**
- * Identifies the most recent supplier of the item, who is not necessarily
- * its owner or creator.
- * <p>
- * For identifying the supplier either a well known and/or registered
- * company name or a URL of the company's web site may be used. This
- * property succeeds the Provider property of IPTC Core 1.0 by its semantics
- * as that Provider was renamed to Credit Line.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * Identifies the most recent supplier of the item, who is not necessarily its owner or creator.
+ *
+ * <p>For identifying the supplier either a well known and/or registered company name or a URL
+ * of the company's web site may be used. This property succeeds the Provider property of IPTC
+ * Core 1.0 by its semantics as that Provider was renamed to Credit Line.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property IMAGE_SUPPLIER_NAME = Property.internalText(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierName");
+ Property IMAGE_SUPPLIER_NAME =
+ Property.internalText(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "ImageSupplierName");
/**
* Optional identifier assigned by the Image Supplier to the image.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property IMAGE_SUPPLIER_IMAGE_ID = Property.internalText(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageSupplierImageID");
+ Property IMAGE_SUPPLIER_IMAGE_ID =
+ Property.internalText(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "ImageSupplierImageID");
/**
- * The date and optionally time when any of the IPTC photo metadata fields
- * has been last edited
- * <p>
- * The public use of this property is deprecated by IPTC Extension version
- * 1.1. It may only still be used by a private user interface for a use
- * scoped to a company. If used this field should be a timestamp of the
- * latest change applied to any of the fields.
- * <p>
- * The value of this property should never be set by software. XMP-aware
- * software should reflect any changes to metadata by the xmp:MetadataDate
- * property of the XMP Basic scheme.
+ * The date and optionally time when any of the IPTC photo metadata fields has been last edited
+ *
+ * <p>The public use of this property is deprecated by IPTC Extension version 1.1. It may only
+ * still be used by a private user interface for a use scoped to a company. If used this field
+ * should be a timestamp of the latest change applied to any of the fields.
+ *
+ * <p>The value of this property should never be set by software. XMP-aware software should
+ * reflect any changes to metadata by the xmp:MetadataDate property of the XMP Basic scheme.
*/
- Property IPTC_LAST_EDITED = Property.internalDate(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "IptcLastEdited");
+ Property IPTC_LAST_EDITED =
+ Property.internalDate(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "IptcLastEdited");
/**
* The location the content of the item was created.
- * <p>
- * If the location in the image is different from the location the photo was
- * taken the IPTC Extension property Location Shown in the Image should be
- * used.
+ *
+ * <p>If the location in the image is different from the location the photo was taken the IPTC
+ * Extension property Location Shown in the Image should be used.
*/
- Property LOCATION_CREATED = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationCreated");
+ Property LOCATION_CREATED =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationCreated");
/**
- * The maximum available height in pixels of the original photo from which
- * this photo has been derived by downsizing.
+ * The maximum available height in pixels of the original photo from which this photo has been
+ * derived by downsizing.
*/
- Property MAX_AVAIL_HEIGHT = Property.internalInteger(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MaxAvailHeight");
+ Property MAX_AVAIL_HEIGHT =
+ Property.internalInteger(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "MaxAvailHeight");
/**
- * The maximum available width in pixels of the original photo from which
- * this photo has been derived by downsizing.
+ * The maximum available width in pixels of the original photo from which this photo has been
+ * derived by downsizing.
*/
- Property MAX_AVAIL_WIDTH = Property.internalInteger(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "MaxAvailWidth");
+ Property MAX_AVAIL_WIDTH =
+ Property.internalInteger(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "MaxAvailWidth");
/**
- * The version number of the PLUS standards in place at the time of the
- * transaction.
- * <p>
- * This property was included into the IPTC Extension schema from PLUS
- * version 1.2 as all other PLUS properties. To reflect this the value of
- * "PLUS Version" should be set to the string "1.2.0"
+ * The version number of the PLUS standards in place at the time of the transaction.
+ *
+ * <p>This property was included into the IPTC Extension schema from PLUS version 1.2 as all
+ * other PLUS properties. To reflect this the value of "PLUS Version" should be set to the
+ * string "1.2.0"
*/
- Property PLUS_VERSION = Property.internalText(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Version");
+ Property PLUS_VERSION =
+ Property.internalText(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Version");
/**
* Owner or owners of the copyright in the licensed image.
- * <p>
- * Serves to identify the rights holder/s for the image. The Copyright
- * Owner, Image Creator and Licensor may be the same or different entities.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>Serves to identify the rights holder/s for the image. The Copyright Owner, Image Creator
+ * and Licensor may be the same or different entities.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property COPYRIGHT_OWNER = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwner");
+ Property COPYRIGHT_OWNER =
+ Property.internalTextBag(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwner");
+
/**
* @deprecated use {@link IPTC#COPYRIGHT_OWNER_ID}
*/
@Deprecated
String COPYRIGHT_OWNER_ID_WRONG_CASE =
PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerId";
+
/**
* The ID of the owner or owners of the copyright in the licensed image.
- * <p>
- * Serves to identify the rights holder/s for the image. The Copyright
- * Owner, Image Creator and Licensor may be the same or different entities.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>Serves to identify the rights holder/s for the image. The Copyright Owner, Image Creator
+ * and Licensor may be the same or different entities.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property COPYRIGHT_OWNER_ID = Property.composite(Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerID"),
- new Property[]{Property.internalTextBag(IPTC.COPYRIGHT_OWNER_ID_WRONG_CASE)});
+ Property COPYRIGHT_OWNER_ID =
+ Property.composite(
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "CopyrightOwnerID"),
+ new Property[] {Property.internalTextBag(IPTC.COPYRIGHT_OWNER_ID_WRONG_CASE)});
+
/**
* The name of the owner or owners of the copyright in the licensed image.
- * <p>
- * Serves to identify the rights holder/s for the image. The Copyright
- * Owner, Image Creator and Licensor may be the same or different entities.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>Serves to identify the rights holder/s for the image. The Copyright Owner, Image Creator
+ * and Licensor may be the same or different entities.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property COPYRIGHT_OWNER_NAME = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CopyrightOwnerName");
+ Property COPYRIGHT_OWNER_NAME =
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "CopyrightOwnerName");
/**
* Creator or creators of the image.
- * <p>
- * The creator can be additionally expressed in free-text using the IPTC
- * Core Creator field. In many countries, the Image Creator must be
- * attributed in association with any use of the image. The Image Creator,
- * Copyright Owner, Image Supplier and Licensor may be the same or different
- * entities.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>The creator can be additionally expressed in free-text using the IPTC Core Creator field.
+ * In many countries, the Image Creator must be attributed in association with any use of the
+ * image. The Image Creator, Copyright Owner, Image Supplier and Licensor may be the same or
+ * different entities.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property IMAGE_CREATOR = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreator");
+ Property IMAGE_CREATOR =
+ Property.internalTextBag(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreator");
+
/**
* @deprecated use {@link IPTC#IMAGE_CREATOR_ID}
*/
@Deprecated
String IMAGE_CREATOR_ID_WRONG_CASE =
PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorId";
- /**
- * The ID of the creator or creators of the image.
- * <p>
- * The creator can be additionally expressed in free-text using the IPTC
- * Core Creator field. In many countries, the Image Creator must be
- * attributed in association with any use of the image. The Image Creator,
- * Copyright Owner, Image Supplier and Licensor may be the same or different
- * entities.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
- */
- Property IMAGE_CREATOR_ID = Property.composite(Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorID"),
- new Property[]{Property.internalTextBag(IPTC.IMAGE_CREATOR_ID_WRONG_CASE)});
- /**
- * The name of the creator or creators of the image.
- * <p>
- * The creator can be additionally expressed in free-text using the IPTC
- * Core Creator field. In many countries, the Image Creator must be
- * attributed in association with any use of the image. The Image Creator,
- * Copyright Owner, Image Supplier and Licensor may be the same or different
- * entities.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
- */
- Property IMAGE_CREATOR_NAME = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ImageCreatorName");
/**
- * A person or company that should be contacted to obtain a licence for
- * using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * The ID of the creator or creators of the image.
+ *
+ * <p>The creator can be additionally expressed in free-text using the IPTC Core Creator field.
+ * In many countries, the Image Creator must be attributed in association with any use of the
+ * image. The Image Creator, Copyright Owner, Image Supplier and Licensor may be the same or
+ * different entities.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Licensor");
+ Property IMAGE_CREATOR_ID =
+ Property.composite(
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "ImageCreatorID"),
+ new Property[] {Property.internalTextBag(IPTC.IMAGE_CREATOR_ID_WRONG_CASE)});
+
+ /**
+ * The name of the creator or creators of the image.
+ *
+ * <p>The creator can be additionally expressed in free-text using the IPTC Core Creator field.
+ * In many countries, the Image Creator must be attributed in association with any use of the
+ * image. The Image Creator, Copyright Owner, Image Supplier and Licensor may be the same or
+ * different entities.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
+ */
+ Property IMAGE_CREATOR_NAME =
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "ImageCreatorName");
+
+ /**
+ * A person or company that should be contacted to obtain a licence for using the item or who
+ * has licensed the item.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
+ */
+ Property LICENSOR =
+ Property.internalTextBag(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Licensor");
+
/**
* @deprecated use {@link IPTC#LICENSOR_ID}
*/
@Deprecated
String LICENSOR_ID_WRONG_CASE =
PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorId";
- /**
- * The ID of the person or company that should be contacted to obtain a licence for
- * using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
- */
- Property LICENSOR_ID = Property.composite(Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorID"),
- new Property[]{Property.internalTextBag(IPTC.LICENSOR_ID_WRONG_CASE)});
- /**
- * The name of the person or company that should be contacted to obtain a licence for
- * using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
- */
- Property LICENSOR_NAME = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorName");
/**
- * The city of a person or company that should be contacted to obtain a licence for
- * using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * The ID of the person or company that should be contacted to obtain a licence for using the
+ * item or who has licensed the item.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR_CITY = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorCity");
+ Property LICENSOR_ID =
+ Property.composite(
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LicensorID"),
+ new Property[] {Property.internalTextBag(IPTC.LICENSOR_ID_WRONG_CASE)});
/**
- * The country of a person or company that should be contacted to obtain a licence for
- * using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * The name of the person or company that should be contacted to obtain a licence for using the
+ * item or who has licensed the item.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR_COUNTRY = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorCountry");
+ Property LICENSOR_NAME =
+ Property.internalTextBag(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorName");
/**
- * The email of a person or company that should be contacted to obtain a licence for
- * using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * The city of a person or company that should be contacted to obtain a licence for using the
+ * item or who has licensed the item.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR_EMAIL = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorEmail");
+ Property LICENSOR_CITY =
+ Property.internalTextBag(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorCity");
+
+ /**
+ * The country of a person or company that should be contacted to obtain a licence for using the
+ * item or who has licensed the item.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
+ */
+ Property LICENSOR_COUNTRY =
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LicensorCountry");
+
+ /**
+ * The email of a person or company that should be contacted to obtain a licence for using the
+ * item or who has licensed the item.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
+ */
+ Property LICENSOR_EMAIL =
+ Property.internalTextBag(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorEmail");
/**
* The extended address of a person or company that should be contacted to obtain a licence for
* using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR_EXTENDED_ADDRESS = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LicensorExtendedAddress");
+ Property LICENSOR_EXTENDED_ADDRESS =
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LicensorExtendedAddress");
/**
- * The postal code of a person or company that should be contacted to obtain a licence for
- * using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * The postal code of a person or company that should be contacted to obtain a licence for using
+ * the item or who has licensed the item.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR_POSTAL_CODE = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorPostalCode");
+ Property LICENSOR_POSTAL_CODE =
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LicensorPostalCode");
/**
- * The region of a person or company that should be contacted to obtain a licence for
- * using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * The region of a person or company that should be contacted to obtain a licence for using the
+ * item or who has licensed the item.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR_REGION = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorRegion");
+ Property LICENSOR_REGION =
+ Property.internalTextBag(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorRegion");
/**
* The street address of a person or company that should be contacted to obtain a licence for
* using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR_STREET_ADDRESS = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorStreetAddress");
+ Property LICENSOR_STREET_ADDRESS =
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LicensorStreetAddress");
/**
* The phone number of a person or company that should be contacted to obtain a licence for
* using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR_TELEPHONE_1 = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorTelephone1");
+ Property LICENSOR_TELEPHONE_1 =
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LicensorTelephone1");
/**
* The phone number of a person or company that should be contacted to obtain a licence for
* using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR_TELEPHONE_2 = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorTelephone2");
+ Property LICENSOR_TELEPHONE_2 =
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LicensorTelephone2");
/**
- * The URL of a person or company that should be contacted to obtain a licence for
- * using the item or who has licensed the item.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * The URL of a person or company that should be contacted to obtain a licence for using the
+ * item or who has licensed the item.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property LICENSOR_URL = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorURL");
+ Property LICENSOR_URL =
+ Property.internalTextBag(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LicensorURL");
/**
- * Age of the youngest model pictured in the image, at the time that the
- * image was made.
- * <p>
- * This age should not be displayed to the public on open web portals and
- * the like. But it may be used by image repositories in a
- * B2B enviroment.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * Age of the youngest model pictured in the image, at the time that the image was made.
+ *
+ * <p>This age should not be displayed to the public on open web portals and the like. But it
+     * may be used by image repositories in a B2B environment.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property MINOR_MODEL_AGE_DISCLOSURE = Property.internalText(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "MinorModelAgeDisclosure");
+ Property MINOR_MODEL_AGE_DISCLOSURE =
+ Property.internalText(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "MinorModelAgeDisclosure");
/**
* Optional identifier associated with each Model Release.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property MODEL_RELEASE_ID = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseID");
+ Property MODEL_RELEASE_ID =
+ Property.internalTextBag(
+ PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseID");
/**
- * Summarizes the availability and scope of model releases authorizing usage
- * of the likenesses of persons appearing in the photograph.
- * <p>
- * It is recommended to apply the PLUS controlled value Unlimited Model
- * Releases (MR- UMR) very carefully and to check the wording of the model
- * release thoroughly before applying it.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * Summarizes the availability and scope of model releases authorizing usage of the likenesses
+ * of persons appearing in the photograph.
+ *
+     * <p>It is recommended to apply the PLUS controlled value Unlimited Model Releases (MR-UMR)
+ * very carefully and to check the wording of the model release thoroughly before applying it.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property MODEL_RELEASE_STATUS = Property.internalText(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ModelReleaseStatus");
+ Property MODEL_RELEASE_STATUS =
+ Property.internalText(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "ModelReleaseStatus");
/**
* Optional identifier associated with each Property Release.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property PROPERTY_RELEASE_ID = Property.internalTextBag(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PropertyReleaseID");
+ Property PROPERTY_RELEASE_ID =
+ Property.internalTextBag(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "PropertyReleaseID");
/**
- * Summarises the availability and scope of property releases authorizing
- * usage of the properties appearing in the photograph.
- * <p>
- * It is recommended to apply the value PR-UPR very carefully and to check
- * the wording of the property release thoroughly before applying it.
- * <p>
- * This is a PLUS version 1.2 property included in the IPTC Extension
- * schema.
+ * Summarises the availability and scope of property releases authorizing usage of the
+ * properties appearing in the photograph.
+ *
+ * <p>It is recommended to apply the value PR-UPR very carefully and to check the wording of the
+ * property release thoroughly before applying it.
+ *
+ * <p>This is a PLUS version 1.2 property included in the IPTC Extension schema.
*/
- Property PROPERTY_RELEASE_STATUS = Property.internalText(
- PREFIX_PLUS + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PropertyReleaseStatus");
+ Property PROPERTY_RELEASE_STATUS =
+ Property.internalText(
+ PREFIX_PLUS
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "PropertyReleaseStatus");
/**
- * Contains any necessary copyright notice for claiming the intellectual
- * property for artwork or an object in the image and should identify the
- * current owner of the copyright of this work with associated intellectual
- * property rights.
+ * Contains any necessary copyright notice for claiming the intellectual property for artwork or
+ * an object in the image and should identify the current owner of the copyright of this work
+ * with associated intellectual property rights.
*/
- Property ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOCopyrightNotice");
+ Property ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "AOCopyrightNotice");
+
+ /** Contains the name of the artist who has created artwork or an object in the image. */
+ Property ARTWORK_OR_OBJECT_DETAIL_CREATOR =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOCreator");
/**
- * Contains the name of the artist who has created artwork or an object in the image.
+ * Designates the date and optionally the time the artwork or object in the image was created.
+ * This relates to artwork or objects with associated intellectual property rights.
*/
- Property ARTWORK_OR_OBJECT_DETAIL_CREATOR = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOCreator");
+ Property ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "AODateCreated");
/**
- * Designates the date and optionally the time the artwork or object in the
- * image was created. This relates to artwork or objects with associated
- * intellectual property rights.
+ * The organisation or body holding and registering the artwork or object in the image for
+ * inventory purposes.
*/
- Property ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AODateCreated");
+ Property ARTWORK_OR_OBJECT_DETAIL_SOURCE =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOSource");
/**
- * The organisation or body holding and registering the artwork or object in
- * the image for inventory purposes.
+ * The inventory number issued by the organisation or body holding and registering the artwork
+ * or object in the image.
*/
- Property ARTWORK_OR_OBJECT_DETAIL_SOURCE = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOSource");
+ Property ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "AOSourceInvNo");
+
+ /** A reference for the artwork or object in the image. */
+ Property ARTWORK_OR_OBJECT_DETAIL_TITLE =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOTitle");
/**
- * The inventory number issued by the organisation or body holding and
- * registering the artwork or object in the image.
+ * Name of the city of a location. This element is at the fourth level of a top-down
+ * geographical hierarchy.
*/
- Property ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOSourceInvNo");
+ Property LOCATION_SHOWN_CITY =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationShownCity");
/**
- * A reference for the artwork or object in the image.
+ * The ISO code of a country of a location. This element is at the second level of a top-down
+ * geographical hierarchy.
+ *
+ * <p>Note 1: an implementer would have to derive from the length of the value string whether
+ * this is the country code from the two or three letter scheme as no explicit indication can be
+ * provided.
*/
- Property ARTWORK_OR_OBJECT_DETAIL_TITLE = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AOTitle");
+ Property LOCATION_SHOWN_COUNTRY_CODE =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationShownCountryCode");
/**
- * Name of the city of a location. This element is at the fourth level of a
- * top-down geographical hierarchy.
+ * The name of a country of a location. This element is at the second level of a top-down
+ * geographical hierarchy.
*/
- Property LOCATION_SHOWN_CITY = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "LocationShownCity");
+ Property LOCATION_SHOWN_COUNTRY_NAME =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationShownCountryName");
/**
- * The ISO code of a country of a location. This element is at the second
- * level of a top-down geographical hierarchy.
- * <p>
- * Note 1: an implementer would have to derive from the length of the value
- * string whether this is the country code from the two or three letter
- * scheme as no explicit indication can be provided.
+ * The name of a subregion of a country - a province or state - of a location. This element is
+ * at the third level of a top-down geographical hierarchy.
*/
- Property LOCATION_SHOWN_COUNTRY_CODE = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationShownCountryCode");
+ Property LOCATION_SHOWN_PROVINCE_OR_STATE =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationShownProvinceState");
/**
- * The name of a country of a location. This element is at the second level
- * of a top-down geographical hierarchy.
- */
- Property LOCATION_SHOWN_COUNTRY_NAME = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationShownCountryName");
-
- /**
- * The name of a subregion of a country - a province or state - of a
- * location. This element is at the third level of a top-down geographical
+ * Name of a sublocation. This sublocation name could either be the name of a sublocation to a
+ * city or the name of a well known location or (natural) monument outside a city. In the sense
+ * of a sublocation to a city this element is at the fifth level of a top-down geographical
* hierarchy.
*/
- Property LOCATION_SHOWN_PROVINCE_OR_STATE = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationShownProvinceState");
+ Property LOCATION_SHOWN_SUBLOCATION =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationShownSublocation");
/**
- * Name of a sublocation. This sublocation name could either be the name of
- * a sublocation to a city or the name of a well known location or (natural)
- * monument outside a city. In the sense of a sublocation to a city this
- * element is at the fifth level of a top-down geographical hierarchy.
+     * The name of a world region of a location. This element is at the first (top) level of a
+     * top-down geographical hierarchy.
*/
- Property LOCATION_SHOWN_SUBLOCATION = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationShownSublocation");
+ Property LOCATION_SHOWN_WORLD_REGION =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationShownWorldRegion");
/**
- * The name of a world region of a location. This element is at the first
- * (topI) level of a top- down geographical hierarchy.
+ * Name of the city of a location. This element is at the fourth level of a top-down
+ * geographical hierarchy.
*/
- Property LOCATION_SHOWN_WORLD_REGION = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationShownWorldRegion");
+ Property LOCATION_CREATED_CITY =
+ Property.internalText(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationCreatedCity");
/**
- * Name of the city of a location. This element is at the fourth level of a
- * top-down geographical hierarchy.
+ * The ISO code of a country of a location. This element is at the second level of a top-down
+ * geographical hierarchy.
+ *
+ * <p>Note 1: an implementer would have to derive from the length of the value string whether
+ * this is the country code from the two or three letter scheme as no explicit indication can be
+ * provided.
*/
- Property LOCATION_CREATED_CITY = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationCreatedCity");
+ Property LOCATION_CREATED_COUNTRY_CODE =
+ Property.internalText(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationCreatedCountryCode");
/**
- * The ISO code of a country of a location. This element is at the second
- * level of a top-down geographical hierarchy.
- * <p>
- * Note 1: an implementer would have to derive from the length of the value
- * string whether this is the country code from the two or three letter
- * scheme as no explicit indication can be provided.
+ * The name of a country of a location. This element is at the second level of a top-down
+ * geographical hierarchy.
*/
- Property LOCATION_CREATED_COUNTRY_CODE = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationCreatedCountryCode");
+ Property LOCATION_CREATED_COUNTRY_NAME =
+ Property.internalText(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationCreatedCountryName");
/**
- * The name of a country of a location. This element is at the second level
- * of a top-down geographical hierarchy.
+ * The name of a subregion of a country - a province or state - of a location. This element is
+ * at the third level of a top-down geographical hierarchy.
*/
- Property LOCATION_CREATED_COUNTRY_NAME = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationCreatedCountryName");
+ Property LOCATION_CREATED_PROVINCE_OR_STATE =
+ Property.internalText(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationCreatedProvinceState");
/**
- * The name of a subregion of a country - a province or state - of a
- * location. This element is at the third level of a top-down geographical
+ * Name of a sublocation. This sublocation name could either be the name of a sublocation to a
+ * city or the name of a well known location or (natural) monument outside a city. In the sense
+ * of a sublocation to a city this element is at the fifth level of a top-down geographical
* hierarchy.
*/
- Property LOCATION_CREATED_PROVINCE_OR_STATE = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationCreatedProvinceState");
+ Property LOCATION_CREATED_SUBLOCATION =
+ Property.internalText(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationCreatedSublocation");
/**
- * Name of a sublocation. This sublocation name could either be the name of
- * a sublocation to a city or the name of a well known location or (natural)
- * monument outside a city. In the sense of a sublocation to a city this
- * element is at the fifth level of a top-down geographical hierarchy.
+     * The name of a world region of a location. This element is at the first (top) level of a
+     * top-down geographical hierarchy.
*/
- Property LOCATION_CREATED_SUBLOCATION = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationCreatedSublocation");
+ Property LOCATION_CREATED_WORLD_REGION =
+ Property.internalText(
+ PREFIX_IPTC_EXT
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LocationCreatedWorldRegion");
/**
- * The name of a world region of a location. This element is at the first
- * (topI) level of a top- down geographical hierarchy.
+ * A unique identifier created by a registry and applied by the creator of the item. This value
+ * shall not be changed after being applied. This identifier is linked to a corresponding
+ * Registry Organisation Identifier.
*/
- Property LOCATION_CREATED_WORLD_REGION = Property.internalText(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LocationCreatedWorldRegion");
+ Property REGISTRY_ENTRY_CREATED_ITEM_ID =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegItemId");
- /**
- * A unique identifier created by a registry and applied by the creator of
- * the item. This value shall not be changed after being applied. This
- * identifier is linked to a corresponding Registry Organisation Identifier.
- */
- Property REGISTRY_ENTRY_CREATED_ITEM_ID = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegItemId");
-
- /**
- * An identifier for the registry which issued the corresponding Registry Image Id.
- */
- Property REGISTRY_ENTRY_CREATED_ORGANISATION_ID = Property.internalTextBag(
- PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegOrgId");
-
+ /** An identifier for the registry which issued the corresponding Registry Image Id. */
+ Property REGISTRY_ENTRY_CREATED_ORGANISATION_ID =
+ Property.internalTextBag(
+ PREFIX_IPTC_EXT + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "RegOrgId");
Property[] PROPERTY_GROUP_IPTC_CORE =
- new Property[]{CITY, COUNTRY, COUNTRY_CODE, DESCRIPTION, HEADLINE, INTELLECTUAL_GENRE,
- KEYWORDS, PROVINCE_OR_STATE, SCENE_CODE, SUBJECT_CODE, SUBLOCATION,
- DATE_CREATED, DESCRIPTION_WRITER, INSTRUCTIONS, JOB_ID, TITLE, COPYRIGHT_NOTICE,
- CREATOR, CREATORS_JOB_TITLE, CREDIT_LINE, RIGHTS_USAGE_TERMS, SOURCE,
- CONTACT_INFO_ADDRESS, CONTACT_INFO_CITY, CONTACT_INFO_COUNTRY,
- CONTACT_INFO_EMAIL, CONTACT_INFO_PHONE, CONTACT_INFO_POSTAL_CODE,
- CONTACT_INFO_STATE_PROVINCE, CONTACT_INFO_WEB_URL};
+ new Property[] {
+ CITY,
+ COUNTRY,
+ COUNTRY_CODE,
+ DESCRIPTION,
+ HEADLINE,
+ INTELLECTUAL_GENRE,
+ KEYWORDS,
+ PROVINCE_OR_STATE,
+ SCENE_CODE,
+ SUBJECT_CODE,
+ SUBLOCATION,
+ DATE_CREATED,
+ DESCRIPTION_WRITER,
+ INSTRUCTIONS,
+ JOB_ID,
+ TITLE,
+ COPYRIGHT_NOTICE,
+ CREATOR,
+ CREATORS_JOB_TITLE,
+ CREDIT_LINE,
+ RIGHTS_USAGE_TERMS,
+ SOURCE,
+ CONTACT_INFO_ADDRESS,
+ CONTACT_INFO_CITY,
+ CONTACT_INFO_COUNTRY,
+ CONTACT_INFO_EMAIL,
+ CONTACT_INFO_PHONE,
+ CONTACT_INFO_POSTAL_CODE,
+ CONTACT_INFO_STATE_PROVINCE,
+ CONTACT_INFO_WEB_URL
+ };
Property[] PROPERTY_GROUP_IPTC_EXT =
- new Property[]{ADDITIONAL_MODEL_INFO, ORGANISATION_CODE, CONTROLLED_VOCABULARY_TERM,
- MODEL_AGE, ORGANISATION_NAME, PERSON, DIGITAL_IMAGE_GUID, DIGITAL_SOURCE_TYPE,
- EVENT, IMAGE_SUPPLIER_ID, IMAGE_SUPPLIER_NAME, IMAGE_SUPPLIER_IMAGE_ID,
- IPTC_LAST_EDITED, MAX_AVAIL_HEIGHT, MAX_AVAIL_WIDTH, PLUS_VERSION,
- COPYRIGHT_OWNER_ID, COPYRIGHT_OWNER_NAME, IMAGE_CREATOR_ID, IMAGE_CREATOR_NAME,
- LICENSOR_ID, LICENSOR_NAME, LICENSOR_CITY, LICENSOR_COUNTRY, LICENSOR_EMAIL,
- LICENSOR_EXTENDED_ADDRESS, LICENSOR_POSTAL_CODE, LICENSOR_REGION,
- LICENSOR_STREET_ADDRESS, LICENSOR_TELEPHONE_1, LICENSOR_TELEPHONE_2,
- LICENSOR_URL, MINOR_MODEL_AGE_DISCLOSURE, MODEL_RELEASE_ID,
- MODEL_RELEASE_STATUS, PROPERTY_RELEASE_ID, PROPERTY_RELEASE_STATUS,
- ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE, ARTWORK_OR_OBJECT_DETAIL_CREATOR,
- ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED, ARTWORK_OR_OBJECT_DETAIL_SOURCE,
- ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER,
- ARTWORK_OR_OBJECT_DETAIL_TITLE, LOCATION_SHOWN_CITY,
- LOCATION_SHOWN_COUNTRY_CODE, LOCATION_SHOWN_COUNTRY_NAME,
- LOCATION_SHOWN_PROVINCE_OR_STATE, LOCATION_SHOWN_SUBLOCATION,
- LOCATION_SHOWN_WORLD_REGION, LOCATION_CREATED_CITY,
- LOCATION_CREATED_COUNTRY_CODE, LOCATION_CREATED_COUNTRY_NAME,
- LOCATION_CREATED_PROVINCE_OR_STATE, LOCATION_CREATED_SUBLOCATION,
- LOCATION_CREATED_WORLD_REGION, REGISTRY_ENTRY_CREATED_ITEM_ID,
- REGISTRY_ENTRY_CREATED_ORGANISATION_ID};
+ new Property[] {
+ ADDITIONAL_MODEL_INFO,
+ ORGANISATION_CODE,
+ CONTROLLED_VOCABULARY_TERM,
+ MODEL_AGE,
+ ORGANISATION_NAME,
+ PERSON,
+ DIGITAL_IMAGE_GUID,
+ DIGITAL_SOURCE_TYPE,
+ EVENT,
+ IMAGE_SUPPLIER_ID,
+ IMAGE_SUPPLIER_NAME,
+ IMAGE_SUPPLIER_IMAGE_ID,
+ IPTC_LAST_EDITED,
+ MAX_AVAIL_HEIGHT,
+ MAX_AVAIL_WIDTH,
+ PLUS_VERSION,
+ COPYRIGHT_OWNER_ID,
+ COPYRIGHT_OWNER_NAME,
+ IMAGE_CREATOR_ID,
+ IMAGE_CREATOR_NAME,
+ LICENSOR_ID,
+ LICENSOR_NAME,
+ LICENSOR_CITY,
+ LICENSOR_COUNTRY,
+ LICENSOR_EMAIL,
+ LICENSOR_EXTENDED_ADDRESS,
+ LICENSOR_POSTAL_CODE,
+ LICENSOR_REGION,
+ LICENSOR_STREET_ADDRESS,
+ LICENSOR_TELEPHONE_1,
+ LICENSOR_TELEPHONE_2,
+ LICENSOR_URL,
+ MINOR_MODEL_AGE_DISCLOSURE,
+ MODEL_RELEASE_ID,
+ MODEL_RELEASE_STATUS,
+ PROPERTY_RELEASE_ID,
+ PROPERTY_RELEASE_STATUS,
+ ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE,
+ ARTWORK_OR_OBJECT_DETAIL_CREATOR,
+ ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED,
+ ARTWORK_OR_OBJECT_DETAIL_SOURCE,
+ ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER,
+ ARTWORK_OR_OBJECT_DETAIL_TITLE,
+ LOCATION_SHOWN_CITY,
+ LOCATION_SHOWN_COUNTRY_CODE,
+ LOCATION_SHOWN_COUNTRY_NAME,
+ LOCATION_SHOWN_PROVINCE_OR_STATE,
+ LOCATION_SHOWN_SUBLOCATION,
+ LOCATION_SHOWN_WORLD_REGION,
+ LOCATION_CREATED_CITY,
+ LOCATION_CREATED_COUNTRY_CODE,
+ LOCATION_CREATED_COUNTRY_NAME,
+ LOCATION_CREATED_PROVINCE_OR_STATE,
+ LOCATION_CREATED_SUBLOCATION,
+ LOCATION_CREATED_WORLD_REGION,
+ REGISTRY_ENTRY_CREATED_ITEM_ID,
+ REGISTRY_ENTRY_CREATED_ORGANISATION_ID
+ };
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java b/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java
index 44faa14..26033a8 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/MachineMetadata.java
@@ -16,10 +16,7 @@
*/
package org.apache.tika.metadata;
-/**
- * Metadata for describing machines, such as their
- * architecture, type and endian-ness
- */
+/** Metadata for describing machines, such as their architecture, type and endian-ness */
public interface MachineMetadata {
String PREFIX = "machine:";
@@ -40,9 +37,19 @@
String PLATFORM_WINDOWS = "Windows";
Property PLATFORM =
- Property.internalClosedChoise(PREFIX + "platform", PLATFORM_SYSV, PLATFORM_HPUX,
- PLATFORM_NETBSD, PLATFORM_LINUX, PLATFORM_SOLARIS, PLATFORM_AIX, PLATFORM_IRIX,
- PLATFORM_FREEBSD, PLATFORM_TRU64, PLATFORM_ARM, PLATFORM_EMBEDDED,
+ Property.internalClosedChoise(
+ PREFIX + "platform",
+ PLATFORM_SYSV,
+ PLATFORM_HPUX,
+ PLATFORM_NETBSD,
+ PLATFORM_LINUX,
+ PLATFORM_SOLARIS,
+ PLATFORM_AIX,
+ PLATFORM_IRIX,
+ PLATFORM_FREEBSD,
+ PLATFORM_TRU64,
+ PLATFORM_ARM,
+ PLATFORM_EMBEDDED,
PLATFORM_WINDOWS);
String MACHINE_x86_32 = "x86-32";
@@ -66,10 +73,26 @@
String MACHINE_UNKNOWN = "Unknown";
Property MACHINE_TYPE =
- Property.internalClosedChoise(PREFIX + "machineType", MACHINE_x86_32, MACHINE_x86_64,
- MACHINE_IA_64, MACHINE_SPARC, MACHINE_M68K, MACHINE_M88K, MACHINE_MIPS,
- MACHINE_PPC, MACHINE_S370, MACHINE_S390, MACHINE_ARM, MACHINE_VAX,
- MACHINE_ALPHA, MACHINE_EFI, MACHINE_M32R, MACHINE_SH3, MACHINE_SH4, MACHINE_SH5,
+ Property.internalClosedChoise(
+ PREFIX + "machineType",
+ MACHINE_x86_32,
+ MACHINE_x86_64,
+ MACHINE_IA_64,
+ MACHINE_SPARC,
+ MACHINE_M68K,
+ MACHINE_M88K,
+ MACHINE_MIPS,
+ MACHINE_PPC,
+ MACHINE_S370,
+ MACHINE_S390,
+ MACHINE_ARM,
+ MACHINE_VAX,
+ MACHINE_ALPHA,
+ MACHINE_EFI,
+ MACHINE_M32R,
+ MACHINE_SH3,
+ MACHINE_SH4,
+ MACHINE_SH5,
MACHINE_UNKNOWN);
Property ENDIAN =
Property.internalClosedChoise(PREFIX + "endian", Endian.LITTLE.name, Endian.BIG.name);
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Message.java b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
index fcb1421..d0d6790 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Message.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Message.java
@@ -18,8 +18,8 @@
/**
* A collection of Message related property names.
- * <p>
- * See also {@link Office}'s MAPI-specific properties.
+ *
+ * <p>See also {@link Office}'s MAPI-specific properties.
*/
public interface Message {
String MESSAGE_PREFIX = "Message" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
@@ -42,51 +42,51 @@
String MULTIPART_BOUNDARY = "Multipart-Boundary";
/**
- * Where possible, this records the value from the name field.
- * Even in MAPI messages, though, this can be an email address.
+ * Where possible, this records the value from the name field. Even in MAPI messages, though,
+ * this can be an email address.
*/
Property MESSAGE_FROM_NAME = Property.internalTextBag(MESSAGE_PREFIX + "From-Name");
/**
- * Where possible, this records the value from the name field.
- * Even in MAPI messages, though, this can be a name.
- * <p/>
- * Note that the value may also be an X400/x500 Exchange format:
- * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other
+ * Where possible, this records the value from the name field. Even in MAPI messages, though,
+ * this can be a name.
+ *
+ * <p>Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange
+ * Administrative Group/cn=Recipients/cn=someone.or.other
*/
Property MESSAGE_FROM_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "From-Email");
/**
- * In Outlook messages, there are sometimes separate fields for "to-name" and
- * "to-display-name" name.
+ * In Outlook messages, there are sometimes separate fields for "to-name" and "to-display-name"
+ * name.
*/
Property MESSAGE_TO_NAME = Property.internalTextBag(MESSAGE_PREFIX + "To-Name");
Property MESSAGE_TO_DISPLAY_NAME = Property.internalTextBag(MESSAGE_PREFIX + "To-Display-Name");
/**
- * Where possible, this records the email value in the to field.
- * Even in MAPI messages, though, this can be a name.
- * <p/>
- * Note that the value may also be an X400/x500 Exchange format:
- * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other
+ * Where possible, this records the email value in the to field. Even in MAPI messages, though,
+ * this can be a name.
+ *
+ * <p>Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange
+ * Administrative Group/cn=Recipients/cn=someone.or.other
*/
Property MESSAGE_TO_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "To-Email");
/**
- * In Outlook messages, there are sometimes separate fields for "cc-name" and
- * "cc-display-name" name.
+ * In Outlook messages, there are sometimes separate fields for "cc-name" and "cc-display-name"
+ * name.
*/
Property MESSAGE_CC_NAME = Property.internalTextBag(MESSAGE_PREFIX + "CC-Name");
Property MESSAGE_CC_DISPLAY_NAME = Property.internalTextBag(MESSAGE_PREFIX + "CC-Display-Name");
/**
- * Where possible, this records the email value in the cc field.
- * Even in MAPI messages, though, this can be a name.
- * <p/>
- * Note that the value may also be an X400/x500 Exchange format:
- * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other
+ * Where possible, this records the email value in the cc field. Even in MAPI messages, though,
+ * this can be a name.
+ *
+ * <p>Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange
+ * Administrative Group/cn=Recipients/cn=someone.or.other
*/
Property MESSAGE_CC_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "CC-Email");
@@ -100,12 +100,11 @@
Property.internalTextBag(MESSAGE_PREFIX + "BCC-Display-Name");
/**
- * Where possible, this records the email value in the bcc field.
- * Even in MAPI messages, though, this can be a name.
- * <p/>
- * Note that the value may also be an X400/x500 Exchange format:
- * /o=ExchangeLabs/ou=Exchange Administrative Group/cn=Recipients/cn=someone.or.other
+ * Where possible, this records the email value in the bcc field. Even in MAPI messages, though,
+ * this can be a name.
+ *
+ * <p>Note that the value may also be an X400/x500 Exchange format: /o=ExchangeLabs/ou=Exchange
+ * Administrative Group/cn=Recipients/cn=someone.or.other
*/
Property MESSAGE_BCC_EMAIL = Property.internalTextBag(MESSAGE_PREFIX + "BCC-Email");
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
index 9b8e3b8..70c0162 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
@@ -33,75 +33,74 @@
import java.util.Objects;
import java.util.Properties;
import java.util.TimeZone;
-
import org.apache.tika.metadata.Property.PropertyType;
import org.apache.tika.metadata.writefilter.MetadataWriteFilter;
import org.apache.tika.utils.DateUtils;
-/**
- * A multi-valued metadata container.
- */
+/** A multi-valued metadata container. */
public class Metadata
- implements CreativeCommons, Geographic, HttpHeaders, Message, ClimateForcast, TIFF,
- TikaMimeKeys, Serializable {
+ implements CreativeCommons,
+ Geographic,
+ HttpHeaders,
+ Message,
+ ClimateForcast,
+ TIFF,
+ TikaMimeKeys,
+ Serializable {
+ private static final MetadataWriteFilter ACCEPT_ALL =
+ new MetadataWriteFilter() {
+ @Override
+ public void filterExisting(Map<String, String[]> data) {
+ // no-op
+ }
- private static final MetadataWriteFilter ACCEPT_ALL = new MetadataWriteFilter() {
- @Override
- public void filterExisting(Map<String, String[]> data) {
- //no-op
- }
+ @Override
+ public void add(String field, String value, Map<String, String[]> data) {
+ String[] values = data.get(field);
+ if (values == null) {
+ set(field, value, data);
+ } else {
+ data.put(field, appendValues(values, value));
+ }
+ }
- @Override
- public void add(String field, String value, Map<String, String[]> data) {
- String[] values = data.get(field);
- if (values == null) {
- set(field, value, data);
- } else {
- data.put(field, appendValues(values, value));
- }
- }
+ // legacy behavior -- remove the field if value is null
+ @Override
+ public void set(String field, String value, Map<String, String[]> data) {
+ if (value != null) {
+ data.put(field, new String[] {value});
+ } else {
+ data.remove(field);
+ }
+ }
- //legacy behavior -- remove the field if value is null
- @Override
- public void set(String field, String value, Map<String, String[]> data) {
- if (value != null) {
- data.put(field, new String[]{ value });
- } else {
- data.remove(field);
- }
- }
+ private String[] appendValues(String[] values, final String value) {
+ if (value == null) {
+ return values;
+ }
+ String[] newValues = new String[values.length + 1];
+ System.arraycopy(values, 0, newValues, 0, values.length);
+ newValues[newValues.length - 1] = value;
+ return newValues;
+ }
+ };
- private String[] appendValues(String[] values, final String value) {
- if (value == null) {
- return values;
- }
- String[] newValues = new String[values.length + 1];
- System.arraycopy(values, 0, newValues, 0, values.length);
- newValues[newValues.length - 1] = value;
- return newValues;
- }
- };
-
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 5623926545693153182L;
+
/**
- * Some parsers will have the date as a ISO-8601 string
- * already, and will set that into the Metadata object.
+     * Some parsers will have the date as an ISO-8601 string already, and will set that into the
+     * Metadata object.
*/
private static final DateUtils DATE_UTILS = new DateUtils();
- /**
- * A map of all metadata attributes.
- */
+
+ /** A map of all metadata attributes. */
private Map<String, String[]> metadata = null;
-
private MetadataWriteFilter writeFilter = ACCEPT_ALL;
- /**
- * Constructs a new, empty metadata.
- */
+
+ /** Constructs a new, empty metadata. */
public Metadata() {
metadata = new HashMap<>();
}
@@ -115,8 +114,8 @@
}
/**
- * Parses the given date string. This method is synchronized to prevent
- * concurrent access to the thread-unsafe date formats.
+ * Parses the given date string. This method is synchronized to prevent concurrent access to the
+ * thread-unsafe date formats.
*
* @param date date string
* @return parsed date, or <code>null</code> if the date can't be parsed
@@ -133,8 +132,8 @@
* @return true is named value is multivalued, false if single value or null
*/
public boolean isMultiValued(final Property property) {
- return metadata.get(property.getName()) != null &&
- metadata.get(property.getName()).length > 1;
+ return metadata.get(property.getName()) != null
+ && metadata.get(property.getName()).length > 1;
}
/**
@@ -157,8 +156,8 @@
}
/**
- * Get the value associated to a metadata name. If many values are assiociated
- * to the specified name, then the first one is returned.
+     * Get the value associated to a metadata name. If many values are associated to the specified
+     * name, then the first one is returned.
*
* @param name of the metadata.
* @return the value associated to the specified metadata name.
@@ -173,13 +172,12 @@
}
/**
- * Sets the writeFilter that is called before {@link #set(String, String)}
- * {@link #set(String, String[])}, {@link #add(String, String)},
- * {@link #add(String, String[])}. The default is {@link #ACCEPT_ALL}.
+ * Sets the writeFilter that is called before {@link #set(String, String)} {@link #set(String,
+ * String[])}, {@link #add(String, String)}, {@link #add(String, String[])}. The default is
+ * {@link #ACCEPT_ALL}.
*
- * This is intended for expert use only. Some parsers rely on metadata
- * during the parse, and if the metadata they need is excluded, they
- * will not function properly.
+ * <p>This is intended for expert use only. Some parsers rely on metadata during the parse, and
+ * if the metadata they need is excluded, they will not function properly.
*
* @param writeFilter
* @since 2.4.0
@@ -204,8 +202,8 @@
* Returns the value of the identified Integer based metadata property.
*
* @param property simple integer property definition
- * @return property value as a Integer, or <code>null</code> if the property is not set, or
- * not a valid Integer
+ * @return property value as a Integer, or <code>null</code> if the property is not set, or not
+ * a valid Integer
* @since Apache Tika 0.8
*/
public Integer getInt(Property property) {
@@ -231,8 +229,8 @@
* Returns the value of the identified Date based metadata property.
*
* @param property simple date property definition
- * @return property value as a Date, or <code>null</code> if the property is not set, or not
- * a valid Date
+ * @return property value as a Date, or <code>null</code> if the property is not set, or not a
+ * valid Date
* @since Apache Tika 0.8
*/
public Date getDate(Property property) {
@@ -280,10 +278,10 @@
}
/**
- * Add a metadata name/value mapping. Add the specified value to the list of
- * values associated to the specified metadata name.
+ * Add a metadata name/value mapping. Add the specified value to the list of values associated
+ * to the specified metadata name.
*
- * @param name the metadata name.
+ * @param name the metadata name.
* @param value the metadata value.
*/
public void add(final String name, final String value) {
@@ -291,10 +289,10 @@
}
/**
- * Add a metadata name/value mapping. Add the specified value to the list of
- * values associated to the specified metadata name.
+ * Add a metadata name/value mapping. Add the specified value to the list of values associated
+ * to the specified metadata name.
*
- * @param name the metadata name.
+ * @param name the metadata name.
* @param newValues the metadata values
*/
protected void add(final String name, final String[] newValues) {
@@ -309,11 +307,11 @@
}
/**
- * Add a metadata property/value mapping. Add the specified value to the list of
- * values associated to the specified metadata property.
+ * Add a metadata property/value mapping. Add the specified value to the list of values
+ * associated to the specified metadata property.
*
* @param property the metadata property.
- * @param value the metadata value.
+ * @param value the metadata value.
*/
public void add(final Property property, final String value) {
@@ -353,17 +351,17 @@
Enumeration<String> names = (Enumeration<String>) properties.propertyNames();
while (names.hasMoreElements()) {
String name = names.nextElement();
- metadata.put(name, new String[]{properties.getProperty(name)});
+ metadata.put(name, new String[] {properties.getProperty(name)});
}
}
/**
- * Set metadata name/value. Associate the specified value to the specified
- * metadata name. If some previous values were associated to this name,
- * they are removed. If the given value is <code>null</code>, then the
- * metadata entry is removed.
+ * Set metadata name/value. Associate the specified value to the specified metadata name. If
+ * some previous values were associated to this name, they are removed. If the given value is
+ * <code>
+ * null</code>, then the metadata entry is removed.
*
- * @param name the metadata name.
+ * @param name the metadata name.
* @param value the metadata value, or <code>null</code>
*/
public void set(String name, String value) {
@@ -371,8 +369,8 @@
}
protected void set(String name, String[] values) {
- //TODO: optimize this to not copy if all
- //values are to be included "as is"
+ // TODO: optimize this to not copy if all
+ // values are to be included "as is"
if (values != null) {
metadata.remove(name);
for (String v : values) {
@@ -387,7 +385,7 @@
* Sets the value of the identified metadata property.
*
* @param property property definition
- * @param value property value
+ * @param value property value
* @since Apache Tika 0.7
*/
public void set(Property property, String value) {
@@ -410,7 +408,7 @@
* Sets the values of the identified metadata property.
*
* @param property property definition
- * @param values property values
+ * @param values property values
* @since Apache Tika 1.2
*/
public void set(Property property, String[] values) {
@@ -433,17 +431,17 @@
* Sets the integer value of the identified metadata property.
*
* @param property simple integer property definition
- * @param value property value
+ * @param value property value
* @since Apache Tika 0.8
*/
public void set(Property property, int value) {
if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
- throw new PropertyTypeException(Property.PropertyType.SIMPLE,
- property.getPrimaryProperty().getPropertyType());
+ throw new PropertyTypeException(
+ Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType());
}
if (property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) {
- throw new PropertyTypeException(Property.ValueType.INTEGER,
- property.getPrimaryProperty().getValueType());
+ throw new PropertyTypeException(
+ Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType());
}
set(property, Integer.toString(value));
}
@@ -452,35 +450,36 @@
* Sets the integer value of the identified metadata property.
*
* @param property simple integer property definition
- * @param value property value
+ * @param value property value
* @since Apache Tika 0.8
*/
public void set(Property property, long value) {
if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
- throw new PropertyTypeException(Property.PropertyType.SIMPLE,
- property.getPrimaryProperty().getPropertyType());
+ throw new PropertyTypeException(
+ Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType());
}
if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL) {
- throw new PropertyTypeException(Property.ValueType.REAL,
- property.getPrimaryProperty().getValueType());
+ throw new PropertyTypeException(
+ Property.ValueType.REAL, property.getPrimaryProperty().getValueType());
}
set(property, Long.toString(value));
}
+
/**
* Sets the integer value of the identified metadata property.
*
* @param property simple integer property definition
- * @param value property value
+ * @param value property value
* @since Apache Tika 2.1.1
*/
public void set(Property property, boolean value) {
if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
- throw new PropertyTypeException(Property.PropertyType.SIMPLE,
- property.getPrimaryProperty().getPropertyType());
+ throw new PropertyTypeException(
+ Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType());
}
if (property.getPrimaryProperty().getValueType() != Property.ValueType.BOOLEAN) {
- throw new PropertyTypeException(Property.ValueType.BOOLEAN,
- property.getPrimaryProperty().getValueType());
+ throw new PropertyTypeException(
+ Property.ValueType.BOOLEAN, property.getPrimaryProperty().getValueType());
}
set(property, Boolean.toString(value));
}
@@ -489,17 +488,17 @@
* Adds the integer value of the identified metadata property.
*
* @param property seq integer property definition
- * @param value property value
+ * @param value property value
* @since Apache Tika 1.21
*/
public void add(Property property, int value) {
if (property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) {
- throw new PropertyTypeException(PropertyType.SEQ,
- property.getPrimaryProperty().getPropertyType());
+ throw new PropertyTypeException(
+ PropertyType.SEQ, property.getPrimaryProperty().getPropertyType());
}
if (property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) {
- throw new PropertyTypeException(Property.ValueType.INTEGER,
- property.getPrimaryProperty().getValueType());
+ throw new PropertyTypeException(
+ Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType());
}
add(property, Integer.toString(value));
}
@@ -513,12 +512,12 @@
*/
public int[] getIntValues(Property property) {
if (property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) {
- throw new PropertyTypeException(PropertyType.SEQ,
- property.getPrimaryProperty().getPropertyType());
+ throw new PropertyTypeException(
+ PropertyType.SEQ, property.getPrimaryProperty().getPropertyType());
}
if (property.getPrimaryProperty().getValueType() != Property.ValueType.INTEGER) {
- throw new PropertyTypeException(Property.ValueType.INTEGER,
- property.getPrimaryProperty().getValueType());
+ throw new PropertyTypeException(
+ Property.ValueType.INTEGER, property.getPrimaryProperty().getValueType());
}
String[] vals = getValues(property);
int[] ret = new int[vals.length];
@@ -537,12 +536,12 @@
*/
public long[] getLongValues(Property property) {
if (property.getPrimaryProperty().getPropertyType() != PropertyType.SEQ) {
- throw new PropertyTypeException(PropertyType.SEQ,
- property.getPrimaryProperty().getPropertyType());
+ throw new PropertyTypeException(
+ PropertyType.SEQ, property.getPrimaryProperty().getPropertyType());
}
if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL) {
- throw new PropertyTypeException(Property.ValueType.REAL,
- property.getPrimaryProperty().getValueType());
+ throw new PropertyTypeException(
+ Property.ValueType.REAL, property.getPrimaryProperty().getValueType());
}
String[] vals = getValues(property);
long[] ret = new long[vals.length];
@@ -556,14 +555,14 @@
* Sets the real or rational value of the identified metadata property.
*
* @param property simple real or simple rational property definition
- * @param value property value
+ * @param value property value
* @since Apache Tika 0.8
*/
public void set(Property property, double value) {
- if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL &&
- property.getPrimaryProperty().getValueType() != Property.ValueType.RATIONAL) {
- throw new PropertyTypeException(Property.ValueType.REAL,
- property.getPrimaryProperty().getValueType());
+ if (property.getPrimaryProperty().getValueType() != Property.ValueType.REAL
+ && property.getPrimaryProperty().getValueType() != Property.ValueType.RATIONAL) {
+ throw new PropertyTypeException(
+ Property.ValueType.REAL, property.getPrimaryProperty().getValueType());
}
set(property, Double.toString(value));
}
@@ -572,17 +571,17 @@
* Sets the date value of the identified metadata property.
*
* @param property simple integer property definition
- * @param date property value
+ * @param date property value
* @since Apache Tika 0.8
*/
public void set(Property property, Date date) {
if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
- throw new PropertyTypeException(Property.PropertyType.SIMPLE,
- property.getPrimaryProperty().getPropertyType());
+ throw new PropertyTypeException(
+ Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType());
}
if (property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) {
- throw new PropertyTypeException(Property.ValueType.DATE,
- property.getPrimaryProperty().getValueType());
+ throw new PropertyTypeException(
+ Property.ValueType.DATE, property.getPrimaryProperty().getValueType());
}
String dateString = null;
if (date != null) {
@@ -595,17 +594,17 @@
* Sets the date value of the identified metadata property.
*
* @param property simple integer property definition
- * @param date property value
+ * @param date property value
* @since Apache Tika 0.8
*/
public void set(Property property, Calendar date) {
if (property.getPrimaryProperty().getPropertyType() != Property.PropertyType.SIMPLE) {
- throw new PropertyTypeException(Property.PropertyType.SIMPLE,
- property.getPrimaryProperty().getPropertyType());
+ throw new PropertyTypeException(
+ Property.PropertyType.SIMPLE, property.getPrimaryProperty().getPropertyType());
}
if (property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) {
- throw new PropertyTypeException(Property.ValueType.DATE,
- property.getPrimaryProperty().getValueType());
+ throw new PropertyTypeException(
+ Property.ValueType.DATE, property.getPrimaryProperty().getValueType());
}
String dateString = null;
if (date != null) {
@@ -618,13 +617,13 @@
* Adds the date value of the identified metadata property.
*
* @param property simple calendar property definition
- * @param date property value
+ * @param date property value
* @since Apache Tika 2.5.0
*/
public void add(Property property, Calendar date) {
if (property.getPrimaryProperty().getValueType() != Property.ValueType.DATE) {
- throw new PropertyTypeException(Property.ValueType.DATE,
- property.getPrimaryProperty().getValueType());
+ throw new PropertyTypeException(
+ Property.ValueType.DATE, property.getPrimaryProperty().getValueType());
}
String dateString = null;
if (date != null) {
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 2a9e428..bbfbc6e 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -17,11 +17,9 @@
package org.apache.tika.metadata;
/**
- * Office Document properties collection. These properties apply to
- * Office / Productivity Documents of all forms, including (but not limited
- * to) MS Office and OpenDocument formats.
- * This is a logical collection of properties, which may be drawn from a
- * few different external definitions.
+ * Office Document properties collection. These properties apply to Office / Productivity Documents
+ * of all forms, including (but not limited to) MS Office and OpenDocument formats. This is a
+ * logical collection of properties, which may be drawn from a few different external definitions.
*
* @since Apache Tika 1.2
*/
@@ -31,157 +29,190 @@
String PREFIX_DOC_META = "meta";
/**
- * For user defined metadata entries in the document,
- * what prefix should be attached to the key names.
- * eg <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes custom:Info1=Text1
+ * For user defined metadata entries in the document, what prefix should be attached to the key
+ * names. e.g. <meta:user-defined meta:name="Info1">Text1</meta:user-defined> becomes
+ * custom:Info1=Text1
*/
String USER_DEFINED_METADATA_NAME_PREFIX = "custom:";
+ /** Keywords pertaining to a document. Also populates {@link DublinCore#SUBJECT}. */
+ Property KEYWORDS =
+ Property.composite(
+ Property.internalTextBag(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "keyword"),
+ new Property[] {
+ DublinCore.SUBJECT,
+ });
+
+ /** Name of the initial creator/author of a document */
+ Property INITIAL_AUTHOR =
+ Property.internalText(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "initial-author");
+
+ /** Name of the last (most recent) author of a document */
+ Property LAST_AUTHOR =
+ Property.internalText(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "last-author");
+
+ /** Name of the principal author(s) of a document */
+ Property AUTHOR =
+ Property.internalTextBag(
+ PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "author");
+
+ /** When was the document created? */
+ Property CREATION_DATE =
+ Property.internalDate(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "creation-date");
+
+ /** When was the document last saved? */
+ Property SAVE_DATE =
+ Property.internalDate(
+ PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "save-date");
+
+ /** When was the document last printed? */
+ Property PRINT_DATE =
+ Property.internalDate(
+ PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "print-date");
+
+ /** The number of Slides in the (presentation) document */
+ Property SLIDE_COUNT =
+ Property.internalInteger(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "slide-count");
+
+ /** The number of Pages in the (paged) document */
+ Property PAGE_COUNT =
+ Property.internalInteger(
+ PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "page-count");
+
+ /** The number of individual Paragraphs in the document */
+ Property PARAGRAPH_COUNT =
+ Property.internalInteger(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "paragraph-count");
+
+ /** The number of lines in the document */
+ Property LINE_COUNT =
+ Property.internalInteger(
+ PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "line-count");
+
+ /** The number of Words in the document */
+ Property WORD_COUNT =
+ Property.internalInteger(
+ PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "word-count");
+
+ /** The number of Characters in the document */
+ Property CHARACTER_COUNT =
+ Property.internalInteger(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "character-count");
+
+ /** The number of Characters in the document, including spaces */
+ Property CHARACTER_COUNT_WITH_SPACES =
+ Property.internalInteger(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "character-count-with-spaces");
+
+ /** The number of Tables in the document */
+ Property TABLE_COUNT =
+ Property.internalInteger(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "table-count");
+
+ /** The number of Images in the document */
+ Property IMAGE_COUNT =
+ Property.internalInteger(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "image-count");
/**
- * Keywords pertaining to a document. Also populates {@link DublinCore#SUBJECT}.
+ * The number of Objects in the document. These are typically non-Image resources embedded in
+ * the document, such as other documents or non-Image media.
*/
- Property KEYWORDS = Property.composite(Property.internalTextBag(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "keyword"),
- new Property[]{DublinCore.SUBJECT,});
+ Property OBJECT_COUNT =
+ Property.internalInteger(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "object-count");
+
+ /** MAPI message class. What type of .msg/MAPI file is it? */
+ Property MAPI_MESSAGE_CLASS =
+ Property.internalClosedChoise(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "mapi-message-class",
+ "APPOINTMENT",
+ "CONTACT",
+ "NOTE",
+ "STICKY_NOTE",
+ "POST",
+ "TASK",
+ "UNKNOWN",
+ "UNSPECIFIED");
+
+ Property MAPI_SENT_BY_SERVER_TYPE =
+ Property.internalText(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "mapi-sent-by-server-type");
+
+ Property MAPI_FROM_REPRESENTING_NAME =
+ Property.internalText(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "mapi-from-representing-name");
+
+ Property MAPI_FROM_REPRESENTING_EMAIL =
+ Property.internalText(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "mapi-from-representing-email");
+
+ Property MAPI_MESSAGE_CLIENT_SUBMIT_TIME =
+ Property.internalDate(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "mapi-msg-client-submit-time");
/**
- * Name of the initial creator/author of a document
- */
- Property INITIAL_AUTHOR = Property.internalText(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "initial-author");
-
- /**
- * Name of the last (most recent) author of a document
- */
- Property LAST_AUTHOR = Property.internalText(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "last-author");
-
- /**
- * Name of the principal author(s) of a document
- */
- Property AUTHOR = Property.internalTextBag(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "author");
-
-
- /**
- * When was the document created?
- */
- Property CREATION_DATE = Property.internalDate(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "creation-date");
-
- /**
- * When was the document last saved?
- */
- Property SAVE_DATE = Property.internalDate(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "save-date");
-
- /**
- * When was the document last printed?
- */
- Property PRINT_DATE = Property.internalDate(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "print-date");
-
-
- /**
- * The number of Slides are there in the (presentation) document
- */
- Property SLIDE_COUNT = Property.internalInteger(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "slide-count");
-
- /**
- * The number of Pages are there in the (paged) document
- */
- Property PAGE_COUNT = Property.internalInteger(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "page-count");
-
- /**
- * The number of individual Paragraphs in the document
- */
- Property PARAGRAPH_COUNT = Property.internalInteger(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "paragraph-count");
-
- /**
- * The number of lines in the document
- */
- Property LINE_COUNT = Property.internalInteger(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "line-count");
-
- /**
- * The number of Words in the document
- */
- Property WORD_COUNT = Property.internalInteger(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "word-count");
-
- /**
- * The number of Characters in the document
- */
- Property CHARACTER_COUNT = Property.internalInteger(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "character-count");
-
- /**
- * The number of Characters in the document, including spaces
- */
- Property CHARACTER_COUNT_WITH_SPACES = Property.internalInteger(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "character-count-with-spaces");
-
- /**
- * The number of Tables in the document
- */
- Property TABLE_COUNT = Property.internalInteger(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "table-count");
-
- /**
- * The number of Images in the document
- */
- Property IMAGE_COUNT = Property.internalInteger(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "image-count");
-
- /**
- * The number of Objects in the document. These are typically non-Image resources
- * embedded in the document, such as other documents or non-Image media.
- */
- Property OBJECT_COUNT = Property.internalInteger(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "object-count");
-
- /**
- * MAPI message class. What type of .msg/MAPI file is it?
- */
- Property MAPI_MESSAGE_CLASS = Property.internalClosedChoise(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-message-class",
- "APPOINTMENT", "CONTACT", "NOTE", "STICKY_NOTE", "POST", "TASK", "UNKNOWN",
- "UNSPECIFIED");
-
- Property MAPI_SENT_BY_SERVER_TYPE = Property.internalText(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "mapi-sent-by-server-type");
-
- Property MAPI_FROM_REPRESENTING_NAME = Property.internalText(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "mapi-from-representing-name");
-
- Property MAPI_FROM_REPRESENTING_EMAIL = Property.internalText(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "mapi-from-representing-email");
-
- Property MAPI_MESSAGE_CLIENT_SUBMIT_TIME = Property.internalDate(
- PREFIX_DOC_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "mapi-msg-client-submit-time");
-
- /**
- * Embedded files may have a "progID" associated with them, such as
- * Word.Document.12 or AcroExch.Document.DC
+ * Embedded files may have a "progID" associated with them, such as Word.Document.12 or
+ * AcroExch.Document.DC
*/
Property PROG_ID = Property.internalText("msoffice:progID");
Property OCX_NAME = Property.internalText("msoffice:ocxName");
- Property MAPI_RECIPIENTS_STRING = Property.internalText(PREFIX_DOC_META +
- TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-recipients-string");
- Property MAPI_IMPORTANCE = Property.internalInteger(PREFIX_DOC_META +
- TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance");
- Property MAPI_PRIORTY = Property.internalInteger(PREFIX_DOC_META +
- TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance");
- Property MAPI_IS_FLAGGED = Property.internalBoolean(PREFIX_DOC_META +
- TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-is-flagged");
+ Property MAPI_RECIPIENTS_STRING =
+ Property.internalText(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "mapi-recipients-string");
+ Property MAPI_IMPORTANCE =
+ Property.internalInteger(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "mapi-importance");
+ Property MAPI_PRIORTY =
+ Property.internalInteger(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "mapi-importance");
+ Property MAPI_IS_FLAGGED =
+ Property.internalBoolean(
+ PREFIX_DOC_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "mapi-is-flagged");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java
index 1259719..5e3b321 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLCore.java
@@ -17,62 +17,58 @@
package org.apache.tika.metadata;
/**
- * Core properties as defined in the Office Open XML specification part Two that are not
- * in the DublinCore namespace.
- * There is also a keyword property definition in the specification which is omitted here,
- * because Tika should stick to the DublinCore/IPTC definition.
+ * Core properties as defined in the Office Open XML specification part Two that are not in the
+ * DublinCore namespace. There is also a keyword property definition in the specification which is
+ * omitted here, because Tika should stick to the DublinCore/IPTC definition.
*
- * @see <a href="http://www.iso.org/iso/iso_catalogue/catalogue_tc/catalogue_detail.htm?csnumber=59575"
- * >ISO document of Office Open XML specification</a>
- * @see <a href="http://www.ecma-international.org/publications/standards/Ecma-376.htm"
- * >ECMA document of Office Open XML specification</a>
+ * @see <a
+ * href="http://www.iso.org/iso/iso_catalogue/catalogue_tc/catalogue_detail.htm?csnumber=59575"
+ * >ISO document of Office Open XML specification</a>
+ * @see <a href="http://www.ecma-international.org/publications/standards/Ecma-376.htm" >ECMA
+ * document of Office Open XML specification</a>
*/
public interface OfficeOpenXMLCore {
String NAMESPACE_URI =
"http://schemas.openxmlformats.org/package/2006/metadata/core-properties/";
String PREFIX = "cp";
- /**
- * A categorization of the content of this package.
- */
- Property CATEGORY = Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "category");
+ /** A categorization of the content of this package. */
+ Property CATEGORY =
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "category");
- /**
- * The status of the content.
- */
- Property CONTENT_STATUS = Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contentStatus");
+ /** The status of the content. */
+ Property CONTENT_STATUS =
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "contentStatus");
- /**
- * The user who performed the last modification. The identification is environment-specific.
- */
- Property LAST_MODIFIED_BY = Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastModifiedBy");
+ /** The user who performed the last modification. The identification is environment-specific. */
+ Property LAST_MODIFIED_BY =
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastModifiedBy");
- /**
- * The date and time of the last printing.
- */
- Property LAST_PRINTED = Property.externalDate(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastPrinted");
+ /** The date and time of the last printing. */
+ Property LAST_PRINTED =
+ Property.externalDate(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "lastPrinted");
- /**
- * The revision number.
- */
- Property REVISION = Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "revision");
+ /** The revision number. */
+ Property REVISION =
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "revision");
- /**
- * The version number. This value is set by the user or by the application.
- */
- Property VERSION = Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "version");
+ /** The version number. This value is set by the user or by the application. */
+ Property VERSION =
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "version");
- /**
- * The document's subject. Also populates {@link DublinCore#SUBJECT}
- */
+ /** The document's subject. Also populates {@link DublinCore#SUBJECT} */
@Deprecated
- Property SUBJECT = Property.composite(Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"),
- new Property[]{DublinCore.SUBJECT,});
+ Property SUBJECT =
+ Property.composite(
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "subject"),
+ new Property[] {
+ DublinCore.SUBJECT,
+ });
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
index 6919c21..8bd6a00 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/OfficeOpenXMLExtended.java
@@ -17,16 +17,16 @@
package org.apache.tika.metadata;
/**
- * Extended properties as defined in the Office Open XML specification part Four.
- * Those properties are omitted which have equivalent properties defined in the ODF
- * namespace like "word count".
- * Also not all properties from the specification are defined here, yet. Only those which have
- * been in use by the parsers so far.
+ * Extended properties as defined in the Office Open XML specification part Four. Those properties
+ * are omitted which have equivalent properties defined in the ODF namespace like "word count". Also
+ * not all properties from the specification are defined here, yet. Only those which have been in
+ * use by the parsers so far.
*
- * @see <a href="http://www.iso.org/iso/iso_catalogue/catalogue_tc/catalogue_detail.htm?csnumber=59575"
- * >ISO document of Office Open XML specification</a>
- * @see <a href="http://www.ecma-international.org/publications/standards/Ecma-376.htm"
- * >ECMA document of Office Open XML specification</a>
+ * @see <a
+ * href="http://www.iso.org/iso/iso_catalogue/catalogue_tc/catalogue_detail.htm?csnumber=59575"
+ * >ISO document of Office Open XML specification</a>
+ * @see <a href="http://www.ecma-international.org/publications/standards/Ecma-376.htm" >ECMA
+ * document of Office Open XML specification</a>
*/
public interface OfficeOpenXMLExtended {
String NAMESPACE_URI =
@@ -42,42 +42,60 @@
String SECURITY_LOCKED_FOR_ANNOTATIONS = "LockedForAnnotations";
String SECURITY_UNKNOWN = "Unknown";
- Property TEMPLATE = Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Template");
+ Property TEMPLATE =
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Template");
- Property MANAGER = Property.externalTextBag(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Manager");
+ Property MANAGER =
+ Property.externalTextBag(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Manager");
- Property COMPANY = Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Company");
+ Property COMPANY =
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Company");
- Property PRESENTATION_FORMAT = Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PresentationFormat");
+ Property PRESENTATION_FORMAT =
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "PresentationFormat");
- Property NOTES = Property.externalInteger(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Notes");
+ Property NOTES =
+ Property.externalInteger(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Notes");
- Property TOTAL_TIME = Property.externalInteger(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "TotalTime");
+ Property TOTAL_TIME =
+ Property.externalInteger(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "TotalTime");
- Property HIDDEN_SLIDES = Property.externalInteger(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "HiddedSlides");
+ Property HIDDEN_SLIDES =
+ Property.externalInteger(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "HiddedSlides");
- Property APPLICATION = Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Application");
+ Property APPLICATION =
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Application");
- Property APP_VERSION = Property.externalText(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AppVersion");
- //Integer flag
- Property DOC_SECURITY = Property.externalInteger(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurity");
+ Property APP_VERSION =
+ Property.externalText(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AppVersion");
+ // Integer flag
+ Property DOC_SECURITY =
+ Property.externalInteger(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurity");
- //Human readable string explaining doc security flag
- Property DOC_SECURITY_STRING = Property.externalClosedChoise(
- PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurityString",
- SECURITY_NONE, SECURITY_PASSWORD_PROTECTED, SECURITY_READ_ONLY_RECOMMENDED,
- SECURITY_READ_ONLY_ENFORCED, SECURITY_LOCKED_FOR_ANNOTATIONS, SECURITY_UNKNOWN);
+ // Human readable string explaining doc security flag
+ Property DOC_SECURITY_STRING =
+ Property.externalClosedChoise(
+ PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DocSecurityString",
+ SECURITY_NONE,
+ SECURITY_PASSWORD_PROTECTED,
+ SECURITY_READ_ONLY_RECOMMENDED,
+ SECURITY_READ_ONLY_ENFORCED,
+ SECURITY_LOCKED_FOR_ANNOTATIONS,
+ SECURITY_UNKNOWN);
- Property COMMENTS = Property.externalTextBag(
- WORD_PROCESSING_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Comments");
+ Property COMMENTS =
+ Property.externalTextBag(
+ WORD_PROCESSING_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "Comments");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
index b15c103..475af4f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PDF.java
@@ -27,20 +27,18 @@
String PDFA_PREFIX = "pdfa" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
String PDFAID_PREFIX = "pdfaid" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
-
/**
- * Number of %%EOF as extracted by the StartXRefScanner. See
- * that class for limitations.
+ * Number of %%EOF as extracted by the StartXRefScanner. See that class for limitations.
*
- * This includes the final %%EOF, which may or may not be at the literal
- * end of the file. This does not include an %%EOF
- * if the startxref=0, as would happen in a dummy %%EOF in a linearized PDF.
+ * <p>This includes the final %%EOF, which may or may not be at the literal end of the file.
+ * This does not include an %%EOF if the startxref=0, as would happen in a dummy %%EOF in a
+ * linearized PDF.
*/
Property EOF_OFFSETS = Property.externalRealSeq(PDF_PREFIX + "eofOffsets");
/**
- * Prefix to be used for properties that record what was stored
- * in the docinfo section (as opposed to XMP)
+ * Prefix to be used for properties that record what was stored in the docinfo section (as
+ * opposed to XMP)
*/
String PDF_DOC_INFO_PREFIX =
PDF_PREFIX + "docinfo" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
@@ -92,17 +90,14 @@
Property PRODUCER = Property.internalText(PDF_PREFIX + "producer");
/**
- * This specifies where an action or destination would be found/triggered
- * in the document: on document open, before close, etc.
+ * This specifies where an action or destination would be found/triggered in the document: on
+ * document open, before close, etc.
*
- * This is included in the embedded document (js only for now?), not the container PDF.
+ * <p>This is included in the embedded document (js only for now?), not the container PDF.
*/
Property ACTION_TRIGGER = Property.internalText(PDF_PREFIX + "actionTrigger");
- /**
- * This is a list of all action or destination triggers contained
- * within a given PDF.
- */
+ /** This is a list of all action or destination triggers contained within a given PDF. */
Property ACTION_TRIGGERS = Property.internalTextBag(PDF_PREFIX + "actionTriggers");
Property ACTION_TYPES = Property.internalTextBag(PDF_PREFIX + "actionTypes");
@@ -118,101 +113,81 @@
Property OVERALL_PERCENTAGE_UNMAPPED_UNICODE_CHARS =
Property.internalReal(PDF_PREFIX + "overallPercentageUnmappedUnicodeChars");
- /**
- * Contains at least one damaged font for at least one character
- */
- Property CONTAINS_DAMAGED_FONT =
- Property.internalBoolean(PDF_PREFIX + "containsDamagedFont");
+ /** Contains at least one damaged font for at least one character */
+ Property CONTAINS_DAMAGED_FONT = Property.internalBoolean(PDF_PREFIX + "containsDamagedFont");
- /**
- * Contains at least one font that is not embedded
- */
+ /** Contains at least one font that is not embedded */
Property CONTAINS_NON_EMBEDDED_FONT =
Property.internalBoolean(PDF_PREFIX + "containsNonEmbeddedFont");
- /**
- * Has XFA
- */
+ /** Has XFA */
Property HAS_XFA = Property.internalBoolean(PDF_PREFIX + "hasXFA");
- /**
- * Has XMP, whether or not it is valid
- */
+ /** Has XMP, whether or not it is valid */
Property HAS_XMP = Property.internalBoolean(PDF_PREFIX + "hasXMP");
/**
- * If xmp is extracted by, e.g. the XMLProfiler, where did it come from?
- * The document's document catalog or a specific page...or?
+ * If xmp is extracted by, e.g. the XMLProfiler, where did it come from? The document's document
+ * catalog or a specific page...or?
*/
Property XMP_LOCATION = Property.internalText(PDF_PREFIX + "xmpLocation");
- /**
- * Has > 0 AcroForm fields
- */
+ /** Has > 0 AcroForm fields */
Property HAS_ACROFORM_FIELDS = Property.internalBoolean(PDF_PREFIX + "hasAcroFormFields");
Property HAS_MARKED_CONTENT = Property.internalBoolean(PDF_PREFIX + "hasMarkedContent");
- /**
- * Has a collection element in the root. If true, this is likely a PDF Portfolio.
- */
+ /** Has a collection element in the root. If true, this is likely a PDF Portfolio. */
Property HAS_COLLECTION = Property.internalBoolean(PDF_PREFIX + "hasCollection");
- Property EMBEDDED_FILE_DESCRIPTION = Property.externalText(PDF_PREFIX +
- "embeddedFileDescription");
+ Property EMBEDDED_FILE_DESCRIPTION =
+ Property.externalText(PDF_PREFIX + "embeddedFileDescription");
+
+ /** If the file came from an annotation and there was a type */
+ Property EMBEDDED_FILE_ANNOTATION_TYPE =
+ Property.internalText(PDF_PREFIX + "embeddedFileAnnotationType");
/**
- * If the file came from an annotation and there was a type
+ * literal string from the PDEmbeddedFile#getSubtype(), should be what the PDF alleges is the
+ * embedded file's mime type
*/
- Property EMBEDDED_FILE_ANNOTATION_TYPE = Property.internalText(PDF_PREFIX +
- "embeddedFileAnnotationType");
+ Property EMBEDDED_FILE_SUBTYPE = Property.internalText(PDF_PREFIX + "embeddedFileSubtype");
- /**
- * literal string from the PDEmbeddedFile#getSubtype(), should be what the PDF
- * alleges is the embedded file's mime type
- */
- Property EMBEDDED_FILE_SUBTYPE = Property.internalText(PDF_PREFIX +
- "embeddedFileSubtype");
- /**
- * If the PDF has an annotation of type 3D
- */
+ /** If the PDF has an annotation of type 3D */
Property HAS_3D = Property.internalBoolean(PDF_PREFIX + "has3D");
Property ANNOTATION_TYPES = Property.internalTextBag(PDF_PREFIX + "annotationTypes");
Property ANNOTATION_SUBTYPES = Property.internalTextBag(PDF_PREFIX + "annotationSubtypes");
- /**
- * Number of 3D annotations a PDF contains. This makes {@link PDF#HAS_3D} redundant.
- */
+ /** Number of 3D annotations a PDF contains. This makes {@link PDF#HAS_3D} redundant. */
Property NUM_3D_ANNOTATIONS = Property.internalInteger(PDF_PREFIX + "num3DAnnotations");
- Property ASSOCIATED_FILE_RELATIONSHIP = Property.internalText(PDF_PREFIX +
- "associatedFileRelationship");
+ Property ASSOCIATED_FILE_RELATIONSHIP =
+ Property.internalText(PDF_PREFIX + "associatedFileRelationship");
+
/**
- * This is a zero-based number for incremental updates within a PDF -- 0 is the first
- * update, 1 is the second, etc. The final version of the PDF (e.g. the last update)
- * does not have an incremental update number.
+ * This is a zero-based number for incremental updates within a PDF -- 0 is the first update, 1
+ * is the second, etc. The final version of the PDF (e.g. the last update) does not have an
+ * incremental update number.
*
- * This value is populated with the parse incremental updates feature is selected
- * in the PDFParser.
+ * <p>This value is populated when the parse incremental updates feature is selected in the
+ * PDFParser.
*/
Property INCREMENTAL_UPDATE_NUMBER =
- Property.composite(Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"),
- new Property[]{ TikaCoreProperties.VERSION_NUMBER });
+ Property.composite(
+ Property.internalInteger(PDF_PREFIX + "incrementalUpdateNumber"),
+ new Property[] {TikaCoreProperties.VERSION_NUMBER});
- /**
- * Incremental updates as extracted by the StartXRefScanner. See
- * that class for limitations.
- */
+ /** Incremental updates as extracted by the StartXRefScanner. See that class for limitations. */
Property PDF_INCREMENTAL_UPDATE_COUNT =
- Property.composite( Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
- new Property[]{ TikaCoreProperties.VERSION_COUNT });
+ Property.composite(
+ Property.externalInteger(PDF_PREFIX + "incrementalUpdateCount"),
+ new Property[] {TikaCoreProperties.VERSION_COUNT});
/**
- * This counts the number of pages that would have been OCR'd or were OCR'd depending
- * on the OCR settings. If NO_OCR is selected, this will
+ * This counts the number of pages that would have been OCR'd or were OCR'd depending on the OCR
+ * settings. If NO_OCR is selected, this will be the number of pages that would have been
+ * OCR'd had OCR been enabled (TODO: confirm intended wording — original sentence was truncated).
*/
Property OCR_PAGE_COUNT = Property.externalInteger(PDF_PREFIX + "ocrPageCount");
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java b/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
index 4ba7909..944ba97 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PagedText.java
@@ -17,22 +17,19 @@
package org.apache.tika.metadata;
/**
- * XMP Paged-text schema. This is a collection of
- * {@link Property property definition} constants for the paged text
- * properties defined in the XMP standard.
+ * XMP Paged-text schema. This is a collection of {@link Property property definition} constants for
+ * the paged text properties defined in the XMP standard.
*
- * @see <a href="http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/xmp/pdfs/cc-201306/XMPSpecificationPart2.pdf"
- * >XMP Specification, Part 2: Standard Schemas</a>
+ * @see <a
+ * href="http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/xmp/pdfs/cc-201306/XMPSpecificationPart2.pdf"
+ * >XMP Specification, Part 2: Standard Schemas</a>
* @since Apache Tika 0.8
*/
public interface PagedText {
- /**
- * "The number of pages in the document (including any in contained
- * documents)."
- */
+ /** "The number of pages in the document (including any in contained documents)." */
Property N_PAGES = Property.internalInteger("xmpTPg:NPages");
- //TODO MaxPageSize, Fonts, Colorants, PlateNames
+ // TODO MaxPageSize, Fonts, Colorants, PlateNames
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java b/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java
index af4abab..1b8036c 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Photoshop.java
@@ -22,12 +22,12 @@
/**
* XMP Photoshop metadata schema.
- * <p>
- * A collection of property constants for the
- * Photo Metadata properties defined in the XMP Photoshop
- * standard.
*
- * @see <a href="http://partners.adobe.com/public/developer/en/xmp/sdk/XMPspecification.pdf">XMP Photoshop</a>
+ * <p>A collection of property constants for the Photo Metadata properties defined in the XMP
+ * Photoshop standard.
+ *
+ * @see <a href="http://partners.adobe.com/public/developer/en/xmp/sdk/XMPspecification.pdf">XMP
+ * Photoshop</a>
* @since Apache Tika 1.2
*/
public interface Photoshop {
@@ -35,57 +35,92 @@
String NAMESPACE_URI_PHOTOSHOP = "http://ns.adobe.com/photoshop/1.0/";
String PREFIX_PHOTOSHOP = "photoshop";
- Property AUTHORS_POSITION = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "AuthorsPosition");
+ Property AUTHORS_POSITION =
+ Property.internalText(
+ PREFIX_PHOTOSHOP
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "AuthorsPosition");
// TODO Replace this with proper indexed choices support
- String[] _COLOR_MODE_CHOICES_INDEXED =
- {"Bitmap", "Greyscale", "Indexed Colour", "RGB Color", "CMYK Colour", "Multi-Channel",
- "Duotone", "LAB Colour", "reserved", "reserved", "YCbCr Colour", "YCgCo Colour",
- "YCbCrK Colour"};
- Property COLOR_MODE = Property.internalClosedChoise(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ColorMode",
- _COLOR_MODE_CHOICES_INDEXED);
+ String[] _COLOR_MODE_CHOICES_INDEXED = {
+ "Bitmap",
+ "Greyscale",
+ "Indexed Colour",
+ "RGB Color",
+ "CMYK Colour",
+ "Multi-Channel",
+ "Duotone",
+ "LAB Colour",
+ "reserved",
+ "reserved",
+ "YCbCr Colour",
+ "YCgCo Colour",
+ "YCbCrK Colour"
+ };
+ Property COLOR_MODE =
+ Property.internalClosedChoise(
+ PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "ColorMode",
+ _COLOR_MODE_CHOICES_INDEXED);
- Property CAPTION_WRITER = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "CaptionWriter");
+ Property CAPTION_WRITER =
+ Property.internalText(
+ PREFIX_PHOTOSHOP
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "CaptionWriter");
- Property CATEGORY = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Category");
+ Property CATEGORY =
+ Property.internalText(
+ PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Category");
- Property CITY = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "City");
+ Property CITY =
+ Property.internalText(
+ PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "City");
- Property COUNTRY = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Country");
+ Property COUNTRY =
+ Property.internalText(
+ PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Country");
- Property CREDIT = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Credit");
+ Property CREDIT =
+ Property.internalText(
+ PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Credit");
- Property DATE_CREATED = Property.internalDate(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "DateCreated");
+ Property DATE_CREATED =
+ Property.internalDate(
+ PREFIX_PHOTOSHOP
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "DateCreated");
- Property HEADLINE = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Headline");
+ Property HEADLINE =
+ Property.internalText(
+ PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Headline");
- Property INSTRUCTIONS = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Instructions");
+ Property INSTRUCTIONS =
+ Property.internalText(
+ PREFIX_PHOTOSHOP
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "Instructions");
- Property SOURCE = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Source");
+ Property SOURCE =
+ Property.internalText(
+ PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Source");
- Property STATE = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "State");
+ Property STATE =
+ Property.internalText(
+ PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "State");
- Property SUPPLEMENTAL_CATEGORIES = Property.internalTextBag(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "SupplementalCategories");
+ Property SUPPLEMENTAL_CATEGORIES =
+ Property.internalTextBag(
+ PREFIX_PHOTOSHOP
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "SupplementalCategories");
- Property TRANSMISSION_REFERENCE = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "TransmissionReference");
+ Property TRANSMISSION_REFERENCE =
+ Property.internalText(
+ PREFIX_PHOTOSHOP
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "TransmissionReference");
- Property URGENCY = Property.internalText(
- PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Urgency");
-
+ Property URGENCY =
+ Property.internalText(
+ PREFIX_PHOTOSHOP + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Urgency");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Property.java b/tika-core/src/main/java/org/apache/tika/metadata/Property.java
index 3d67141..a40f840 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Property.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Property.java
@@ -26,10 +26,9 @@
import java.util.concurrent.ConcurrentHashMap;
/**
- * XMP property definition. Each instance of this class defines a single
- * metadata property like "dc:format". In addition to the property name,
- * the {@link ValueType value type} and category (internal or external)
- * of the property are included in the property definition. The available
+ * XMP property definition. Each instance of this class defines a single metadata property like
+ * "dc:format". In addition to the property name, the {@link ValueType value type} and category
+ * (internal or external) of the property are included in the property definition. The available
* choice values are also stored for open and closed choice value types.
*
* @since Apache Tika 0.7
@@ -43,21 +42,25 @@
private final ValueType valueType;
private final Property primaryProperty;
private final Property[] secondaryExtractProperties;
- /**
- * The available choices for the open and closed choice value types.
- */
+
+ /** The available choices for the open and closed choice value types. */
private final Set<String> choices;
- private Property(String name, boolean internal, PropertyType propertyType, ValueType valueType,
- String[] choices, Property primaryProperty,
- Property[] secondaryExtractProperties) {
+ private Property(
+ String name,
+ boolean internal,
+ PropertyType propertyType,
+ ValueType valueType,
+ String[] choices,
+ Property primaryProperty,
+ Property[] secondaryExtractProperties) {
this.name = name;
this.internal = internal;
this.propertyType = propertyType;
this.valueType = valueType;
if (choices != null) {
- this.choices = Collections
- .unmodifiableSet(new HashSet<>(Arrays.asList(choices.clone())));
+ this.choices =
+ Collections.unmodifiableSet(new HashSet<>(Arrays.asList(choices.clone())));
} else {
this.choices = null;
}
@@ -76,8 +79,12 @@
}
}
- private Property(String name, boolean internal, PropertyType propertyType, ValueType valueType,
- String[] choices) {
+ private Property(
+ String name,
+ boolean internal,
+ PropertyType propertyType,
+ ValueType valueType,
+ String[] choices) {
this(name, internal, propertyType, valueType, choices, null, null);
}
@@ -89,8 +96,8 @@
this(name, internal, PropertyType.SIMPLE, valueType, null);
}
- private Property(String name, boolean internal, PropertyType propertyType,
- ValueType valueType) {
+ private Property(
+ String name, boolean internal, PropertyType propertyType, ValueType valueType) {
this(name, internal, propertyType, valueType, null);
}
@@ -222,16 +229,16 @@
/**
* Constructs a new composite property from the given primary and array of secondary properties.
- * <p>
- * Note that name of the composite property is taken from its primary property,
- * and primary and secondary properties must not be composite properties themselves.
+ *
+ * <p>Note that name of the composite property is taken from its primary property, and primary
+ * and secondary properties must not be composite properties themselves.
*
* @param primaryProperty
* @param secondaryExtractProperties
* @return the composite property
*/
- public static Property composite(Property primaryProperty,
- Property[] secondaryExtractProperties) {
+ public static Property composite(
+ Property primaryProperty, Property[] secondaryExtractProperties) {
if (primaryProperty == null) {
throw new NullPointerException("primaryProperty must not be null");
}
@@ -249,8 +256,13 @@
if (primaryProperty.getChoices() != null) {
choices = primaryProperty.getChoices().toArray(new String[0]);
}
- return new Property(primaryProperty.getName(), primaryProperty.isInternal(),
- PropertyType.COMPOSITE, ValueType.PROPERTY, choices, primaryProperty,
+ return new Property(
+ primaryProperty.getName(),
+ primaryProperty.isInternal(),
+ PropertyType.COMPOSITE,
+ ValueType.PROPERTY,
+ choices,
+ primaryProperty,
secondaryExtractProperties);
}
@@ -266,12 +278,11 @@
return !internal;
}
- /**
- * Is the PropertyType one which accepts multiple values?
- */
+ /** Is the PropertyType one which accepts multiple values? */
public boolean isMultiValuePermitted() {
- if (propertyType == PropertyType.BAG || propertyType == PropertyType.SEQ ||
- propertyType == PropertyType.ALT) {
+ if (propertyType == PropertyType.BAG
+ || propertyType == PropertyType.SEQ
+ || propertyType == PropertyType.ALT) {
return true;
} else if (propertyType == PropertyType.COMPOSITE) {
// Base it on the primary property's behaviour
@@ -289,9 +300,9 @@
}
/**
- * Returns the (immutable) set of choices for the values of this property.
- * Only defined for {@link ValueType#OPEN_CHOICE open} and
- * {@link ValueType#CLOSED_CHOICE closed choice} value types.
+ * Returns the (immutable) set of choices for the values of this property. Only defined for
+ * {@link ValueType#OPEN_CHOICE open} and {@link ValueType#CLOSED_CHOICE closed choice} value
+ * types.
*
* @return available choices, or <code>null</code>
*/
@@ -325,40 +336,43 @@
return o instanceof Property && name.equals(((Property) o).name);
}
- //----------------------------------------------------------< Comparable >
+ // ----------------------------------------------------------< Comparable >
public int hashCode() {
return name.hashCode();
}
- //--------------------------------------------------------------< Object >
+ // --------------------------------------------------------------< Object >
public enum PropertyType {
- /**
- * A single value
- */
- SIMPLE, STRUCTURE,
- /**
- * An un-ordered array
- */
+ /** A single value */
+ SIMPLE,
+ STRUCTURE,
+ /** An un-ordered array */
BAG,
- /**
- * An ordered array
- */
+ /** An ordered array */
SEQ,
- /**
- * An ordered array with some sort of criteria
- */
+ /** An ordered array with some sort of criteria */
ALT,
- /**
- * Multiple child properties
- */
+ /** Multiple child properties */
COMPOSITE
}
public enum ValueType {
- BOOLEAN, OPEN_CHOICE, CLOSED_CHOICE, DATE, INTEGER, LOCALE, MIME_TYPE, PROPER_NAME,
- RATIONAL, REAL, TEXT, URI, URL, XPATH, PROPERTY
+ BOOLEAN,
+ OPEN_CHOICE,
+ CLOSED_CHOICE,
+ DATE,
+ INTEGER,
+ LOCALE,
+ MIME_TYPE,
+ PROPER_NAME,
+ RATIONAL,
+ REAL,
+ TEXT,
+ URI,
+ URL,
+ XPATH,
+ PROPERTY
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java b/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java
index ff1f926..29947e6 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/PropertyTypeException.java
@@ -19,11 +19,10 @@
import org.apache.tika.metadata.Property.PropertyType;
import org.apache.tika.metadata.Property.ValueType;
-
/**
- * XMP property definition violation exception. This is thrown when
- * you try to set a {@link Property} value with an incorrect type,
- * such as storing an Integer when the property is of type Date.
+ * XMP property definition violation exception. This is thrown when you try to set a {@link
+ * Property} value with an incorrect type, such as storing an Integer when the property is of type
+ * Date.
*
* @since Apache Tika 0.8
*/
@@ -42,9 +41,10 @@
}
public PropertyTypeException(PropertyType unsupportedPropertyType) {
- super((unsupportedPropertyType != PropertyType.COMPOSITE) ?
- unsupportedPropertyType + " is not supported" :
- "Composite Properties must not include other Composite" +
- " Properties as either Primary or Secondary");
+ super(
+ (unsupportedPropertyType != PropertyType.COMPOSITE)
+ ? unsupportedPropertyType + " is not supported"
+ : "Composite Properties must not include other Composite"
+ + " Properties as either Primary or Secondary");
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java b/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java
index 0663488..c056bb6 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/QuattroPro.java
@@ -26,27 +26,31 @@
public interface QuattroPro {
String QUATTROPRO_METADATA_NAME_PREFIX = "wordperfect";
- /**
- * ID.
- */
- Property ID = Property.internalText(
- QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "Id");
- /**
- * Version.
- */
- Property VERSION = Property.internalInteger(
- QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "Version");
- /**
- * Build.
- */
- Property BUILD = Property.internalInteger(
- QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "Build");
- /**
- * Lowest version.
- */
- Property LOWEST_VERSION = Property.internalInteger(
- QUATTROPRO_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "LowestVersion");
+ /** ID. */
+ Property ID =
+ Property.internalText(
+ QUATTROPRO_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "Id");
+
+ /** Version. */
+ Property VERSION =
+ Property.internalInteger(
+ QUATTROPRO_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "Version");
+
+ /** Build. */
+ Property BUILD =
+ Property.internalInteger(
+ QUATTROPRO_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "Build");
+
+ /** Lowest version. */
+ Property LOWEST_VERSION =
+ Property.internalInteger(
+ QUATTROPRO_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "LowestVersion");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java b/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java
index e4572e3..de0d45a 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java
@@ -1,47 +1,52 @@
package org.apache.tika.metadata; /*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
public interface RTFMetadata {
String PREFIX_RTF_META = "rtf_meta";
-
String RTF_PICT_META_PREFIX = "rtf_pict:";
/**
- * if set to true, this means that an image file is probably a "thumbnail"
- * any time a pict/emf/wmf is in an object
+ * if set to true, this means that an image file is probably a "thumbnail" any time a
+ * pict/emf/wmf is in an object
*/
- Property THUMBNAIL = Property.internalBoolean(
- PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "thumbnail");
+ Property THUMBNAIL =
+ Property.internalBoolean(
+ PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "thumbnail");
/**
- * if an application and version is given as part of the
- * embedded object, this is the literal string
+ * if an application and version is given as part of the embedded object, this is the literal
+ * string
*/
- Property EMB_APP_VERSION = Property.internalText(
- PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_app_version");
+ Property EMB_APP_VERSION =
+ Property.internalText(
+ PREFIX_RTF_META
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "emb_app_version");
- Property EMB_CLASS = Property.internalText(
- PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_class");
+ Property EMB_CLASS =
+ Property.internalText(
+ PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_class");
- Property EMB_TOPIC = Property.internalText(
- PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_topic");
+ Property EMB_TOPIC =
+ Property.internalText(
+ PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_topic");
- Property EMB_ITEM = Property.internalText(
- PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_item");
-
+ Property EMB_ITEM =
+ Property.internalText(
+ PREFIX_RTF_META + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "emb_item");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java b/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
index fe5fd0e..b8d2992 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TIFF.java
@@ -17,112 +17,75 @@
package org.apache.tika.metadata;
/**
- * XMP Exif TIFF schema. This is a collection of
- * {@link Property property definition} constants for the Exif TIFF
- * properties defined in the XMP standard.
+ * XMP Exif TIFF schema. This is a collection of {@link Property property definition} constants for
+ * the Exif TIFF properties defined in the XMP standard.
*
- * @see <a href="http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/xmp/pdfs/cc-201306/XMPSpecificationPart2.pdf"
- * >XMP Specification, Part 2: Standard Schemas</a>
+ * @see <a
+ * href="http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/xmp/pdfs/cc-201306/XMPSpecificationPart2.pdf"
+ * >XMP Specification, Part 2: Standard Schemas</a>
* @since Apache Tika 0.8
*/
public interface TIFF {
- /**
- * "Number of bits per component in each channel."
- */
+ /** "Number of bits per component in each channel." */
Property BITS_PER_SAMPLE = Property.internalIntegerSequence("tiff:BitsPerSample");
- /**
- * "Image height in pixels."
- */
+ /** "Image height in pixels." */
Property IMAGE_LENGTH = Property.internalInteger("tiff:ImageLength");
- /**
- * "Image width in pixels."
- */
+ /** "Image width in pixels." */
Property IMAGE_WIDTH = Property.internalInteger("tiff:ImageWidth");
- /**
- * "Number of components per pixel."
- */
+ /** "Number of components per pixel." */
Property SAMPLES_PER_PIXEL = Property.internalInteger("tiff:SamplesPerPixel");
- /**
- * Did the Flash fire when taking this image?
- */
+ /** Did the Flash fire when taking this image? */
Property FLASH_FIRED = Property.internalBoolean("exif:Flash");
- /**
- * "Exposure time in seconds."
- */
+ /** "Exposure time in seconds." */
Property EXPOSURE_TIME = Property.internalRational("exif:ExposureTime");
/**
- * "F-Number."
- * The f-number is the focal length divided by the "effective" aperture
- * diameter. It is a dimensionless number that is a measure of lens speed.
+ * "F-Number." The f-number is the focal length divided by the "effective" aperture diameter. It
+ * is a dimensionless number that is a measure of lens speed.
*/
Property F_NUMBER = Property.internalRational("exif:FNumber");
- /**
- * "Focal length of the lens, in millimeters."
- */
+ /** "Focal length of the lens, in millimeters." */
Property FOCAL_LENGTH = Property.internalRational("exif:FocalLength");
- /**
- * "ISO Speed and ISO Latitude of the input device as specified in ISO 12232"
- */
+ /** "ISO Speed and ISO Latitude of the input device as specified in ISO 12232" */
Property ISO_SPEED_RATINGS = Property.internalIntegerSequence("exif:IsoSpeedRatings");
- /**
- * "Manufacturer of the recording equipment."
- */
+ /** "Manufacturer of the recording equipment." */
Property EQUIPMENT_MAKE = Property.internalText("tiff:Make");
- /**
- * "Model name or number of the recording equipment."
- */
+ /** "Model name or number of the recording equipment." */
Property EQUIPMENT_MODEL = Property.internalText("tiff:Model");
- /**
- * "Software or firmware used to generate the image."
- */
+ /** "Software or firmware used to generate the image." */
Property SOFTWARE = Property.internalText("tiff:Software");
/**
- * "The Orientation of the image."
- * 1 = 0th row at top, 0th column at left
- * 2 = 0th row at top, 0th column at right
- * 3 = 0th row at bottom, 0th column at right
- * 4 = 0th row at bottom, 0th column at left
- * 5 = 0th row at left, 0th column at top
- * 6 = 0th row at right, 0th column at top
- * 7 = 0th row at right, 0th column at bottom
- * 8 = 0th row at left, 0th column at bottom
+ * "The Orientation of the image." 1 = 0th row at top, 0th column at left 2 = 0th row at top,
+ * 0th column at right 3 = 0th row at bottom, 0th column at right 4 = 0th row at bottom, 0th
+ * column at left 5 = 0th row at left, 0th column at top 6 = 0th row at right, 0th column at top
+ * 7 = 0th row at right, 0th column at bottom 8 = 0th row at left, 0th column at bottom
*/
Property ORIENTATION =
- Property.internalClosedChoise("tiff:Orientation", "1", "2", "3", "4", "5", "6", "7",
- "8");
+ Property.internalClosedChoise(
+ "tiff:Orientation", "1", "2", "3", "4", "5", "6", "7", "8");
- /**
- * "Horizontal resolution in pixels per unit."
- */
+ /** "Horizontal resolution in pixels per unit." */
Property RESOLUTION_HORIZONTAL = Property.internalRational("tiff:XResolution");
- /**
- * "Vertical resolution in pixels per unit."
- */
+ /** "Vertical resolution in pixels per unit." */
Property RESOLUTION_VERTICAL = Property.internalRational("tiff:YResolution");
- /**
- * "Units used for Horizontal and Vertical Resolutions."
- * One of "Inch" or "cm"
- */
+ /** "Units used for Horizontal and Vertical Resolutions." One of "Inch" or "cm" */
Property RESOLUTION_UNIT = Property.internalClosedChoise("tiff:ResolutionUnit", "Inch", "cm");
- /**
- * "Date and time when original image was generated"
- */
+ /** "Date and time when original image was generated" */
Property ORIGINAL_DATE = Property.internalDate("exif:DateTimeOriginal");
Property EXIF_PAGE_COUNT = Property.externalInteger("exif:PageCount");
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index effa4a6..54a7677 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -17,229 +17,232 @@
package org.apache.tika.metadata;
/**
- * Contains a core set of basic Tika metadata properties, which all parsers
- * will attempt to supply (where the file format permits). These are all
- * defined in terms of other standard namespaces.
- * <p>
- * Users of Tika who wish to have consistent metadata across file formats
- * can make use of these Properties, knowing that where present they will
- * have consistent semantic meaning between different file formats. (No
- * matter if one file format calls it Title, another Long-Title and another
- * Long-Name, if they all mean the same thing as defined by
- * {@link DublinCore#TITLE} then they will all be present as such)
- * <p>
- * For now, most of these properties are composite ones including the deprecated
- * non-prefixed String properties from the Metadata class. In Tika 2.0, most
- * of these will revert back to simple assignments.
+ * Contains a core set of basic Tika metadata properties, which all parsers will attempt to supply
+ * (where the file format permits). These are all defined in terms of other standard namespaces.
+ *
+ * <p>Users of Tika who wish to have consistent metadata across file formats can make use of these
+ * Properties, knowing that where present they will have consistent semantic meaning between
+ * different file formats. (No matter if one file format calls it Title, another Long-Title and
+ * another Long-Name, if they all mean the same thing as defined by {@link DublinCore#TITLE} then
+ * they will all be present as such)
+ *
+ * <p>For now, most of these properties are composite ones including the deprecated non-prefixed
+ * String properties from the Metadata class. In Tika 2.0, most of these will revert back to simple
+ * assignments.
*
* @since Apache Tika 1.2
*/
@SuppressWarnings("deprecation")
public interface TikaCoreProperties {
- /**
- * The common delimiter used between the namespace abbreviation and the property name
- */
+ /** The common delimiter used between the namespace abbreviation and the property name */
String NAMESPACE_PREFIX_DELIMITER = ":";
/**
- * Use this to prefix metadata properties that store information
- * about the parsing process. Users should be able to distinguish
- * between metadata that was contained within the document and
- * metadata about the parsing process.
+ * Use this to prefix metadata properties that store information about the parsing process.
+ * Users should be able to distinguish between metadata that was contained within the document
+ * and metadata about the parsing process.
*/
String TIKA_META_PREFIX = "X-TIKA" + NAMESPACE_PREFIX_DELIMITER;
+
Property EMBEDDED_DEPTH = Property.internalInteger(TIKA_META_PREFIX + "embedded_depth");
/**
- * This tracks the embedded file paths based on the name of embedded files
- * where available. There is a small risk that there may be path collisions
- * and that these paths may not be unique within a file.
+ * This tracks the embedded file paths based on the name of embedded files where available.
+ * There is a small risk that there may be path collisions and that these paths may not be
+ * unique within a file.
*
- * For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
+ * <p>For a more robust path, see {@link TikaCoreProperties#EMBEDDED_ID_PATH}.
*/
Property EMBEDDED_RESOURCE_PATH =
Property.internalText(TIKA_META_PREFIX + "embedded_resource_path");
/**
- * This tracks the embedded file paths based on the embedded file's
- * {@link TikaCoreProperties#EMBEDDED_ID}.
+ * This tracks the embedded file paths based on the embedded file's {@link
+ * TikaCoreProperties#EMBEDDED_ID}.
*/
- Property EMBEDDED_ID_PATH =
- Property.internalText(TIKA_META_PREFIX + "embedded_id_path");
+ Property EMBEDDED_ID_PATH = Property.internalText(TIKA_META_PREFIX + "embedded_id_path");
- /**
- * This is a 1-index counter for embedded files, used by the RecursiveParserWrapper
- */
- Property EMBEDDED_ID =
- Property.internalInteger(TIKA_META_PREFIX + "embedded_id");
+ /** This is a 1-indexed counter for embedded files, used by the RecursiveParserWrapper */
+ Property EMBEDDED_ID = Property.internalInteger(TIKA_META_PREFIX + "embedded_id");
Property PARSE_TIME_MILLIS = Property.internalText(TIKA_META_PREFIX + "parse_time_millis");
- /**
- * Simple class name of the content handler
- */
+
+ /** Simple class name of the content handler */
Property TIKA_CONTENT_HANDLER = Property.internalText(TIKA_META_PREFIX + "content_handler");
+
Property TIKA_CONTENT = Property.internalText(TIKA_META_PREFIX + "content");
- /**
- * Use this to store parse exception information in the Metadata object.
- */
+
+ /** Use this to store parse exception information in the Metadata object. */
String TIKA_META_EXCEPTION_PREFIX = TIKA_META_PREFIX + "EXCEPTION" + NAMESPACE_PREFIX_DELIMITER;
- /**
- * Use this to store warnings that happened during the parse.
- */
+ /** Use this to store warnings that happened during the parse. */
String TIKA_META_WARN_PREFIX = TIKA_META_PREFIX + "WARN" + NAMESPACE_PREFIX_DELIMITER;
- //exception in main file
+ // exception in main file
Property CONTAINER_EXCEPTION =
Property.internalText(TIKA_META_EXCEPTION_PREFIX + "container_exception");
- //exception in an embedded file
+ // exception in an embedded file
Property EMBEDDED_EXCEPTION =
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
- //exception handling the raw bytes of an embedded file by an EmbeddedDocumentByteStore
+ // exception handling the raw bytes of an embedded file by an EmbeddedDocumentByteStore
Property EMBEDDED_BYTES_EXCEPTION =
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_bytes_exception");
- //warning while parsing in an embedded file
+ // warning while parsing in an embedded file
Property EMBEDDED_WARNING =
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_warning");
Property WRITE_LIMIT_REACHED =
Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
+
/**
- * Use this to store exceptions caught during a parse that are
- * non-fatal, e.g. if a parser is in lenient mode and more
- * content can be extracted if we ignore an exception thrown by
- * a dependency.
+ * Use this to store exceptions caught during a parse that are non-fatal, e.g. if a parser is in
+ * lenient mode and more content can be extracted if we ignore an exception thrown by a
+ * dependency.
*/
Property TIKA_META_EXCEPTION_WARNING =
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "warn");
/**
- * This means that metadata keys or metadata values were truncated.
- * If there is an "include" filter, this should not be set if
- * a field is not in the "include" set.
+ * This means that metadata keys or metadata values were truncated. If there is an "include"
+ * filter, this should not be set if a field is not in the "include" set.
*/
Property TRUNCATED_METADATA =
Property.internalBoolean(TIKA_META_WARN_PREFIX + "truncated_metadata");
/**
- * Use this to store exceptions caught while trying to read the
- * stream of an embedded resource. Do not use this if there is
- * a parse exception on the embedded resource.
+ * Use this to store exceptions caught while trying to read the stream of an embedded resource.
+ * Do not use this if there is a parse exception on the embedded resource.
*/
Property TIKA_META_EXCEPTION_EMBEDDED_STREAM =
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_stream_exception");
+
Property TIKA_PARSED_BY = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By");
/**
- * Use this to store a record of all parsers that touched a given file
- * in the container file's metadata.
+ * Use this to store a record of all parsers that touched a given file in the container file's
+ * metadata.
*/
- Property TIKA_PARSED_BY_FULL_SET = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set");
+ Property TIKA_PARSED_BY_FULL_SET =
+ Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By-Full-Set");
- Property TIKA_DETECTED_LANGUAGE = Property.externalTextBag(TIKA_META_PREFIX +
- "detected_language");
+ Property TIKA_DETECTED_LANGUAGE =
+ Property.externalTextBag(TIKA_META_PREFIX + "detected_language");
- Property TIKA_DETECTED_LANGUAGE_CONFIDENCE = Property.externalTextBag(TIKA_META_PREFIX +
- "detected_language_confidence");
+ Property TIKA_DETECTED_LANGUAGE_CONFIDENCE =
+ Property.externalTextBag(TIKA_META_PREFIX + "detected_language_confidence");
- Property TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW = Property.externalRealSeq(TIKA_META_PREFIX +
- "detected_language_confidence_raw");
+ Property TIKA_DETECTED_LANGUAGE_CONFIDENCE_RAW =
+ Property.externalRealSeq(TIKA_META_PREFIX + "detected_language_confidence_raw");
String RESOURCE_NAME_KEY = "resourceName";
String PROTECTED = "protected";
String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
String EMBEDDED_STORAGE_CLASS_ID = "embeddedStorageClassId";
String EMBEDDED_RESOURCE_TYPE_KEY = "embeddedResourceType";
+
/**
- * Some file formats can store information about their original
- * file name/location or about their attachment's original file name/location
- * within the file.
+ * Some file formats can store information about their original file name/location or about
+ * their attachment's original file name/location within the file.
*/
Property ORIGINAL_RESOURCE_NAME =
Property.internalTextBag(TIKA_META_PREFIX + "origResourceName");
+
/**
- * This should be used to store the path (relative or full)
- * of the source file, including the file name,
- * e.g. doc/path/to/my_pdf.pdf
- * <p>
- * This can also be used for a primary key within a database.
+ * This should be used to store the path (relative or full) of the source file, including the
+ * file name, e.g. doc/path/to/my_pdf.pdf
+ *
+ * <p>This can also be used for a primary key within a database.
*/
Property SOURCE_PATH = Property.internalText(TIKA_META_PREFIX + "sourcePath");
+
/**
- * This is currently used to identify Content-Type that may be
- * included within a document, such as in html documents
- * (e.g. <meta http-equiv="content-type" content="text/html; charset=UTF-8">)
- * , or the value might come from outside the document. This information
- * may be faulty and should be treated only as a hint.
+ * This is currently used to identify Content-Type that may be included within a document, such
+ * as in html documents (e.g. <meta http-equiv="content-type" content="text/html;
+ * charset=UTF-8">), or the value might come from outside the document. This information may be
+ * faulty and should be treated only as a hint.
*/
Property CONTENT_TYPE_HINT = Property.internalText(HttpHeaders.CONTENT_TYPE + "-Hint");
- /**
- * This is used by users to override detection with the override detector.
- */
+
+ /** This is used by users to override detection with the override detector. */
Property CONTENT_TYPE_USER_OVERRIDE =
Property.internalText(HttpHeaders.CONTENT_TYPE + "-Override");
+
/**
- * This is used by parsers to override detection of embedded resources
- * with the override detector.
+ * This is used by parsers to override detection of embedded resources with the override
+ * detector.
*/
Property CONTENT_TYPE_PARSER_OVERRIDE =
Property.internalText(HttpHeaders.CONTENT_TYPE + "-Parser-Override");
+
/**
* @see DublinCore#FORMAT
*/
Property FORMAT = DublinCore.FORMAT;
+
/**
* @see DublinCore#IDENTIFIER
*/
Property IDENTIFIER = DublinCore.IDENTIFIER;
+
/**
* @see DublinCore#CONTRIBUTOR
*/
Property CONTRIBUTOR = DublinCore.CONTRIBUTOR;
+
/**
* @see DublinCore#COVERAGE
*/
Property COVERAGE = DublinCore.COVERAGE;
+
/**
* @see DublinCore#CREATOR
*/
Property CREATOR = DublinCore.CREATOR;
+
/**
* @see Office#LAST_AUTHOR
*/
Property MODIFIER = Office.LAST_AUTHOR;
+
/**
* @see XMP#CREATOR_TOOL
*/
Property CREATOR_TOOL = XMP.CREATOR_TOOL;
+
/**
* @see DublinCore#LANGUAGE
*/
Property LANGUAGE = DublinCore.LANGUAGE;
+
/**
* @see DublinCore#PUBLISHER
*/
Property PUBLISHER = DublinCore.PUBLISHER;
+
/**
* @see DublinCore#RELATION
*/
Property RELATION = DublinCore.RELATION;
+
/**
* @see DublinCore#RIGHTS
*/
Property RIGHTS = DublinCore.RIGHTS;
+
/**
* @see DublinCore#SOURCE
*/
Property SOURCE = DublinCore.SOURCE;
+
/**
* @see DublinCore#TYPE
*/
Property TYPE = DublinCore.TYPE;
+
/**
* @see DublinCore#TITLE
*/
@@ -250,12 +253,13 @@
* @see DublinCore#DESCRIPTION
*/
Property DESCRIPTION = DublinCore.DESCRIPTION;
+
/**
- * {@link DublinCore#SUBJECT}; should include both subject and keywords
- * if a document format has both. See also {@link Office#KEYWORDS}
- * and {@link OfficeOpenXMLCore#SUBJECT}.
+ * {@link DublinCore#SUBJECT}; should include both subject and keywords if a document format has
+ * both. See also {@link Office#KEYWORDS} and {@link OfficeOpenXMLCore#SUBJECT}.
*/
Property SUBJECT = DublinCore.SUBJECT;
+
/**
* @see DublinCore#DATE
*/
@@ -267,38 +271,41 @@
* @see Office#SAVE_DATE
*/
Property MODIFIED = DublinCore.MODIFIED;
+
/**
* @see Office#PRINT_DATE
*/
Property PRINT_DATE = Office.PRINT_DATE;
+
/**
* @see XMP#METADATA_DATE
*/
Property METADATA_DATE = XMP.METADATA_DATE;
+
/**
* @see Geographic#LATITUDE
*/
Property LATITUDE = Geographic.LATITUDE;
-
// Geographic related properties
/**
* @see Geographic#LONGITUDE
*/
Property LONGITUDE = Geographic.LONGITUDE;
+
/**
* @see Geographic#ALTITUDE
*/
Property ALTITUDE = Geographic.ALTITUDE;
+
/**
* @see XMP#RATING
*/
Property RATING = XMP.RATING;
/**
- * This is the number of images (as in a multi-frame gif) returned by
- * Java's {@link javax.imageio.ImageReader#getNumImages(boolean)}. See
- * the javadocs for known limitations.
+ * This is the number of images (as in a multi-frame gif) returned by Java's {@link
+ * javax.imageio.ImageReader#getNumImages(boolean)}. See the javadocs for known limitations.
*/
Property NUM_IMAGES = Property.internalInteger("imagereader:NumImages");
@@ -307,13 +314,18 @@
* @see OfficeOpenXMLExtended#COMMENTS
*/
Property COMMENTS = OfficeOpenXMLExtended.COMMENTS;
- /**
- * Embedded resource type property
- */
- Property EMBEDDED_RESOURCE_TYPE = Property.internalClosedChoise(EMBEDDED_RESOURCE_TYPE_KEY,
- EmbeddedResourceType.ATTACHMENT.toString(), EmbeddedResourceType.INLINE.toString(),
- EmbeddedResourceType.METADATA.toString(), EmbeddedResourceType.MACRO.toString(),
- EmbeddedResourceType.THUMBNAIL.toString(), EmbeddedResourceType.RENDERING.toString());
+
+ /** Embedded resource type property */
+ Property EMBEDDED_RESOURCE_TYPE =
+ Property.internalClosedChoise(
+ EMBEDDED_RESOURCE_TYPE_KEY,
+ EmbeddedResourceType.ATTACHMENT.toString(),
+ EmbeddedResourceType.INLINE.toString(),
+ EmbeddedResourceType.METADATA.toString(),
+ EmbeddedResourceType.MACRO.toString(),
+ EmbeddedResourceType.THUMBNAIL.toString(),
+ EmbeddedResourceType.RENDERING.toString());
+
Property HAS_SIGNATURE = Property.internalBoolean("hasSignature");
Property SIGNATURE_NAME = Property.internalTextBag("signature:name");
@@ -323,69 +335,68 @@
Property SIGNATURE_FILTER = Property.internalTextBag("signature:filter");
Property SIGNATURE_CONTACT_INFO = Property.internalTextBag("signature:contact-info");
- //is the file encrypted
+ // is the file encrypted
Property IS_ENCRYPTED = Property.internalBoolean(TIKA_META_PREFIX + "encrypted");
/**
* When an EncodingDetector detects an encoding, the encoding should be stored in this field.
* This is different from {@link Metadata#CONTENT_ENCODING} because that is what a parser
- * chooses to use for processing a file. If an EncodingDetector returns "null", a parser
- * may choose to use a default encoding. We want to differentiate between a parser using a
- * default encoding and the output of an EncodingDetector.
+ * chooses to use for processing a file. If an EncodingDetector returns "null", a parser may
+ * choose to use a default encoding. We want to differentiate between a parser using a default
+ * encoding and the output of an EncodingDetector.
*/
Property DETECTED_ENCODING = Property.externalText(TIKA_META_PREFIX + "detectedEncoding");
-
/**
- * This should be the simple class name for the EncodingDetectors whose detected encoding
- * was used in the parse.
+ * This should be the simple class name for the EncodingDetectors whose detected encoding was
+ * used in the parse.
*/
Property ENCODING_DETECTOR = Property.externalText(TIKA_META_PREFIX + "encodingDetector");
/**
- * General metadata key for the count of non-final versions available within a file. This
- * was added initially to support generalizing incremental updates in PDF.
+ * General metadata key for the count of non-final versions available within a file. This was
+ * added initially to support generalizing incremental updates in PDF.
*/
Property VERSION_COUNT = Property.externalInteger(TIKA_META_PREFIX + "versionCount");
/**
- * General metadata key for the version number of a given file that contains
- * earlier versions within it. This number is 0-indexed for the earliest version.
- * The latest version does not have this metadata value. This was added initially
- * to support generalizing incremental updates in PDF.
+ * General metadata key for the version number of a given file that contains earlier versions
+ * within it. This number is 0-indexed for the earliest version. The latest version does not
+ * have this metadata value. This was added initially to support generalizing incremental
+ * updates in PDF.
*/
Property VERSION_NUMBER = Property.externalInteger(TIKA_META_PREFIX + "versionNumber");
Property PIPES_RESULT = Property.externalText(TIKA_META_PREFIX + "pipes_result");
+
/**
- * A file might contain different types of embedded documents.
- * The most common is the ATTACHMENT.
- * <p>
- * An INLINE embedded resource should be used for embedded image
- * files that are used to render the page image (as in PDXObjImages in PDF files).
- * <p>
- * A MACRO is code that is embedded in the document and is intended
- * to be executable within the application that opens the document. This
- * includes traditional macros within Microsoft Office files and
- * javascript within PDFActions. This would not include, e.g., an
- * .exe file embedded in a .zip file.
- * <p>
- * A VERSION is an earlier version of the file as in incremental updates.
- * The initial use case for this is incremental updates in PDFs, but
- * it could be applied to other file formats as well where earlier versions
- * are recoverable. See also {@link PDF#INCREMENTAL_UPDATE_NUMBER}
- * <p>
- * Not all parsers have yet implemented this.
+ * A file might contain different types of embedded documents. The most common is the
+ * ATTACHMENT.
+ *
+ * <p>An INLINE embedded resource should be used for embedded image files that are used to
+ * render the page image (as in PDXObjImages in PDF files).
+ *
+ * <p>A MACRO is code that is embedded in the document and is intended to be executable within
+ * the application that opens the document. This includes traditional macros within Microsoft
+ * Office files and javascript within PDFActions. This would not include, e.g., an .exe file
+ * embedded in a .zip file.
+ *
+ * <p>A VERSION is an earlier version of the file as in incremental updates. The initial use
+ * case for this is incremental updates in PDFs, but it could be applied to other file formats
+ * as well where earlier versions are recoverable. See also {@link
+ * PDF#INCREMENTAL_UPDATE_NUMBER}
+ *
+ * <p>Not all parsers have yet implemented this.
*/
enum EmbeddedResourceType {
- INLINE, //image that is intended to be displayed in a rendering of the file
- ATTACHMENT,//standard attachment as in email
- MACRO, //any code that is intended to be run by the application
- METADATA, //e.g. xmp, xfa
- FONT,//embedded font files
- THUMBNAIL, //TODO: set this in parsers that handle thumbnails
- RENDERING, //if a file has been rendered
- VERSION, //an earlier version of a file
- ALTERNATE_FORMAT_CHUNK //OOXML inline alternate format chunk
+ INLINE, // image that is intended to be displayed in a rendering of the file
+ ATTACHMENT, // standard attachment as in email
+ MACRO, // any code that is intended to be run by the application
+ METADATA, // e.g. xmp, xfa
+ FONT, // embedded font files
+ THUMBNAIL, // TODO: set this in parsers that handle thumbnails
+ RENDERING, // if a file has been rendered
+ VERSION, // an earlier version of a file
+ ALTERNATE_FORMAT_CHUNK // OOXML inline alternate format chunk
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java
index 7ae685e..775a20d 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaMimeKeys.java
@@ -16,13 +16,10 @@
*/
package org.apache.tika.metadata;
-/**
- * A collection of Tika metadata keys used in Mime Type resolution
- */
+/** A collection of Tika metadata keys used in Mime Type resolution */
public interface TikaMimeKeys {
String TIKA_MIME_FILE = "tika.mime.file";
String MIME_TYPE_MAGIC = "mime.type.magic";
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java
index e4bf145..3587131 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaPagedText.java
@@ -17,19 +17,16 @@
package org.apache.tika.metadata;
/**
- * Metadata properties for paged text, metadata appropriate
- * for an individual page (useful for embedded document handlers
- * called on individual pages).
+ * Metadata properties for paged text, metadata appropriate for an individual page (useful for
+ * embedded document handlers called on individual pages).
*
- * Use {@link PagedText} where possible
+ * <p>Use {@link PagedText} where possible
*/
public interface TikaPagedText {
String TIKA_PAGED_TEXT_PREFIX = "tika_pg:";
- /**
- * 1-based page number for a specific page
- */
+
+ /** 1-based page number for a specific page */
Property PAGE_NUMBER = Property.internalInteger(TIKA_PAGED_TEXT_PREFIX + "page_number");
Property PAGE_ROTATION = Property.internalRational(TIKA_PAGED_TEXT_PREFIX + "page_rotation");
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/WARC.java b/tika-core/src/main/java/org/apache/tika/metadata/WARC.java
index 359236b..9c01757 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/WARC.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/WARC.java
@@ -27,5 +27,5 @@
Property WARC_RECORD_ID = Property.externalText(PREFIX + "WARC-Record-ID");
- //TODO: lots
+ // TODO: lots
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java b/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java
index 4fd37f0..750cc05 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/WordPerfect.java
@@ -25,46 +25,52 @@
public interface WordPerfect {
String WORDPERFECT_METADATA_NAME_PREFIX = "wordperfect";
- /**
- * File size as defined in document header.
- */
- Property FILE_SIZE = Property.internalText(
- WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "FileSize");
- /**
- * File identifier.
- */
- Property FILE_ID = Property.internalText(
- WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "FileId");
- /**
- * Product type.
- */
- Property PRODUCT_TYPE = Property.internalInteger(
- WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "ProductType");
- /**
- * File type.
- */
- Property FILE_TYPE = Property.internalInteger(
- WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "FileType");
- /**
- * Major version.
- */
- Property MAJOR_VERSION = Property.internalInteger(
- WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "MajorVersion");
- /**
- * Minor version.
- */
- Property MINOR_VERSION = Property.internalInteger(
- WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "MinorVersion");
- /**
- * Is encrypted?.
- */
- Property ENCRYPTED = Property.internalBoolean(
- WORDPERFECT_METADATA_NAME_PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER +
- "Encrypted");
+ /** File size as defined in document header. */
+ Property FILE_SIZE =
+ Property.internalText(
+ WORDPERFECT_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "FileSize");
+
+ /** File identifier. */
+ Property FILE_ID =
+ Property.internalText(
+ WORDPERFECT_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "FileId");
+
+ /** Product type. */
+ Property PRODUCT_TYPE =
+ Property.internalInteger(
+ WORDPERFECT_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "ProductType");
+
+ /** File type. */
+ Property FILE_TYPE =
+ Property.internalInteger(
+ WORDPERFECT_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "FileType");
+
+ /** Major version. */
+ Property MAJOR_VERSION =
+ Property.internalInteger(
+ WORDPERFECT_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "MajorVersion");
+
+ /** Minor version. */
+ Property MINOR_VERSION =
+ Property.internalInteger(
+ WORDPERFECT_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "MinorVersion");
+
+ /** Is encrypted?. */
+ Property ENCRYPTED =
+ Property.internalBoolean(
+ WORDPERFECT_METADATA_NAME_PREFIX
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + "Encrypted");
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
index bca38d4..ba3dcb6 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMP.java
@@ -22,39 +22,30 @@
String PREFIX = "xmp";
- /**
- * The xmp prefix followed by the colon delimiter
- */
+ /** The xmp prefix followed by the colon delimiter */
String PREFIX_ = PREFIX + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER;
- /**
- * Unordered text strings of advisories.
- */
+ /** Unordered text strings of advisories. */
Property ABOUT = Property.externalTextBag(PREFIX_ + "About");
- /**
- * Unordered text strings of advisories.
- */
+ /** Unordered text strings of advisories. */
Property ADVISORY = Property.externalTextBag(PREFIX_ + "Advisory");
/**
- * The date and time the resource was created. For a digital file, this need not
- * match a file-system creation time. For a freshly created resource, it should
- * be close to that time, modulo the time taken to write the file. Later file
- * transfer, copying, and so on, can make the file-system time arbitrarily different.
+ * The date and time the resource was created. For a digital file, this need not match a
+ * file-system creation time. For a freshly created resource, it should be close to that time,
+ * modulo the time taken to write the file. Later file transfer, copying, and so on, can make
+ * the file-system time arbitrarily different.
*/
Property CREATE_DATE = Property.externalDate(PREFIX_ + "CreateDate");
- /**
- * The name of the first known tool used to create the resource.
- */
+ /** The name of the first known tool used to create the resource. */
Property CREATOR_TOOL = Property.externalText(PREFIX_ + "CreatorTool");
/**
- * An unordered array of text strings that unambiguously identify the resource
- * within a given context. An array item may be qualified with xmpidq:Scheme
- * (see 8.7, “xmpidq namespace”) to denote the formal identification system to
- * which that identifier conforms.
+ * An unordered array of text strings that unambiguously identify the resource within a given
+ * context. An array item may be qualified with xmpidq:Scheme (see 8.7, “xmpidq namespace”) to
+ * denote the formal identification system to which that identifier conforms.
*/
Property IDENTIFIER = Property.externalTextBag(PREFIX_ + "Identifier");
@@ -64,26 +55,21 @@
Property LABEL = Property.externalText(PREFIX_ + "Label");
/**
- * The date and time that any metadata for this resource was last changed. It
- * should be the same as or more recent than xmp:ModifyDate
+ * The date and time that any metadata for this resource was last changed. It should be the same
+ * as or more recent than xmp:ModifyDate
*/
Property METADATA_DATE = Property.externalDate(PREFIX_ + "MetadataDate");
- /**
- * The date and time the resource was last modified.
- */
+ /** The date and time the resource was last modified. */
Property MODIFY_DATE = Property.externalDate(PREFIX_ + "ModifyDate");
- /**
- * A word or short phrase that represents the nick name fo the file
- */
+ /** A word or short phrase that represents the nick name of the file */
Property NICKNAME = Property.externalText(PREFIX_ + "NickName");
/**
- * A user-assigned rating for this file. The value shall be -1 or in the range
- * [0..5], where -1 indicates “rejected” and 0 indicates “unrated”. If xmp:Rating
- * is not present, a value of 0 should be assumed.
+ * A user-assigned rating for this file. The value shall be -1 or in the range [0..5], where -1
+ * indicates “rejected” and 0 indicates “unrated”. If xmp:Rating is not present, a value of 0
+ * should be assumed.
*/
Property RATING = Property.externalInteger(PREFIX_ + "Rating");
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java
index d7faa44..e4b43f2 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPDM.java
@@ -19,321 +19,279 @@
import java.util.Date;
/**
- * XMP Dynamic Media schema. This is a collection of
- * {@link Property property definition} constants for the dynamic media
- * properties defined in the XMP standard.
+ * XMP Dynamic Media schema. This is a collection of {@link Property property definition} constants
+ * for the dynamic media properties defined in the XMP standard.
*
- * @see <a href="http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/xmp/pdfs/cc-201306/XMPSpecificationPart2.pdf"
- * >XMP Specification, Part 2: Standard Schemas</a>
+ * @see <a
+ * href="http://wwwimages.adobe.com/content/dam/Adobe/en/devnet/xmp/pdfs/cc-201306/XMPSpecificationPart2.pdf"
+ * >XMP Specification, Part 2: Standard Schemas</a>
* @since Apache Tika 0.7
*/
public interface XMPDM {
- /**
- * "The absolute path to the file's peak audio file. If empty, no peak
- * file exists."
- */
+ /** "The absolute path to the file's peak audio file. If empty, no peak file exists." */
Property ABS_PEAK_AUDIO_FILE_PATH = Property.internalURI("xmpDM:absPeakAudioFilePath");
- /**
- * "The name of the album."
- */
+ /** "The name of the album." */
Property ALBUM = Property.externalText("xmpDM:album");
/**
- * "An alternative tape name, set via the project window or timecode
- * dialog in Premiere. If an alternative name has been set and has not
- * been reverted, that name is displayed."
+ * "An alternative tape name, set via the project window or timecode dialog in Premiere. If an
+ * alternative name has been set and has not been reverted, that name is displayed."
*/
Property ALT_TAPE_NAME = Property.externalText("xmpDM:altTapeName");
-// /**
-// * "A timecode set by the user. When specified, it is used instead
-// * of the startTimecode."
-// */
-// Property ALT_TIMECODE = "xmpDM:altTimecode";
+ // /**
+ // * "A timecode set by the user. When specified, it is used instead
+ // * of the startTimecode."
+ // */
+ // Property ALT_TIMECODE = "xmpDM:altTimecode";
- /**
- * "The name of the artist or artists."
- */
+ /** "The name of the artist or artists." */
Property ARTIST = Property.externalText("xmpDM:artist");
- /**
- * "The name of the album artist or group for compilation albums."
- */
+ /** "The name of the album artist or group for compilation albums." */
Property ALBUM_ARTIST = Property.externalText("xmpDM:albumArtist");
- /**
- * "The date and time when the audio was last modified."
- */
+ /** "The date and time when the audio was last modified." */
Property AUDIO_MOD_DATE = Property.internalDate("xmpDM:audioModDate");
- /**
- * "The audio sample rate. Can be any value, but commonly 32000, 41100,
- * or 48000."
- */
+ /** "The audio sample rate. Can be any value, but commonly 32000, 44100, or 48000." */
Property AUDIO_SAMPLE_RATE = Property.internalInteger("xmpDM:audioSampleRate");
- /**
- * "The audio sample type."
- */
+ /** "The audio sample type." */
Property AUDIO_SAMPLE_TYPE =
- Property.internalClosedChoise("xmpDM:audioSampleType", "8Int", "16Int", "32Int",
- "32Float");
+ Property.internalClosedChoise(
+ "xmpDM:audioSampleType", "8Int", "16Int", "32Int", "32Float");
- /**
- * "The audio channel type."
- */
+ /** "The audio channel type." */
Property AUDIO_CHANNEL_TYPE =
Property.internalClosedChoise("xmpDM:audioChannelType", "Mono", "Stereo", "5.1", "7.1");
- /**
- * "The audio compression used. For example, MP3."
- */
+
+ /** "The audio compression used. For example, MP3." */
Property AUDIO_COMPRESSOR = Property.internalText("xmpDM:audioCompressor");
- /**
- * "An album created by various artists."
- */
+
+ /** "An album created by various artists." */
Property COMPILATION = Property.externalInteger("xmpDM:compilation");
-// /**
-// * "Additional parameters for Beat Splice stretch mode."
-// */
-// Property BEAT_SPLICE_PARAMS = "xmpDM:beatSpliceParams";
- /**
- * "The composer's name."
- */
+ // /**
+ // * "Additional parameters for Beat Splice stretch mode."
+ // */
+ // Property BEAT_SPLICE_PARAMS = "xmpDM:beatSpliceParams";
+ /** "The composer's name." */
Property COMPOSER = Property.externalText("xmpDM:composer");
- /**
- * "The copyright information."
- */
+
+ /** "The copyright information." */
Property COPYRIGHT = Property.externalText("xmpDM:copyright");
-// /**
-// * "An unordered list of all media used to create this media."
-// */
-// Property CONTRIBUTED_MEDIA = "xmpDM:contributedMedia";
- /**
- * "The disc number for part of an album set."
- */
+ // /**
+ // * "An unordered list of all media used to create this media."
+ // */
+ // Property CONTRIBUTED_MEDIA = "xmpDM:contributedMedia";
+ /** "The disc number for part of an album set." */
Property DISC_NUMBER = Property.externalInteger("xmpDM:discNumber");
- /**
- * "The duration of the media file."
- * Value is in Seconds, unless xmpDM:scale is also set.
- */
- Property DURATION = Property.externalReal("xmpDM:duration");
- /**
- * "The engineer's name."
- */
- Property ENGINEER = Property.externalText("xmpDM:engineer");
- /**
- * "The file data rate in megabytes per second. For example:
- * '36/10' = 3.6 MB/sec"
- */
- Property FILE_DATA_RATE = Property.internalRational("xmpDM:fileDataRate");
- /**
- * "The name of the genre."
- */
- Property GENRE = Property.externalText("xmpDM:genre");
- /**
- * "The musical instrument."
- */
- Property INSTRUMENT = Property.externalText("xmpDM:instrument");
- /**
- * "The audio's musical key."
- */
- Property KEY =
- Property.internalClosedChoise("xmpDM:key", "C", "C#", "D", "D#", "E", "F", "F#", "G",
- "G#", "A", "A#", "B");
-// /**
-// * "The duration of lead time for queuing music."
-// */
-// Property INTRO_TIME = "xmpDM:introTime";
- /**
- * "User's log comments."
- */
+ /** "The duration of the media file." Value is in Seconds, unless xmpDM:scale is also set. */
+ Property DURATION = Property.externalReal("xmpDM:duration");
+
+ /** "The engineer's name." */
+ Property ENGINEER = Property.externalText("xmpDM:engineer");
+
+ /** "The file data rate in megabytes per second. For example: '36/10' = 3.6 MB/sec" */
+ Property FILE_DATA_RATE = Property.internalRational("xmpDM:fileDataRate");
+
+ /** "The name of the genre." */
+ Property GENRE = Property.externalText("xmpDM:genre");
+
+ /** "The musical instrument." */
+ Property INSTRUMENT = Property.externalText("xmpDM:instrument");
+
+ /** "The audio's musical key." */
+ Property KEY =
+ Property.internalClosedChoise(
+ "xmpDM:key", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B");
+
+ // /**
+ // * "The duration of lead time for queuing music."
+ // */
+ // Property INTRO_TIME = "xmpDM:introTime";
+ /** "User's log comments." */
Property LOG_COMMENT = Property.externalText("xmpDM:logComment");
- /**
- * "When true, the clip can be looped seamlessly."
- */
+
+ /** "When true, the clip can be looped seamlessly." */
Property LOOP = Property.internalBoolean("xmpDM:loop");
- /**
- * "The number of beats."
- */
+
+ /** "The number of beats." */
Property NUMBER_OF_BEATS = Property.internalReal("xmpDM:numberOfBeats");
- /**
- * "The date and time when the metadata was last modified."
- */
+
+ /** "The date and time when the metadata was last modified." */
Property METADATA_MOD_DATE = Property.internalDate("xmpDM:metadataModDate");
-// /**
-// * An ordered list of markers. See also {@link #TRACKS xmpDM:Tracks}.
-// */
-// Property MARKERS = "xmpDM:markers";
- /**
- * "The sampling phase of film to be converted to video (pull-down)."
- */
+ // /**
+ // * An ordered list of markers. See also {@link #TRACKS xmpDM:Tracks}.
+ // */
+ // Property MARKERS = "xmpDM:markers";
+ /** "The sampling phase of film to be converted to video (pull-down)." */
Property PULL_DOWN =
- Property.internalClosedChoise("xmpDM:pullDown", "WSSWW", "SSWWW", "SWWWS", "WWWSS",
- "WWSSW", "WSSWW_24p", "SSWWW_24p", "SWWWS_24p", "WWWSS_24p", "WWSSW_24p");
+ Property.internalClosedChoise(
+ "xmpDM:pullDown",
+ "WSSWW",
+ "SSWWW",
+ "SWWWS",
+ "WWWSS",
+ "WWSSW",
+ "WSSWW_24p",
+ "SSWWW_24p",
+ "SWWWS_24p",
+ "WWWSS_24p",
+ "WWSSW_24p");
-// /**
-// * "The time at which to fade out."
-// */
-// Property OUT_CUE = "xmpDM:outCue";
+ // /**
+ // * "The time at which to fade out."
+ // */
+ // Property OUT_CUE = "xmpDM:outCue";
-// /**
-// * "A reference to the project that created this file."
-// */
-// Property PROJECT_REF = "xmpDM:projectRef";
- /**
- * "The relative path to the file's peak audio file. If empty, no peak
- * file exists."
- */
+ // /**
+ // * "A reference to the project that created this file."
+ // */
+ // Property PROJECT_REF = "xmpDM:projectRef";
+ /** "The relative path to the file's peak audio file. If empty, no peak file exists." */
Property RELATIVE_PEAK_AUDIO_FILE_PATH =
Property.internalURI("xmpDM:relativePeakAudioFilePath");
- /**
- * "The date the title was released."
- */
+
+ /** "The date the title was released." */
Property RELEASE_DATE = Property.externalDate("xmpDM:releaseDate");
-// /**
-// * "The start time of the media inside the audio project."
-// */
-// Property RELATIVE_TIMESTAMP = "xmpDM:relativeTimestamp";
+ // /**
+ // * "The start time of the media inside the audio project."
+ // */
+ // Property RELATIVE_TIMESTAMP = "xmpDM:relativeTimestamp";
/**
- * "The musical scale used in the music. 'Neither' is most often used
- * for instruments with no associated scale, such as drums."
+ * "The musical scale used in the music. 'Neither' is most often used for instruments with no
+ * associated scale, such as drums."
*/
Property SCALE_TYPE =
Property.internalClosedChoise("xmpDM:scaleType", "Major", "Minor", "Both", "Neither");
-// /**
-// * "Additional parameters for Resample stretch mode."
-// */
-// Property RESAMPLE_PARAMS = "xmpDM:resampleParams";
- /**
- * "The name of the scene."
- */
+ // /**
+ // * "Additional parameters for Resample stretch mode."
+ // */
+ // Property RESAMPLE_PARAMS = "xmpDM:resampleParams";
+ /** "The name of the scene." */
Property SCENE = Property.externalText("xmpDM:scene");
- /**
- * "The date and time when the video was shot."
- */
+
+ /** "The date and time when the video was shot." */
Property SHOT_DATE = Property.externalDate("xmpDM:shotDate");
+
/**
- * "The name of the location where the video was shot. For example:
- * 'Oktoberfest, Munich, Germany'. For more accurate positioning,
- * use the EXIF GPS values."
+ * "The name of the location where the video was shot. For example: 'Oktoberfest, Munich,
+ * Germany'. For more accurate positioning, use the EXIF GPS values."
*/
Property SHOT_LOCATION = Property.externalText("xmpDM:shotLocation");
- /**
- * "The name of the shot or take."
- */
+
+ /** "The name of the shot or take." */
Property SHOT_NAME = Property.externalText("xmpDM:shotName");
+
/**
- * "A description of the speaker angles from center front in degrees.
- * For example: 'Left = -30, Right = 30, Center = 0, LFE = 45,
- * Left Surround = -110, Right Surround = 110'"
+ * "A description of the speaker angles from center front in degrees. For example: 'Left = -30,
+ * Right = 30, Center = 0, LFE = 45, Left Surround = -110, Right Surround = 110'"
*/
Property SPEAKER_PLACEMENT = Property.externalText("xmpDM:speakerPlacement");
- /**
- * "The audio stretch mode."
- */
- Property STRETCH_MODE =
- Property.internalClosedChoise("xmpDM:stretchMode", "Fixed length", "Time-Scale",
- "Resample", "Beat Splice", "Hybrid");
-// /**
-// * "The timecode of the first frame of video in the file, as obtained
-// * from the device control."
-// */
-// Property START_TIMECODE = "xmpDM:startTimecode";
+ /** "The audio stretch mode." */
+ Property STRETCH_MODE =
+ Property.internalClosedChoise(
+ "xmpDM:stretchMode",
+ "Fixed length",
+ "Time-Scale",
+ "Resample",
+ "Beat Splice",
+ "Hybrid");
+
+ // /**
+ // * "The timecode of the first frame of video in the file, as obtained
+ // * from the device control."
+ // */
+ // Property START_TIMECODE = "xmpDM:startTimecode";
/**
- * "The name of the tape from which the clip was captured, as set during
- * the capture process."
+ * "The name of the tape from which the clip was captured, as set during the capture process."
*/
Property TAPE_NAME = Property.externalText("xmpDM:tapeName");
- /**
- * "The audio's tempo."
- */
- Property TEMPO = Property.internalReal("xmpDM:tempo");
- /**
- * "The time signature of the music."
- */
- Property TIME_SIGNATURE =
- Property.internalClosedChoise("xmpDM:timeSignature", "2/4", "3/4", "4/4", "5/4", "7/4",
- "6/8", "9/8", "12/8", "other");
-// /**
-// * "Additional parameters for Time-Scale stretch mode."
-// */
-// Property TIME_SCALE_PARAMS = "xmpDM:timeScaleParams";
- /**
- * "A numeric value indicating the order of the audio file within its
- * original recording."
- */
+ /** "The audio's tempo." */
+ Property TEMPO = Property.internalReal("xmpDM:tempo");
+
+ /** "The time signature of the music." */
+ Property TIME_SIGNATURE =
+ Property.internalClosedChoise(
+ "xmpDM:timeSignature",
+ "2/4",
+ "3/4",
+ "4/4",
+ "5/4",
+ "7/4",
+ "6/8",
+ "9/8",
+ "12/8",
+ "other");
+
+ // /**
+ // * "Additional parameters for Time-Scale stretch mode."
+ // */
+ // Property TIME_SCALE_PARAMS = "xmpDM:timeScaleParams";
+ /** "A numeric value indicating the order of the audio file within its original recording." */
Property TRACK_NUMBER = Property.externalInteger("xmpDM:trackNumber");
- /**
- * "The alpha mode."
- */
+
+ /** "The alpha mode." */
Property VIDEO_ALPHA_MODE =
Property.externalClosedChoise("xmpDM:videoAlphaMode", "straight", "pre-multiplied");
-// /**
-// * "An unordered list of tracks. A track is a named set of markers,
-// * which can specify a frame rate for all markers in the set.
-// * See also {@link #MARKERS xmpDM:markers}."
-// */
-// Property TRACKS = "xmpDM:Tracks";
- /**
- * "When true, unity is clear, when false, it is opaque."
- */
+ // /**
+ // * "An unordered list of tracks. A track is a named set of markers,
+ // * which can specify a frame rate for all markers in the set.
+ // * See also {@link #MARKERS xmpDM:markers}."
+ // */
+ // Property TRACKS = "xmpDM:Tracks";
+ /** "When true, unity is clear, when false, it is opaque." */
Property VIDEO_ALPHA_UNITY_IS_TRANSPARENT =
Property.internalBoolean("xmpDM:videoAlphaUnityIsTransparent");
-// /**
-// * "A color in CMYK or RGB to be used as the pre-multiple color when
-// * alpha mode is pre-multiplied."
-// */
-// Property VIDEO_ALPHA_PREMULTIPLE_COLOR = "xmpDM:videoAlphaPremultipleColor";
- /**
- * "The color space."
- */
+ // /**
+ // * "A color in CMYK or RGB to be used as the pre-multiple color when
+ // * alpha mode is pre-multiplied."
+ // */
+ // Property VIDEO_ALPHA_PREMULTIPLE_COLOR = "xmpDM:videoAlphaPremultipleColor";
+ /** "The color space." */
Property VIDEO_COLOR_SPACE =
Property.internalClosedChoise("xmpDM:videoColorSpace", "sRGB", "CCIR-601", "CCIR-709");
- /**
- * "Video compression used. For example, jpeg."
- */
+
+ /** "Video compression used. For example, jpeg." */
Property VIDEO_COMPRESSOR = Property.internalText("xmpDM:videoCompressor");
- /**
- * "The field order for video."
- */
+
+ /** "The field order for video." */
Property VIDEO_FIELD_ORDER =
Property.internalClosedChoise("xmpDM:videoFieldOrder", "Upper", "Lower", "Progressive");
- /**
- * "The video frame rate."
- */
+
+ /** "The video frame rate." */
Property VIDEO_FRAME_RATE =
Property.internalOpenChoise("xmpDM:videoFrameRate", "24", "NTSC", "PAL");
- /**
- * "The date and time when the video was last modified."
- */
+
+ /** "The date and time when the video was last modified." */
Property VIDEO_MOD_DATE = Property.internalDate("xmpDM:videoModDate");
-// /**
-// * "The frame size. For example: w:720, h: 480, unit:pixels"
-// */
-// Property VIDEO_FRAME_SIZE = "xmpDM:videoFrameSize";
+ // /**
+ // * "The frame size. For example: w:720, h: 480, unit:pixels"
+ // */
+ // Property VIDEO_FRAME_SIZE = "xmpDM:videoFrameSize";
/**
- * "The size in bits of each color component of a pixel. Standard
- * Windows 32-bit pixels have 8 bits per component."
+ * "The size in bits of each color component of a pixel. Standard Windows 32-bit pixels have 8
+ * bits per component."
*/
Property VIDEO_PIXEL_DEPTH =
- Property.internalClosedChoise("xmpDM:videoPixelDepth", "8Int", "16Int", "32Int",
- "32Float");
- /**
- * "The aspect ratio, expressed as wd/ht. For example: '648/720' = 0.9"
- */
+ Property.internalClosedChoise(
+ "xmpDM:videoPixelDepth", "8Int", "16Int", "32Int", "32Float");
+
+ /** "The aspect ratio, expressed as wd/ht. For example: '648/720' = 0.9" */
Property VIDEO_PIXEL_ASPECT_RATIO = Property.internalRational("xmpDM:videoPixelAspectRatio");
/**
@@ -345,9 +303,7 @@
class ChannelTypePropertyConverter {
private static final Property property = AUDIO_CHANNEL_TYPE;
- /**
- * How a standalone converter might work
- */
+ /** How a standalone converter might work */
public static String convert(Object value) {
if (value instanceof String) {
// Assume already done
@@ -368,9 +324,7 @@
return null;
}
- /**
- * How convert+set might work
- */
+ /** How convert+set might work */
public static void convertAndSet(Metadata metadata, Object value) {
if (value instanceof Integer || value instanceof Long) {
metadata.set(property, convert(value));
@@ -386,5 +340,4 @@
}
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java
index 015b065..38131f2 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPIdq.java
@@ -22,15 +22,12 @@
String PREFIX = "xmpidq";
- /**
- * The xmpidq prefix followed by the colon delimiter
- */
+ /** The xmpidq prefix followed by the colon delimiter */
String PREFIX_ = PREFIX + ":";
/**
- * A qualifier providing the name of the formal identification
- * scheme used for an item in the xmp:Identifier array.
+ * A qualifier providing the name of the formal identification scheme used for an item in the
+ * xmp:Identifier array.
*/
Property SCHEME = Property.externalText(PREFIX_ + "Scheme");
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
index 2a81fa2..6901c1b 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPMM.java
@@ -22,86 +22,68 @@
String PREFIX = "xmpMM";
- /**
- * The xmpMM prefix followed by the colon delimiter
- */
+ /** The xmpMM prefix followed by the colon delimiter */
String PREFIX_ = PREFIX + ":";
/**
- * A reference to the resource from which this one is derived.
- * This should be a minimal reference, in which missing
- * components can be assumed to be unchanged.
+ * A reference to the resource from which this one is derived. This should be a minimal
+ * reference, in which missing components can be assumed to be unchanged.
*
- * TODO This property is of type RessourceRef which is a struct
+ * <p>TODO This property is of type ResourceRef which is a struct
*/
-// Property DERIVED_FROM = Property.externalText(PREFIX_ + "DerivedFrom");
+ // Property DERIVED_FROM = Property.externalText(PREFIX_ + "DerivedFrom");
- /**
- * The common identifier for all versions and renditions of a resource.
- */
+ /** The common identifier for all versions and renditions of a resource. */
Property DOCUMENTID = Property.externalText(PREFIX_ + "DocumentID");
/**
- * An identifier for a specific incarnation of a resource, updated
- * each time a file is saved.
+ * An identifier for a specific incarnation of a resource, updated each time a file is saved.
*/
Property INSTANCEID = Property.externalText(PREFIX_ + "InstanceID");
/**
- * The common identifier for the original resource from which
- * the current resource is derived. For example, if you save a
- * resource to a different format, then save that one to another
- * format, each save operation should generate a new
- * xmpMM:DocumentID that uniquely identifies the resource in
- * that format, but should retain the ID of the source file here.
+ * The common identifier for the original resource from which the current resource is derived.
+ * For example, if you save a resource to a different format, then save that one to another
+ * format, each save operation should generate a new xmpMM:DocumentID that uniquely identifies
+ * the resource in that format, but should retain the ID of the source file here.
*/
Property ORIGINAL_DOCUMENTID = Property.externalText(PREFIX_ + "OriginalDocumentID");
/**
- * The rendition class name for this resource. This property
- * should be absent or set to default for a resource that is not
- * a derived rendition
+ * The rendition class name for this resource. This property should be absent or set to default
+ * for a resource that is not a derived rendition
*/
Property RENDITION_CLASS =
- Property.externalOpenChoise(PREFIX_ + "RenditionClass", "default", "draft", "low-res",
- "proof", "screen", "thumbnail");
+ Property.externalOpenChoise(
+ PREFIX_ + "RenditionClass",
+ "default",
+ "draft",
+ "low-res",
+ "proof",
+ "screen",
+ "thumbnail");
/**
- * Can be used to provide additional rendition parameters that
- * are too complex or verbose to encode in xmpMM:RenditionClass
+ * Can be used to provide additional rendition parameters that are too complex or verbose to
+ * encode in xmpMM:RenditionClass
*/
Property RENDITION_PARAMS = Property.externalText(PREFIX_ + "RenditionParams");
- /**
- * Instance id in the XMPMM's history section
- */
+ /** Instance id in the XMPMM's history section */
Property HISTORY_EVENT_INSTANCEID = Property.externalTextBag(PREFIX_ + "History:InstanceID");
- /**
- * Action in the XMPMM's history section
- */
+ /** Action in the XMPMM's history section */
Property HISTORY_ACTION = Property.externalTextBag(PREFIX_ + "History:Action");
- /**
- * When the action occurred in the XMPMM's history section
- */
+
+ /** When the action occurred in the XMPMM's history section */
Property HISTORY_WHEN = Property.externalTextBag(PREFIX_ + "History:When");
- /**
- * Software agent that created the action in the XMPMM's
- * history section
- */
+ /** Software agent that created the action in the XMPMM's history section */
Property HISTORY_SOFTWARE_AGENT = Property.externalTextBag(PREFIX_ + "History:SoftwareAgent");
- /**
- * Document id for the document that this document
- * was derived from
- */
+ /** Document id for the document that this document was derived from */
Property DERIVED_FROM_DOCUMENTID = Property.externalText(PREFIX_ + "DerivedFrom:DocumentID");
- /**
- * Instance id for the document instance that this
- * document was derived from
- */
+ /** Instance id for the document instance that this document was derived from */
Property DERIVED_FROM_INSTANCEID = Property.externalText(PREFIX_ + "DerivedFrom:InstanceID");
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java b/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java
index 6254dbf..5737c3b 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/XMPRights.java
@@ -22,12 +22,12 @@
/**
* XMP Rights management schema.
- * <p>
- * A collection of property constants for the
- * rights management properties defined in the XMP
+ *
+ * <p>A collection of property constants for the rights management properties defined in the XMP
* standard.
*
- * @see <a href="http://partners.adobe.com/public/developer/en/xmp/sdk/XMPspecification.pdf">XMP Photoshop</a>
+ * @see <a href="http://partners.adobe.com/public/developer/en/xmp/sdk/XMPspecification.pdf">XMP
+ * Photoshop</a>
* @since Apache Tika 1.2
*/
public interface XMPRights {
@@ -35,26 +35,19 @@
String NAMESPACE_URI_XMP_RIGHTS = "http://ns.adobe.com/xap/1.0/rights/";
String PREFIX_XMP_RIGHTS = "xmpRights";
- /**
- * The xmpRights prefix followed by the colon delimiter
- */
+ /** The xmpRights prefix followed by the colon delimiter */
String PREFIX_ = PREFIX_XMP_RIGHTS + ":";
- /**
- * A Web URL for a rights management certificate.
- */
+ /** A Web URL for a rights management certificate. */
Property CERTIFICATE = Property.internalText(PREFIX_ + "Certificate");
/**
- * When true, indicates that this is a rights-managed resource. When
- * false, indicates that this is a public-domain resource. Omit if the
- * state is unknown.
+ * When true, indicates that this is a rights-managed resource. When false, indicates that this
+ * is a public-domain resource. Omit if the state is unknown.
*/
Property MARKED = Property.internalBoolean(PREFIX_ + "Marked");
- /**
- * A list of legal owners of the resource.
- */
+ /** A list of legal owners of the resource. */
Property OWNER = Property.internalTextBag(PREFIX_ + "Owner");
/**
@@ -63,9 +56,6 @@
*/
Property USAGE_TERMS = Property.internalText(PREFIX_ + "UsageTerms");
- /**
- * A Web URL for a statement of the ownership and usage rights for this resource.
- */
+ /** A Web URL for a statement of the ownership and usage rights for this resource. */
Property WEB_STATEMENT = Property.internalText(PREFIX_ + "WebStatement");
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
index ca9b1e6..1159b53 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CaptureGroupMetadataFilter.java
@@ -20,7 +20,6 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -30,28 +29,25 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.StringUtils;
-
/**
- * This filter runs a regex against the first value in the "sourceField".
- * If the pattern matches, it extracts the first group of the first match and
- * set's the "targetField"'s value to that first group.
- * <p/>
- * If there is a match, this will overwrite whatever value is in the
- * "targetField".
- * <p/>
- * If there is not a match, this filter will be a no-op.
- * <p/>
- * If there are multiple matches, this filter will capture only the first.
- * Open a ticket if you need different behavior.
- * <p/>
- * If the source field has multiple values, this will run the regex
- * against only the first value.
- * <p/>
- * If the source field does not exist, this filter will be a no-op.
- * <p/>
- * If the target field is the same value as the source field, this filter
- * will overwrite the value in that field. Again, if there are multiple
- * values in that field, those will all be overwritten.
+ * This filter runs a regex against the first value in the "sourceField". If the pattern matches, it
+ * extracts the first group of the first match and sets the "targetField"'s value to that first
+ * group.
+ *
+ * <p>If there is a match, this will overwrite whatever value is in the "targetField".
+ *
+ * <p>If there is not a match, this filter will be a no-op.
+ *
+ * <p>If there are multiple matches, this filter will capture only the first. Open a ticket if you
+ * need different behavior.
+ *
+ * <p>If the source field has multiple values, this will run the regex against only the first value.
+ *
+ * <p>If the source field does not exist, this filter will be a no-op.
+ *
+ * <p>If the target field is the same value as the source field, this filter will overwrite the
+ * value in that field. Again, if there are multiple values in that field, those will all be
+ * overwritten.
*/
public class CaptureGroupMetadataFilter extends MetadataFilter implements Initializable {
@@ -94,7 +90,6 @@
} catch (PatternSyntaxException e) {
throw new TikaConfigException("Couldn't parse regex", e);
}
-
}
@Override
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
index f196436..e94fdef 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ClearByMimeMetadataFilter.java
@@ -19,16 +19,14 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
-
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
/**
- * This class clears the entire metadata object if the
- * mime matches the mime filter. The idea is that you might not want
- * to store/transmit metadata for images or specific file types.
+ * This class clears the entire metadata object if the mime matches the mime filter. The idea is
+ * that you might not want to store/transmit metadata for images or specific file types.
*/
public class ClearByMimeMetadataFilter extends MetadataFilter {
private final Set<String> mimes;
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
index 2c7d976..ed52548 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/CompositeMetadataFilter.java
@@ -17,7 +17,6 @@
package org.apache.tika.metadata.filter;
import java.util.List;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
index e093873..08ca6c2 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DateNormalizingMetadataFilter.java
@@ -22,32 +22,29 @@
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * Some dates in some file formats do not have a timezone.
- * Tika correctly stores these without a timezone, e.g. 'yyyy-MM-dd'T'HH:mm:ss'
- * This can be a problem if end points expect a 'Z' timezone.
- * This filter makes the assumption that dates without timezones are UTC
- * and always modifies the date to: "yyyy-MM-dd'T'HH:mm:ss'Z'"
+ * Some dates in some file formats do not have a timezone. Tika correctly stores these without a
+ * timezone, e.g. 'yyyy-MM-dd'T'HH:mm:ss' This can be a problem if end points expect a 'Z' timezone.
+ * This filter makes the assumption that dates without timezones are UTC and always modifies the
+ * date to: "yyyy-MM-dd'T'HH:mm:ss'Z'"
*
- * Users can specify an alternate defaultTimeZone with
- * {@link DateNormalizingMetadataFilter#setDefaultTimeZone(String)} to apply
- * if the file format does not specify a timezone.
- *
+ * <p>Users can specify an alternate defaultTimeZone with {@link
+ * DateNormalizingMetadataFilter#setDefaultTimeZone(String)} to apply if the file format does not
+ * specify a timezone.
*/
public class DateNormalizingMetadataFilter extends MetadataFilter {
private static TimeZone UTC = TimeZone.getTimeZone("UTC");
- private static final Logger LOGGER = LoggerFactory.getLogger(DateNormalizingMetadataFilter.class);
+ private static final Logger LOGGER =
+ LoggerFactory.getLogger(DateNormalizingMetadataFilter.class);
private TimeZone defaultTimeZone = UTC;
@@ -75,8 +72,8 @@
d = dateFormatter.parse(dateString);
metadata.set(property, utcFormatter.format(d));
} catch (ParseException e) {
- LOGGER.warn("Couldn't convert date to default time zone: >"
- + dateString + "<");
+ LOGGER.warn(
+ "Couldn't convert date to default time zone: >" + dateString + "<");
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java
index 64a7d0a..c52c2af 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/DefaultMetadataFilter.java
@@ -17,7 +17,6 @@
package org.apache.tika.metadata.filter;
import java.util.List;
-
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.utils.ServiceLoaderUtils;
@@ -36,7 +35,8 @@
}
private static List<MetadataFilter> getDefaultFilters(ServiceLoader loader) {
- List<MetadataFilter> metadataFilters = loader.loadStaticServiceProviders(MetadataFilter.class);
+ List<MetadataFilter> metadataFilters =
+ loader.loadStaticServiceProviders(MetadataFilter.class);
ServiceLoaderUtils.sortLoadedClasses(metadataFilters);
return metadataFilters;
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
index 59d10d9..3405558 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/ExcludeFieldMetadataFilter.java
@@ -19,7 +19,6 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
-
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java
index db16f5d..83fecfc 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/FieldNameMappingFilter.java
@@ -18,7 +18,6 @@
import java.util.LinkedHashMap;
import java.util.Map;
-
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -57,10 +56,9 @@
}
/**
- * If this is <code>true</code> (default), this means that only the fields that
- * have a "from" value in the mapper will be passed through. Otherwise,
- * this will pass through all keys/values and mutate the keys
- * that exist in the mappings.
+ * If this is <code>true</code> (default), this means that only the fields that have a "from"
+ * value in the mapper will be passed through. Otherwise, this will pass through all keys/values
+ * and mutate the keys that exist in the mappings.
*
* @param excludeUnmapped
*/
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java
index 2b65cb1..1b5edd1 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/GeoPointMetadataFilter.java
@@ -23,19 +23,19 @@
import org.apache.tika.utils.StringUtils;
/**
- * If {@link Metadata} contains a {@link TikaCoreProperties#LATITUDE} and
- * a {@link TikaCoreProperties#LONGITUDE}, this filter concatenates those with a
- * comma in the order LATITUDE,LONGITUDE.
+ * If {@link Metadata} contains a {@link TikaCoreProperties#LATITUDE} and a {@link
+ * TikaCoreProperties#LONGITUDE}, this filter concatenates those with a comma in the order
+ * LATITUDE,LONGITUDE.
*
- * If you need any other mappings, please open a ticket on our JIRA.
+ * <p>If you need any other mappings, please open a ticket on our JIRA.
*/
public class GeoPointMetadataFilter extends MetadataFilter {
String geoPointFieldName = "location";
/**
- * Set the field for the concatenated LATITUDE,LONGITUDE string.
- * The default if &dquot;location&dquot;
+ * Set the field for the concatenated LATITUDE,LONGITUDE string. The default is
+ * &quot;location&quot;.
*
* @param geoPointFieldName field name to use for the geopoint field
*/
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
index b75de6a..0543b43 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/IncludeFieldMetadataFilter.java
@@ -19,7 +19,6 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
-
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
index 21eb3ec..1ae2491 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/MetadataFilter.java
@@ -19,13 +19,11 @@
import java.io.IOException;
import java.io.Serializable;
-
-import org.w3c.dom.Element;
-
import org.apache.tika.config.ConfigBase;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.w3c.dom.Element;
/**
* Filters the metadata in place after the parse
@@ -36,16 +34,21 @@
/**
* Loads the metadata filter from the config file if it exists, otherwise returns NoOpFilter
+ *
* @param root
* @return
* @throws TikaConfigException
* @throws IOException
*/
- public static MetadataFilter load(Element root, boolean allowMissing) throws TikaConfigException,
- IOException {
+ public static MetadataFilter load(Element root, boolean allowMissing)
+ throws TikaConfigException, IOException {
try {
- return buildComposite("metadataFilters", CompositeMetadataFilter.class,
- "metadataFilter", MetadataFilter.class, root);
+ return buildComposite(
+ "metadataFilters",
+ CompositeMetadataFilter.class,
+ "metadataFilter",
+ MetadataFilter.class,
+ root);
} catch (TikaConfigException e) {
if (allowMissing && e.getMessage().contains("could not find metadataFilters")) {
return new NoOpFilter();
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
index d95472a..477d9c1 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/filter/NoOpFilter.java
@@ -19,16 +19,13 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-/**
- * This filter performs no operations on the metadata
- * and leaves it untouched.
- */
+/** This filter performs no operations on the metadata and leaves it untouched. */
public class NoOpFilter extends MetadataFilter {
public static final NoOpFilter NOOP_FILTER = new NoOpFilter();
@Override
public void filter(Metadata metadata) throws TikaException {
- //no op
+ // no op
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/package-info.java b/tika-core/src/main/java/org/apache/tika/metadata/package-info.java
index 02fcae3..3f2edbb 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * Multi-valued metadata container, and set of constant metadata fields.
- */
+/** Multi-valued metadata container, and set of constant metadata fields. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.metadata;
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java
index e03367c..565db73 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/MetadataWriteFilter.java
@@ -24,13 +24,13 @@
void filterExisting(Map<String, String[]> data);
/**
- * Based on the field and value, this filter modifies the field
- * and/or the value to something that should be added to the Metadata object.
+ * Based on the field and value, this filter modifies the field and/or the value to something
+ * that should be added to the Metadata object.
*
- * If the value is <code>null</code>, no value is set or added.
+ * <p>If the value is <code>null</code>, no value is set or added.
*
- * Status updates (e.g. write limit reached) can be added directly to the
- * underlying metadata.
+ * <p>Status updates (e.g. write limit reached) can be added directly to the underlying
+ * metadata.
*
* @param field
* @param value
@@ -40,9 +40,8 @@
void add(String field, String value, Map<String, String[]> data);
/**
- * Based on the field and the value, this filter modifies
- * the field and/or the value to something that should be set in the
- * Metadata object.
+ * Based on the field and the value, this filter modifies the field and/or the value to
+ * something that should be set in the Metadata object.
*
* @param field
* @param value
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
index f0e9f1f..78572bf 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilter.java
@@ -26,47 +26,40 @@
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
-
import org.apache.tika.metadata.AccessPermissions;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.utils.StringUtils;
/**
- * This is to be used to limit the amount of metadata that a
- * parser can add based on the {@link #maxTotalEstimatedSize},
- * {@link #maxFieldSize}, {@link #maxValuesPerField}, and
- * {@link #maxKeySize}. This can also be used to limit which
- * fields are stored in the metadata object at write-time
- * with {@link #includeFields}.
+ * This is to be used to limit the amount of metadata that a parser can add based on the {@link
+ * #maxTotalEstimatedSize}, {@link #maxFieldSize}, {@link #maxValuesPerField}, and {@link
+ * #maxKeySize}. This can also be used to limit which fields are stored in the metadata object at
+ * write-time with {@link #includeFields}.
*
- * All sizes are measured in UTF-16 bytes. The size is estimated
- * as a rough order of magnitude of what is
- * required to store the string in memory in Java. We recognize
- * that Java uses more bytes to store length, offset etc. for strings. But
- * the extra overhead varies by Java version and implementation,
- * and we just need a basic estimate. We also recognize actual
- * memory usage is affected by interning strings, etc.
- * Please forgive us ... or consider writing your own write filter. :)
+ * <p>All sizes are measured in UTF-16 bytes. The size is estimated as a rough order of magnitude of
+ * what is required to store the string in memory in Java. We recognize that Java uses more bytes to
+ * store length, offset etc. for strings. But the extra overhead varies by Java version and
+ * implementation, and we just need a basic estimate. We also recognize actual memory usage is
+ * affected by interning strings, etc. Please forgive us ... or consider writing your own write
+ * filter. :)
*
+ * <p><b>NOTE:</b> Fields in {@link #ALWAYS_SET_FIELDS} are always set no matter the current state
+ * of {@link #maxTotalEstimatedSize}. Except for {@link TikaCoreProperties#TIKA_CONTENT}, they are
+ * truncated at {@link #maxFieldSize}, and their sizes contribute to the {@link
+ * #maxTotalEstimatedSize}.
*
- * <b>NOTE:</b> Fields in {@link #ALWAYS_SET_FIELDS} are
- * always set no matter the current state of {@link #maxTotalEstimatedSize}.
- * Except for {@link TikaCoreProperties#TIKA_CONTENT}, they are truncated at
- * {@link #maxFieldSize}, and their sizes contribute to the {@link #maxTotalEstimatedSize}.
+ * <p><b>NOTE:</b> Fields in {@link #ALWAYS_ADD_FIELDS} are always added no matter the current state
+ * of {@link #maxTotalEstimatedSize}. Except for {@link TikaCoreProperties#TIKA_CONTENT}, each
+ * addition is truncated at {@link #maxFieldSize}, and their sizes contribute to the {@link
+ * #maxTotalEstimatedSize}.
*
- * <b>NOTE:</b> Fields in {@link #ALWAYS_ADD_FIELDS} are
- * always added no matter the current state of {@link #maxTotalEstimatedSize}.
- * Except for {@link TikaCoreProperties#TIKA_CONTENT}, each addition is truncated at
- * {@link #maxFieldSize}, and their sizes contribute to the {@link #maxTotalEstimatedSize}.
+ * <p>This class uses {@link #minimumMaxFieldSizeInAlwaysFields} to protect the {@link
+ * #ALWAYS_ADD_FIELDS} and {@link #ALWAYS_SET_FIELDS}. If we didn't have this and a user sets the
+ * {@link #maxFieldSize} to, say, 10 bytes, the internal parser behavior would be broken because
+ * parsers rely on {@link Metadata#CONTENT_TYPE} to determine which parser to call.
*
- * This class {@link #minimumMaxFieldSizeInAlwaysFields} to protect the
- * {@link #ALWAYS_ADD_FIELDS} and {@link #ALWAYS_SET_FIELDS}. If we didn't
- * have this and a user sets the {@link #maxFieldSize} to, say, 10 bytes,
- * the internal parser behavior would be broken because parsers rely on
- * {@link Metadata#CONTENT_TYPE} to determine which parser to call.
- *
- * <b>NOTE:</b> as with {@link Metadata}, this object is not thread safe.
+ * <p><b>NOTE:</b> as with {@link Metadata}, this object is not thread safe.
*/
public class StandardWriteFilter implements MetadataWriteFilter, Serializable {
@@ -87,7 +80,7 @@
ALWAYS_SET_FIELDS.add(Metadata.CONTENT_DISPOSITION);
ALWAYS_SET_FIELDS.add(TikaCoreProperties.CONTAINER_EXCEPTION.getName());
ALWAYS_SET_FIELDS.add(TikaCoreProperties.EMBEDDED_EXCEPTION.getName());
- //Metadata.CONTENT_LOCATION? used by the html parser
+ // Metadata.CONTENT_LOCATION? used by the html parser
}
static {
@@ -97,41 +90,42 @@
private static final String METADATA_TRUNCATED_KEY =
TikaCoreProperties.TRUNCATED_METADATA.getName();
private static final String TIKA_CONTENT_KEY = TikaCoreProperties.TIKA_CONTENT.getName();
- private static final String[] TRUE = new String[]{"true"};
+ private static final String[] TRUE = new String[] {"true"};
- //allow at least these many bytes in the "always" fields.
- //As of 2022-03, the longest mime is 146. Doubling that gives
- //us some leeway. If a mime is truncated, bad things will happen.
+ // allow at least these many bytes in the "always" fields.
+ // As of 2022-03, the longest mime is 146. Doubling that gives
+ // us some leeway. If a mime is truncated, bad things will happen.
private final int minimumMaxFieldSizeInAlwaysFields = 300;
-
private final boolean includeEmpty;
private final int maxTotalEstimatedSize;
private final int maxValuesPerField;
private final int maxFieldSize;
private final int maxKeySize;
-
private final Set<String> includeFields;
private Map<String, Integer> fieldSizes = new HashMap<>();
- //tracks the estimated size in utf16 bytes. Can be > maxEstimated size
+ // tracks the estimated size in utf16 bytes. Can be > maxEstimated size
int estimatedSize = 0;
/**
- * @param maxKeySize maximum key size in UTF-16 bytes-- keys will be truncated to this
- * length; if less than 0, keys will not be truncated
+ * @param maxKeySize maximum key size in UTF-16 bytes-- keys will be truncated to this length;
+ * if less than 0, keys will not be truncated
* @param maxEstimatedSize
- * @param includeFields if null or empty, all fields are included; otherwise, which fields
- * to add to the metadata object.
- * @param includeEmpty if <code>true</code>, this will set or add an empty value to the
- * metadata object.
+ * @param includeFields if null or empty, all fields are included; otherwise, which fields to
+ * add to the metadata object.
+ * @param includeEmpty if <code>true</code>, this will set or add an empty value to the metadata
+ * object.
*/
- protected StandardWriteFilter(int maxKeySize, int maxFieldSize, int maxEstimatedSize,
- int maxValuesPerField,
- Set<String> includeFields,
- boolean includeEmpty) {
+ protected StandardWriteFilter(
+ int maxKeySize,
+ int maxFieldSize,
+ int maxEstimatedSize,
+ int maxValuesPerField,
+ Set<String> includeFields,
+ boolean includeEmpty) {
this.maxKeySize = maxKeySize;
this.maxFieldSize = maxFieldSize;
@@ -143,16 +137,16 @@
@Override
public void filterExisting(Map<String, String[]> data) {
- //this is somewhat costly, but it ensures that
- //metadata that was placed in the metadata object before this
- //filter was applied is removed.
- //It should only be called once, and probably not on that
- //many fields.
+ // this is somewhat costly, but it ensures that
+ // metadata that was placed in the metadata object before this
+ // filter was applied is removed.
+ // It should only be called once, and probably not on that
+ // many fields.
Map<String, String[]> tmp = new HashMap<>();
for (Map.Entry<String, String[]> e : data.entrySet()) {
String name = e.getKey();
String[] vals = e.getValue();
- if (! includeField(name)) {
+ if (!includeField(name)) {
continue;
}
for (int i = 0; i < vals.length; i++) {
@@ -166,10 +160,9 @@
data.putAll(tmp);
}
-
@Override
public void set(String field, String value, Map<String, String[]> data) {
- if (! include(field, value)) {
+ if (!include(field, value)) {
return;
}
if (ALWAYS_SET_FIELDS.contains(field) || ALWAYS_ADD_FIELDS.contains(field)) {
@@ -182,12 +175,12 @@
private void setAlwaysInclude(String field, String value, Map<String, String[]> data) {
if (TIKA_CONTENT_KEY.equals(field)) {
- data.put(field, new String[]{ value });
+ data.put(field, new String[] {value});
return;
}
int sizeToAdd = estimateSize(value);
- //if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax
- //we do not want to truncate a mime!
+ // if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax
+ // we do not want to truncate a mime!
int alwaysMaxFieldLength = Math.max(minimumMaxFieldSizeInAlwaysFields, maxFieldSize);
String toSet = value;
if (sizeToAdd > alwaysMaxFieldLength) {
@@ -198,29 +191,29 @@
totalAdded += sizeToAdd;
if (data.containsKey(field)) {
String[] vals = data.get(field);
- //this should only ever be single valued!!!
+ // this should only ever be single valued!!!
if (vals.length > 0) {
totalAdded -= estimateSize(vals[0]);
}
}
estimatedSize += totalAdded;
- data.put(field, new String[]{toSet});
+ data.put(field, new String[] {toSet});
}
private void addAlwaysInclude(String field, String value, Map<String, String[]> data) {
if (TIKA_CONTENT_KEY.equals(field)) {
- data.put(field, new String[]{ value });
+ data.put(field, new String[] {value});
return;
}
- if (! data.containsKey(field)) {
+ if (!data.containsKey(field)) {
setAlwaysInclude(field, value, data);
return;
}
- //TODO: should we limit the number of field values?
+ // TODO: should we limit the number of field values?
int toAddSize = estimateSize(value);
- //if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax
- //we do not want to truncate a mime!
+ // if the maxFieldSize is < minimumMaxFieldSizeInAlwaysFields, use the minmax
+ // we do not want to truncate a mime!
int alwaysMaxFieldLength = Math.max(minimumMaxFieldSizeInAlwaysFields, maxFieldSize);
String toAddValue = value;
if (toAddSize > alwaysMaxFieldLength) {
@@ -234,29 +227,27 @@
data.put(field, appendValue(data.get(field), toAddValue));
}
-
- //calculate the max field length allowed if we are
- //setting a value
+ // calculate the max field length allowed if we are
+ // setting a value
private int maxAllowedToSet(StringSizePair filterKey) {
Integer existingSizeInt = fieldSizes.get(filterKey.string);
int existingSize = existingSizeInt == null ? 0 : existingSizeInt;
- //this is how much is allowed by the overall total limit
+ // this is how much is allowed by the overall total limit
int allowedByMaxTotal = maxTotalEstimatedSize - estimatedSize;
- //if we're overwriting a value, that value's data size is now available
+ // if we're overwriting a value, that value's data size is now available
allowedByMaxTotal += existingSize;
- //if we're adding a key, we need to subtract that value
+ // if we're adding a key, we need to subtract that value
allowedByMaxTotal -= existingSizeInt == null ? filterKey.size : 0;
return Math.min(maxFieldSize, allowedByMaxTotal);
}
-
@Override
public void add(String field, String value, Map<String, String[]> data) {
- if (! include(field, value)) {
+ if (!include(field, value)) {
return;
}
if (ALWAYS_SET_FIELDS.contains(field)) {
@@ -267,7 +258,7 @@
return;
}
StringSizePair filterKey = filterKey(field, value, data);
- if (! data.containsKey(filterKey.string)) {
+ if (!data.containsKey(filterKey.string)) {
setFilterKey(filterKey, value, data);
return;
}
@@ -298,16 +289,16 @@
int addedOverall = valueLength;
if (fieldSizeInteger == null) {
- //if there was no value before, we're adding
- //a key. If there was a value before, do not
- //add the key length.
+ // if there was no value before, we're adding
+ // a key. If there was a value before, do not
+ // add the key length.
addedOverall += filterKey.size;
}
estimatedSize += addedOverall;
fieldSizes.put(filterKey.string, valueLength + fieldSize);
- data.put(filterKey.string, appendValue(data.get(filterKey.string), toAdd ));
+ data.put(filterKey.string, appendValue(data.get(filterKey.string), toAdd));
}
private String[] appendValue(String[] values, final String value) {
@@ -320,28 +311,27 @@
return newValues;
}
- //calculate the max field length allowed if we are
- //adding a value
+ // calculate the max field length allowed if we are
+ // adding a value
private int maxAllowedToAdd(StringSizePair filterKey) {
Integer existingSizeInt = fieldSizes.get(filterKey.string);
int existingSize = existingSizeInt == null ? 0 : existingSizeInt;
- //how much can we add to this field
+ // how much can we add to this field
int allowedByMaxField = maxFieldSize - existingSize;
- //this is how much is allowed by the overall total limit
+ // this is how much is allowed by the overall total limit
int allowedByMaxTotal = maxTotalEstimatedSize - estimatedSize - 1;
- //if we're adding a new key, we need to subtract that value
+ // if we're adding a new key, we need to subtract that value
allowedByMaxTotal -= existingSizeInt == null ? filterKey.size : 0;
return Math.min(allowedByMaxField, allowedByMaxTotal);
}
- private void setFilterKey(StringSizePair filterKey, String value,
- Map<String, String[]> data) {
- //if you can't even add the key, give up now
- if (! data.containsKey(filterKey.string) &&
- (filterKey.size + estimatedSize > maxTotalEstimatedSize)) {
+ private void setFilterKey(StringSizePair filterKey, String value, Map<String, String[]> data) {
+ // if you can't even add the key, give up now
+ if (!data.containsKey(filterKey.string)
+ && (filterKey.size + estimatedSize > maxTotalEstimatedSize)) {
setTruncated(data);
return;
}
@@ -365,9 +355,9 @@
int addedOverall = 0;
if (fieldSizeInteger == null) {
- //if there was no value before, we're adding
- //a key. If there was a value before, do not
- //add the key length.
+ // if there was no value before, we're adding
+ // a key. If there was a value before, do not
+ // add the key length.
addedOverall += filterKey.size;
}
addedOverall += valueLength - fieldSize;
@@ -375,8 +365,7 @@
fieldSizes.put(filterKey.string, valueLength);
- data.put(filterKey.string, new String[]{ toSet });
-
+ data.put(filterKey.string, new String[] {toSet});
}
private void setTruncated(Map<String, String[]> data) {
@@ -390,15 +379,13 @@
}
String toWrite = truncate(field, maxKeySize, data);
- return new StringSizePair(toWrite,
- estimateSize(toWrite),
- true);
+ return new StringSizePair(toWrite, estimateSize(toWrite), true);
}
private String truncate(String value, int length, Map<String, String[]> data) {
setTruncated(data);
- //correctly handle multibyte characters
+ // correctly handle multibyte characters
byte[] bytes = value.getBytes(StandardCharsets.UTF_16BE);
ByteBuffer bb = ByteBuffer.wrap(bytes, 0, length);
CharBuffer cb = CharBuffer.allocate(length);
@@ -416,6 +403,7 @@
/**
* Tests for null or empty. Does not check for length
+ *
* @param value
* @return
*/
@@ -433,8 +421,7 @@
if (ALWAYS_SET_FIELDS.contains(name)) {
return true;
}
- if (includeFields == null ||
- includeFields.contains(name)) {
+ if (includeFields == null || includeFields.contains(name)) {
return true;
}
return false;
@@ -446,7 +433,7 @@
private static class StringSizePair {
final String string;
- final int size;//utf-16 bytes -- estimated
+ final int size; // utf-16 bytes -- estimated
final boolean truncated;
public StringSizePair(String string, int size, boolean truncated) {
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
index b7d60b5..397c238 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/writefilter/StandardWriteFilterFactory.java
@@ -22,12 +22,11 @@
import java.util.concurrent.ConcurrentHashMap;
/**
- * Factory class for {@link StandardWriteFilter}. See that class
- * for how the estimated sizes are calculated on Strings.
+ * Factory class for {@link StandardWriteFilter}. See that class for how the estimated sizes are
+ * calculated on Strings.
*/
public class StandardWriteFilterFactory implements MetadataWriteFilterFactory {
-
public static int DEFAULT_MAX_KEY_SIZE = 1024;
public static int DEFAULT_MAX_FIELD_SIZE = 100 * 1024;
public static int DEFAULT_TOTAL_ESTIMATED_BYTES = 10 * 1024 * 1024;
@@ -54,8 +53,13 @@
throw new IllegalArgumentException("max estimated size must be > 0");
}
- return new StandardWriteFilter(maxKeySize, maxFieldSize,
- maxTotalEstimatedBytes, maxValuesPerField, includeFields, includeEmpty);
+ return new StandardWriteFilter(
+ maxKeySize,
+ maxFieldSize,
+ maxTotalEstimatedBytes,
+ maxValuesPerField,
+ includeFields,
+ includeEmpty);
}
public void setIncludeFields(List<String> includeFields) {
@@ -110,9 +114,19 @@
@Override
public String toString() {
- return "StandardWriteFilterFactory{" + "includeFields=" + includeFields + ", maxKeySize=" +
- maxKeySize + ", maxFieldSize=" + maxFieldSize + ", maxTotalEstimatedBytes=" +
- maxTotalEstimatedBytes + ", maxValuesPerField=" + maxValuesPerField +
- ", includeEmpty=" + includeEmpty + '}';
+ return "StandardWriteFilterFactory{"
+ + "includeFields="
+ + includeFields
+ + ", maxKeySize="
+ + maxKeySize
+ + ", maxFieldSize="
+ + maxFieldSize
+ + ", maxTotalEstimatedBytes="
+ + maxTotalEstimatedBytes
+ + ", maxValuesPerField="
+ + maxValuesPerField
+ + ", includeEmpty="
+ + includeEmpty
+ + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/AndClause.java b/tika-core/src/main/java/org/apache/tika/mime/AndClause.java
index c7a2184..9fcf670 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/AndClause.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/AndClause.java
@@ -46,5 +46,4 @@
public String toString() {
return "and" + Arrays.toString(clauses);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/Clause.java b/tika-core/src/main/java/org/apache/tika/mime/Clause.java
index fc3bcc1..cfc7701 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/Clause.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/Clause.java
@@ -18,20 +18,15 @@
import java.io.Serializable;
-/**
- * Defines a clause to be evaluated.
- */
+/** Defines a clause to be evaluated. */
interface Clause extends Serializable {
- /**
- * Evaluates this clause with the specified chunk of data.
- */
+ /** Evaluates this clause with the specified chunk of data. */
boolean eval(byte[] data);
/**
- * Returns the size of this clause. The size of a clause is the number of
- * chars it is composed of.
+ * Returns the size of this clause. The size of a clause is the number of chars it is composed
+ * of.
*/
int size();
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java b/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java
index 1ba53fe..961bfd9 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/HexCoDec.java
@@ -16,13 +16,12 @@
*/
package org.apache.tika.mime;
-/**
- * A set of Hex encoding and decoding utility methods.
- */
+/** A set of Hex encoding and decoding utility methods. */
public class HexCoDec {
- private static final char[] HEX_CHARS =
- {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};
+ private static final char[] HEX_CHARS = {
+ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'
+ };
/**
* Decode a hex string
@@ -47,9 +46,9 @@
/**
* Decode an array of hex chars.
*
- * @param hexChars an array of hex characters.
+ * @param hexChars an array of hex characters.
* @param startIndex the index of the first character to decode
- * @param length the number of characters to decode.
+ * @param length the number of characters to decode.
* @return the decode hex chars as bytes.
*/
public static byte[] decode(char[] hexChars, int startIndex, int length) {
@@ -59,8 +58,10 @@
byte[] result = new byte[length / 2];
for (int j = 0; j < result.length; j++) {
- result[j] = (byte) (hexCharToNibble(hexChars[startIndex++]) * 16 +
- hexCharToNibble(hexChars[startIndex++]));
+ result[j] =
+ (byte)
+ (hexCharToNibble(hexChars[startIndex++]) * 16
+ + hexCharToNibble(hexChars[startIndex++]));
}
return result;
}
@@ -78,9 +79,9 @@
/**
* Hex encode an array of bytes
*
- * @param bites the array of bytes to encode.
+ * @param bites the array of bytes to encode.
* @param startIndex the index of the first character to encode.
- * @param length the number of characters to encode.
+ * @param length the number of characters to encode.
* @return the array of hex characters.
*/
public static char[] encode(byte[] bites, int startIndex, int length) {
@@ -93,9 +94,7 @@
return result;
}
- /**
- * Internal method to turn a hex char into a nibble.
- */
+ /** Internal method to turn a hex char into a nibble. */
private static int hexCharToNibble(char ch) {
if ((ch >= '0') && (ch <= '9')) {
return ch - '0';
@@ -107,5 +106,4 @@
throw new IllegalArgumentException("Not a hex char - '" + ch + "'");
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/Magic.java b/tika-core/src/main/java/org/apache/tika/mime/Magic.java
index 9c3b3ad..1254bd5 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/Magic.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/Magic.java
@@ -16,10 +16,7 @@
*/
package org.apache.tika.mime;
-/**
- * Defines a magic for a MimeType. A magic is made of one or several
- * MagicClause.
- */
+/** Defines a magic for a MimeType. A magic is made of one or several MagicClause. */
class Magic implements Clause, Comparable<Magic> {
private final MimeType type;
@@ -82,5 +79,4 @@
public int hashCode() {
return type.hashCode() ^ string.hashCode();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java b/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java
index afa0de9..f1c5a83 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MagicMatch.java
@@ -17,15 +17,11 @@
package org.apache.tika.mime;
import java.io.IOException;
-
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
-
import org.apache.tika.detect.MagicDetector;
import org.apache.tika.metadata.Metadata;
-/**
- * Defines a magic match.
- */
+/** Defines a magic match. */
class MagicMatch implements Clause {
private final MediaType mediaType;
@@ -57,8 +53,9 @@
public boolean eval(byte[] data) {
try {
- return getDetector().detect(new UnsynchronizedByteArrayInputStream(data), new Metadata()) !=
- MediaType.OCTET_STREAM;
+ return getDetector()
+ .detect(new UnsynchronizedByteArrayInputStream(data), new Metadata())
+ != MediaType.OCTET_STREAM;
} catch (IOException e) {
// Should never happen with a ByteArrayInputStream
return false;
@@ -72,5 +69,4 @@
public String toString() {
return mediaType.toString() + " " + type + " " + offset + " " + value + " " + mask;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
index 13ad6ed..44a03c1 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MediaType.java
@@ -29,14 +29,10 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-/**
- * Internet media type.
- */
+/** Internet media type. */
public final class MediaType implements Comparable<MediaType>, Serializable {
- /**
- * Serial version UID.
- */
+ /** Serial version UID. */
private static final long serialVersionUID = -3831000556189036392L;
private static final Pattern SPECIAL = Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=]");
@@ -44,23 +40,24 @@
private static final Pattern SPECIAL_OR_WHITESPACE =
Pattern.compile("[\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]");
- /**
- * See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters.
- */
+ /** See http://www.ietf.org/rfc/rfc2045.txt for valid mime-type characters. */
private static final String VALID_CHARS = "([^\\c\\(\\)<>@,;:\\\\\"/\\[\\]\\?=\\s]+)";
private static final Pattern TYPE_PATTERN =
Pattern.compile("(?s)\\s*" + VALID_CHARS + "\\s*/\\s*" + VALID_CHARS + "\\s*($|;.*)");
// TIKA-350: handle charset as first element in content-type
- private static final Pattern CHARSET_FIRST_PATTERN = Pattern.compile(
- "(?is)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*" + VALID_CHARS + "\\s*/\\s*" +
- VALID_CHARS + "\\s*");
+ private static final Pattern CHARSET_FIRST_PATTERN =
+ Pattern.compile(
+ "(?is)\\s*(charset\\s*=\\s*[^\\c;\\s]+)\\s*;\\s*"
+ + VALID_CHARS
+ + "\\s*/\\s*"
+ + VALID_CHARS
+ + "\\s*");
/**
- * Set of basic types with normalized "type/subtype" names.
- * Used to optimize type lookup and to avoid having too many
- * {@link MediaType} instances in memory.
+ * Set of basic types with normalized "type/subtype" names. Used to optimize type lookup and to
+ * avoid having too many {@link MediaType} instances in memory.
*/
private static final Map<String, MediaType> SIMPLE_TYPES = new HashMap<>();
@@ -75,24 +72,22 @@
public static final MediaType APPLICATION_XML = parse("application/xml");
public static final MediaType APPLICATION_ZIP = parse("application/zip");
- /**
- * Canonical string representation of this media type.
- */
+
+ /** Canonical string representation of this media type. */
private final String string;
+
/**
- * Location of the "/" character separating the type and the subtype
- * tokens in {@link #string}.
+ * Location of the "/" character separating the type and the subtype tokens in {@link #string}.
*/
private final int slash;
+
/**
- * Location of the first ";" character separating the type part of
- * {@link #string} from possible parameters. Length of {@link #string}
- * in case there are no parameters.
+ * Location of the first ";" character separating the type part of {@link #string} from possible
+ * parameters. Length of {@link #string} in case there are no parameters.
*/
private final int semicolon;
- /**
- * Immutable sorted map of media type parameters.
- */
+
+ /** Immutable sorted map of media type parameters. */
private final Map<String, String> parameters;
public MediaType(String type, String subtype, Map<String, String> parameters) {
@@ -157,8 +152,8 @@
/**
* Creates a media type by adding a parameter to a base type.
*
- * @param type base type
- * @param name parameter name
+ * @param type base type
+ * @param name parameter name
* @param value parameter value
* @since Apache Tika 1.2
*/
@@ -169,7 +164,7 @@
/**
* Creates a media type by adding the "charset" parameter to a base type.
*
- * @param type base type
+ * @param type base type
* @param charset charset value
* @since Apache Tika 1.2
*/
@@ -198,8 +193,7 @@
}
/**
- * Convenience method that returns an unmodifiable set that contains
- * all the given media types.
+ * Convenience method that returns an unmodifiable set that contains all the given media types.
*
* @param types media types
* @return unmodifiable set of the given types
@@ -216,8 +210,8 @@
}
/**
- * Convenience method that parses the given media type strings and
- * returns an unmodifiable set that contains all the parsed types.
+ * Convenience method that parses the given media type strings and returns an unmodifiable set
+ * that contains all the parsed types.
*
* @param types media type strings
* @return unmodifiable set of the parsed types
@@ -235,10 +229,9 @@
}
/**
- * Parses the given string to a media type. The string is expected
- * to be of the form "type/subtype(; parameter=...)*" as defined in
- * RFC 2045, though we also handle "charset=xxx; type/subtype" for
- * broken web servers.
+ * Parses the given string to a media type. The string is expected to be of the form
+ * "type/subtype(; parameter=...)*" as defined in RFC 2045, though we also handle "charset=xxx;
+ * type/subtype" for broken web servers.
*
* @param string media type string to be parsed
* @return parsed media type, or <code>null</code> if parsing fails
@@ -255,9 +248,9 @@
int slash = string.indexOf('/');
if (slash == -1) {
return null;
- } else if (SIMPLE_TYPES.size() < 10000 &&
- isSimpleName(string.substring(0, slash)) &&
- isSimpleName(string.substring(slash + 1))) {
+ } else if (SIMPLE_TYPES.size() < 10000
+ && isSimpleName(string.substring(0, slash))
+ && isSimpleName(string.substring(slash + 1))) {
type = new MediaType(string, slash);
SIMPLE_TYPES.put(string, type);
}
@@ -270,13 +263,13 @@
Matcher matcher;
matcher = TYPE_PATTERN.matcher(string);
if (matcher.matches()) {
- return new MediaType(matcher.group(1), matcher.group(2),
- parseParameters(matcher.group(3)));
+ return new MediaType(
+ matcher.group(1), matcher.group(2), parseParameters(matcher.group(3)));
}
matcher = CHARSET_FIRST_PATTERN.matcher(string);
if (matcher.matches()) {
- return new MediaType(matcher.group(2), matcher.group(3),
- parseParameters(matcher.group(1)));
+ return new MediaType(
+ matcher.group(2), matcher.group(3), parseParameters(matcher.group(1)));
}
return null;
@@ -285,8 +278,12 @@
private static boolean isSimpleName(String name) {
for (int i = 0; i < name.length(); i++) {
char c = name.charAt(i);
- if (c != '-' && c != '+' && c != '.' && c != '_' && !('0' <= c && c <= '9') &&
- !('a' <= c && c <= 'z')) {
+ if (c != '-'
+ && c != '+'
+ && c != '.'
+ && c != '_'
+ && !('0' <= c && c <= '9')
+ && !('a' <= c && c <= 'z')) {
return false;
}
}
@@ -329,8 +326,7 @@
}
/**
- * Fuzzy unquoting mechanism that works also with somewhat malformed
- * quotes.
+ * Fuzzy unquoting mechanism that works also with somewhat malformed quotes.
*
* @param s string to unquote
* @return unquoted string
@@ -359,8 +355,7 @@
}
/**
- * Returns the base form of the MediaType, excluding
- * any parameters, such as "text/plain" for
+ * Returns the base form of the MediaType, excluding any parameters, such as "text/plain" for
* "text/plain; charset=utf-8"
*/
public MediaType getBaseType() {
@@ -371,18 +366,12 @@
}
}
- /**
- * Return the Type of the MediaType, such as
- * "text" for "text/plain"
- */
+ /** Return the Type of the MediaType, such as "text" for "text/plain" */
public String getType() {
return string.substring(0, slash);
}
- /**
- * Return the Sub-Type of the MediaType,
- * such as "plain" for "text/plain"
- */
+ /** Return the Sub-Type of the MediaType, such as "plain" for "text/plain" */
public String getSubtype() {
return string.substring(slash + 1, semicolon);
}
@@ -390,8 +379,8 @@
/**
* Checks whether this media type contains parameters.
*
- * @return <code>true</code> if this type has one or more parameters,
- * <code>false</code> otherwise
+ * @return <code>true</code> if this type has one or more parameters, <code>false</code>
+ * otherwise
* @since Apache Tika 0.8
*/
public boolean hasParameters() {
@@ -399,8 +388,8 @@
}
/**
- * Returns an immutable sorted map of the parameters of this media type.
- * The parameter names are guaranteed to be trimmed and in lower case.
+ * Returns an immutable sorted map of the parameters of this media type. The parameter names are
+ * guaranteed to be trimmed and in lower case.
*
* @return sorted map of parameters
*/
@@ -428,5 +417,4 @@
public int compareTo(MediaType that) {
return string.compareTo(that.string);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
index ac5b3ad..93a3f54 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
@@ -23,24 +23,22 @@
import java.util.TreeSet;
import java.util.concurrent.ConcurrentHashMap;
-/**
- * Registry of known Internet media types.
- */
+/** Registry of known Internet media types. */
public class MediaTypeRegistry implements Serializable {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 4710974869988895410L;
+
/**
- * Registry of known media types, including type aliases. A canonical
- * media type is handled as an identity mapping, while an alias is stored
- * as a mapping from the alias to the corresponding canonical type.
+ * Registry of known media types, including type aliases. A canonical media type is handled as
+ * an identity mapping, while an alias is stored as a mapping from the alias to the
+ * corresponding canonical type.
*/
private final Map<MediaType, MediaType> registry = new ConcurrentHashMap<>();
+
/**
- * Known type inheritance relationships. The mapping is from a media type
- * to the closest supertype.
+ * Known type inheritance relationships. The mapping is from a media type to the closest
+ * supertype.
*/
private final Map<MediaType, MediaType> inheritance = new HashMap<>();
@@ -55,8 +53,8 @@
}
/**
- * Returns the set of all known canonical media types. Type aliases are
- * not included in the returned set.
+ * Returns the set of all known canonical media types. Type aliases are not included in the
+ * returned set.
*
* @return canonical media types
* @since Apache Tika 0.8
@@ -126,13 +124,12 @@
}
/**
- * Checks whether the given media type a is a specialization of a more
- * generic type b. Both types should be already normalised.
+ * Checks whether the given media type a is a specialization of a more generic type b. Both
+ * types should be already normalised.
*
* @param a media type, normalised
* @param b suspected supertype, normalised
- * @return <code>true</code> if b is a supertype of a,
- * <code>false</code> otherwise
+ * @return <code>true</code> if b is a supertype of a, <code>false</code> otherwise
* @since Apache Tika 0.8
*/
public boolean isSpecializationOf(MediaType a, MediaType b) {
@@ -140,13 +137,13 @@
}
/**
- * Checks whether the given media type equals the given base type or
- * is a specialization of it. Both types should be already normalised.
+ * Checks whether the given media type equals the given base type or is a specialization of it.
+ * Both types should be already normalised.
*
* @param a media type, normalised
* @param b base type, normalised
- * @return <code>true</code> if b equals a or is a specialization of it,
- * <code>false</code> otherwise
+ * @return <code>true</code> if b equals a or is a specialization of it, <code>false</code>
+ * otherwise
* @since Apache Tika 1.2
*/
public boolean isInstanceOf(MediaType a, MediaType b) {
@@ -154,14 +151,14 @@
}
/**
- * Parses and normalises the given media type string and checks whether
- * the result equals the given base type or is a specialization of it.
- * The given base type should already be normalised.
+ * Parses and normalises the given media type string and checks whether the result equals the
+ * given base type or is a specialization of it. The given base type should already be
+ * normalised.
*
* @param a media type
* @param b base type, normalised
- * @return <code>true</code> if b equals a or is a specialization of it,
- * <code>false</code> otherwise
+ * @return <code>true</code> if b equals a or is a specialization of it, <code>false</code>
+ * otherwise
* @since Apache Tika 1.2
*/
public boolean isInstanceOf(String a, MediaType b) {
@@ -169,14 +166,12 @@
}
/**
- * Returns the supertype of the given type. If the media type database
- * has an explicit inheritance rule for the type, then that is used.
- * Next, if the given type has any parameters, then the respective base
- * type (parameter-less) is returned. Otherwise built-in heuristics like
- * text/... -> text/plain and .../...+xml -> application/xml are used.
- * Finally application/octet-stream is returned for all types for which no other
- * supertype is known, and the return value for application/octet-stream
- * is <code>null</code>.
+ * Returns the supertype of the given type. If the media type database has an explicit
+ * inheritance rule for the type, then that is used. Next, if the given type has any parameters,
+ * then the respective base type (parameter-less) is returned. Otherwise built-in heuristics
+ * like text/... -> text/plain and .../...+xml -> application/xml are used. Finally
+ * application/octet-stream is returned for all types for which no other supertype is known, and
+ * the return value for application/octet-stream is <code>null</code>.
*
* @param type media type
* @return supertype, or <code>null</code> for application/octet-stream
@@ -203,5 +198,4 @@
return null;
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
index 8dc3ddb..8522dcc 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeType.java
@@ -22,69 +22,49 @@
import java.util.Collections;
import java.util.List;
-/**
- * Internet media type.
- */
+/** Internet media type. */
public final class MimeType implements Comparable<MimeType>, Serializable {
- /**
- * Serial version UID.
- */
+ /** Serial version UID. */
private static final long serialVersionUID = 4357830439860729201L;
- /**
- * The normalized media type name.
- */
+
+ /** The normalized media type name. */
private final MediaType type;
- /**
- * The minimum length of data to provides for magic analyzis
- */
+
+ /** The minimum length of data to provide for magic analysis */
private final int minLength = 0;
- /**
- * The MimeType acronym
- */
+
+ /** The MimeType acronym */
private String acronym = "";
- /**
- * The http://en.wikipedia.org/wiki/Uniform_Type_Identifier
- */
+ /** The http://en.wikipedia.org/wiki/Uniform_Type_Identifier */
private String uti = "";
- /**
- * Documentation Links
- */
+ /** Documentation Links */
private List<URI> links = Collections.emptyList();
- /**
- * Description of this media type.
- */
+ /** Description of this media type. */
private String description = "";
- /**
- * The magics associated to this Mime-Type
- */
+ /** The magics associated to this Mime-Type */
private List<Magic> magics = null;
- /**
- * The root-XML associated to this Mime-Type
- */
+ /** The root-XML associated to this Mime-Type */
private List<RootXML> rootXML = null;
- /**
- * All known file extensions of this type, in order of preference
- * (best first).
- */
+
+ /** All known file extensions of this type, in order of preference (best first). */
private List<String> extensions = null;
+
/**
- * Whether this mime-type is used for server-side scripts,
- * and thus cannot reliably be used for filename-based type detection
+ * Whether this mime-type is used for server-side scripts, and thus cannot reliably be used for
+ * filename-based type detection
*/
private boolean isInterpreted = false;
/**
- * Creates a media type with the give name and containing media type
- * registry. The name is expected to be valid and normalized to lower
- * case. This constructor should only be called by
- * {@link MimeTypes#forName(String)} to keep the media type registry
- * up to date.
+ * Creates a media type with the given name and containing media type registry. The name is
+ * expected to be valid and normalized to lower case. This constructor should only be called by
+ * {@link MimeTypes#forName(String)} to keep the media type registry up to date.
*
* @param type normalized media type name
*/
@@ -96,9 +76,9 @@
}
/**
- * Checks that the given string is a valid Internet media type name
- * based on rules from RFC 2054 section 5.3. For validation purposes the
- * rules can be simplified to the following:
+ * Checks that the given string is a valid Internet media type name based on rules from RFC 2045
+ * section 5.3. For validation purposes the rules can be simplified to the following:
+ *
* <pre>
* name := token "/" token
* token := 1*<any (US-ASCII) CHAR except SPACE, CTLs, or tspecials>
@@ -107,8 +87,8 @@
* </pre>
*
* @param name name string
- * @return <code>true</code> if the string is a valid media type name,
- * <code>false</code> otherwise
+ * @return <code>true</code> if the string is a valid media type name, <code>false</code>
+ * otherwise
*/
public static boolean isValid(String name) {
if (name == null) {
@@ -118,9 +98,22 @@
boolean slash = false;
for (int i = 0; i < name.length(); i++) {
char ch = name.charAt(i);
- if (ch <= ' ' || ch >= 127 || ch == '(' || ch == ')' || ch == '<' || ch == '>' ||
- ch == '@' || ch == ',' || ch == ';' || ch == ':' || ch == '\\' || ch == '"' ||
- ch == '[' || ch == ']' || ch == '?' || ch == '=') {
+ if (ch <= ' '
+ || ch >= 127
+ || ch == '('
+ || ch == ')'
+ || ch == '<'
+ || ch == '>'
+ || ch == '@'
+ || ch == ','
+ || ch == ';'
+ || ch == ':'
+ || ch == '\\'
+ || ch == '"'
+ || ch == '['
+ || ch == ']'
+ || ch == '?'
+ || ch == '=') {
return false;
} else if (ch == '/') {
if (slash || i == 0 || i + 1 == name.length()) {
@@ -171,7 +164,6 @@
this.description = description;
}
-
/**
* Returns an acronym for this mime type.
*
@@ -197,7 +189,8 @@
* Get the UTI for this mime type.
*
* @return The Uniform Type Identifier
- * @see <a href="http://en.wikipedia.org/wiki/Uniform_Type_Identifier">http://en.wikipedia.org/wiki/Uniform_Type_Identifier</a>
+ * @see <a
+ * href="http://en.wikipedia.org/wiki/Uniform_Type_Identifier">http://en.wikipedia.org/wiki/Uniform_Type_Identifier</a>
*/
public String getUniformTypeIdentifier() {
return uti;
@@ -239,7 +232,6 @@
links = Collections.unmodifiableList(copy);
}
-
/**
* Add some rootXML info to this mime-type
*
@@ -308,9 +300,7 @@
return matchesMagic(data);
}
- /**
- * whether the type is used as a server-side scripting technology
- */
+ /** whether the type is used as a server-side scripting technology */
boolean isInterpreted() {
return isInterpreted;
}
@@ -323,7 +313,7 @@
return type.compareTo(mime.type);
}
- //----------------------------------------------------------< Comparable >
+ // ----------------------------------------------------------< Comparable >
public boolean equals(Object o) {
if (o instanceof MimeType) {
@@ -334,7 +324,7 @@
return false;
}
- //--------------------------------------------------------------< Object >
+ // --------------------------------------------------------------< Object >
public int hashCode() {
return type.hashCode();
@@ -350,9 +340,9 @@
}
/**
- * Returns the preferred file extension of this type, or an empty string
- * if no extensions are known. Use the {@link #getExtensions()} method to
- * get the full list of known extensions of this type.
+ * Returns the preferred file extension of this type, or an empty string if no extensions are
+ * known. Use the {@link #getExtensions()} method to get the full list of known extensions of
+ * this type.
*
* @return preferred file extension or empty string
* @since Apache Tika 0.9
@@ -395,15 +385,10 @@
}
}
- /**
- * Defines a RootXML description. RootXML is made of a localName and/or a
- * namespaceURI.
- */
+ /** Defines a RootXML description. RootXML is made of a localName and/or a namespaceURI. */
static class RootXML implements Serializable {
- /**
- * Serial version UID.
- */
+ /** Serial version UID. */
private static final long serialVersionUID = 5140496601491000730L;
private MimeType type = null;
@@ -423,7 +408,7 @@
}
boolean matches(String namespaceURI, String localName) {
- //Compare namespaces
+ // Compare namespaces
if (!isEmpty(this.namespaceURI)) {
if (!this.namespaceURI.equals(namespaceURI)) {
return false;
@@ -436,7 +421,7 @@
}
}
- //Compare root element's local name
+ // Compare root element's local name
if (!isEmpty(this.localName)) {
return this.localName.equals(localName);
} else {
@@ -446,9 +431,7 @@
}
}
- /**
- * Checks if a string is null or empty.
- */
+ /** Checks if a string is null or empty. */
private boolean isEmpty(String str) {
return (str == null) || (str.equals(""));
}
@@ -469,5 +452,4 @@
return type + ", " + namespaceURI + ", " + localName;
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java
index 31bc8a1..1df0280 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypeException.java
@@ -18,9 +18,7 @@
import org.apache.tika.exception.TikaException;
-/**
- * A class to encapsulate MimeType related exceptions.
- */
+/** A class to encapsulate MimeType related exceptions. */
public class MimeTypeException extends TikaException {
/**
@@ -33,14 +31,12 @@
}
/**
- * Constructs a MimeTypeException with the specified detail message
- * and root cause.
+ * Constructs a MimeTypeException with the specified detail message and root cause.
*
* @param message the detail message.
- * @param cause root cause
+ * @param cause root cause
*/
public MimeTypeException(String message, Throwable cause) {
super(message, cause);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
index 10ef6cb..852be15 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
@@ -31,9 +31,7 @@
import java.util.Locale;
import java.util.Map;
import javax.xml.namespace.QName;
-
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
-
import org.apache.tika.Tika;
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.TextDetector;
@@ -42,76 +40,62 @@
import org.apache.tika.metadata.TikaCoreProperties;
/**
- * This class is a MimeType repository. It gathers a set of MimeTypes and
- * enables to retrieves a content-type from its name, from a file name, or from
- * a magic character sequence.
- * <p>
- * The MIME type detection methods that take an {@link InputStream} as
- * an argument will never reads more than {@link #getMinLength()} bytes
- * from the stream. Also the given stream is never
- * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked},
- * or {@link InputStream#reset() reset} by the methods. Thus a client can
- * use the {@link InputStream#markSupported() mark feature} of the stream
- * (if available) to restore the stream back to the state it was before type
- * detection if it wants to process the stream based on the detected type.
+ * This class is a MimeType repository. It gathers a set of MimeTypes and enables retrieving a
+ * content-type from its name, from a file name, or from a magic character sequence.
+ *
+ * <p>The MIME type detection methods that take an {@link InputStream} as an argument will never
+ * read more than {@link #getMinLength()} bytes from the stream. Also the given stream is never
+ * {@link InputStream#close() closed}, {@link InputStream#mark(int) marked}, or {@link
+ * InputStream#reset() reset} by the methods. Thus a client can use the {@link
+ * InputStream#markSupported() mark feature} of the stream (if available) to restore the stream back
+ * to the state it was before type detection if it wants to process the stream based on the detected
+ * type.
*/
public final class MimeTypes implements Detector, Serializable {
- /**
- * Name of the {@link #rootMimeType root} type, application/octet-stream.
- */
+ /** Name of the {@link #rootMimeType root} type, application/octet-stream. */
public static final String OCTET_STREAM = "application/octet-stream";
- /**
- * Name of the {@link #textMimeType text} type, text/plain.
- */
+
+ /** Name of the {@link #textMimeType text} type, text/plain. */
public static final String PLAIN_TEXT = "text/plain";
- /**
- * Name of the {@link #xmlMimeType xml} type, application/xml.
- */
+
+ /** Name of the {@link #xmlMimeType xml} type, application/xml. */
public static final String XML = "application/xml";
- /**
- * Serial version UID.
- */
+
+ /** Serial version UID. */
private static final long serialVersionUID = -1350863170146349036L;
+
private static final Map<ClassLoader, MimeTypes> CLASSLOADER_SPECIFIC_DEFAULT_TYPES =
new HashMap<>();
private static MimeTypes DEFAULT_TYPES = null;
- /**
- * Root type, application/octet-stream.
- */
+
+ /** Root type, application/octet-stream. */
private final MimeType rootMimeType;
+
private final List<MimeType> rootMimeTypeL;
- /**
- * Text type, text/plain.
- */
+
+ /** Text type, text/plain. */
private final MimeType textMimeType;
- /**
- * html type, text/html
- */
+
+ /** html type, text/html */
private final MimeType htmlMimeType;
- /**
- * xml type, application/xml
- */
+
+ /** xml type, application/xml */
private final MimeType xmlMimeType;
- /**
- * Registered media types and their aliases.
- */
+
+ /** Registered media types and their aliases. */
private final MediaTypeRegistry registry = new MediaTypeRegistry();
- /**
- * All the registered MimeTypes indexed on their canonical names
- */
+
+ /** All the registered MimeTypes indexed on their canonical names */
private final Map<MediaType, MimeType> types = new HashMap<>();
- /**
- * The patterns matcher
- */
+
+ /** The patterns matcher */
private final Patterns patterns = new Patterns(registry);
- /**
- * Sorted list of all registered magics
- */
+
+ /** Sorted list of all registered magics */
private final List<Magic> magics = new ArrayList<>();
- /**
- * Sorted list of all registered rootXML
- */
+
+ /** Sorted list of all registered rootXML */
private final List<MimeType> xmls = new ArrayList<>();
public MimeTypes() {
@@ -128,8 +112,8 @@
}
/**
- * Get the default MimeTypes. This includes all the build in
- * media types, and any custom override ones present.
+ * Get the default MimeTypes. This includes all the build in media types, and any custom
+ * override ones present.
*
* @return MimeTypes default type registry
*/
@@ -138,8 +122,8 @@
}
/**
- * Get the default MimeTypes. This includes all the built-in
- * media types, and any custom override ones present.
+ * Get the default MimeTypes. This includes all the built-in media types, and any custom
+ * override ones present.
*
* @param classLoader to use, if not the default
* @return MimeTypes default type registry
@@ -152,8 +136,9 @@
if (types == null) {
try {
- types = MimeTypesFactory
- .create("tika-mimetypes.xml", "custom-mimetypes.xml", classLoader);
+ types =
+ MimeTypesFactory.create(
+ "tika-mimetypes.xml", "custom-mimetypes.xml", classLoader);
} catch (MimeTypeException e) {
throw new RuntimeException("Unable to parse the default media type registry", e);
} catch (IOException e) {
@@ -170,8 +155,8 @@
}
/**
- * Find the Mime Content Type of a document from its name.
- * Returns application/octet-stream if no better match is found.
+ * Find the Mime Content Type of a document from its name. Returns application/octet-stream if
+ * no better match is found.
*
* @param name of the document to analyze.
* @return the Mime Content Type of the specified document name
@@ -192,13 +177,13 @@
}
/**
- * Find the Mime Content Type of a document stored in the given file.
- * Returns application/octet-stream if no better match is found.
+ * Find the Mime Content Type of a document stored in the given file. Returns
+ * application/octet-stream if no better match is found.
*
* @param file file to analyze
* @return the Mime Content Type of the specified document
* @throws MimeTypeException if the type can't be detected
- * @throws IOException if the file can't be read
+ * @throws IOException if the file can't be read
* @deprecated Use {@link Tika#detect(File)} instead
*/
@Deprecated
@@ -207,16 +192,14 @@
}
/**
- * Returns the MIME type that best matches the given first few bytes
- * of a document stream. Returns application/octet-stream if no better
- * match is found.
- * <p>
- * If multiple matches are found, the best (highest priority) matching
- * type is returned. If multiple matches are found with the same priority,
- * then all of these are returned.
- * <p>
- * The given byte array is expected to be at least {@link #getMinLength()}
- * long, or shorter only if the document stream itself is shorter.
+ * Returns the MIME type that best matches the given first few bytes of a document stream.
+ * Returns application/octet-stream if no better match is found.
+ *
+ * <p>If multiple matches are found, the best (highest priority) matching type is returned. If
+ * multiple matches are found with the same priority, then all of these are returned.
+ *
+ * <p>The given byte array is expected to be at least {@link #getMinLength()} long, or shorter
+ * only if the document stream itself is shorter.
*
* @param data first few bytes of a document stream
* @return matching MIME type
@@ -248,15 +231,15 @@
// When detecting generic XML (or possibly XHTML),
// extract the root element and match it against known types
- if ("application/xml".equals(matched.getName()) ||
- "text/html".equals(matched.getName())) {
+ if ("application/xml".equals(matched.getName())
+ || "text/html".equals(matched.getName())) {
XmlRootExtractor extractor = new XmlRootExtractor();
QName rootElement = extractor.extractRootElement(data);
if (rootElement != null) {
for (MimeType type : xmls) {
- if (type.matchesXML(rootElement.getNamespaceURI(),
- rootElement.getLocalPart())) {
+ if (type.matchesXML(
+ rootElement.getNamespaceURI(), rootElement.getLocalPart())) {
result.set(i, type);
break;
}
@@ -292,7 +275,8 @@
// Finally, assume plain text if no control bytes are found
try {
TextDetector detector = new TextDetector(getMinLength());
- UnsynchronizedByteArrayInputStream stream = new UnsynchronizedByteArrayInputStream(data);
+ UnsynchronizedByteArrayInputStream stream =
+ new UnsynchronizedByteArrayInputStream(data);
MimeType type = forName(detector.detect(stream, new Metadata()).toString());
return Collections.singletonList(type);
} catch (Exception e) {
@@ -301,13 +285,11 @@
}
/**
- * Reads the first {@link #getMinLength()} bytes from the given stream.
- * If the stream is shorter, then the entire content of the stream is
- * returned.
- * <p>
- * The given stream is never {@link InputStream#close() closed},
- * {@link InputStream#mark(int) marked}, or
- * {@link InputStream#reset() reset} by this method.
+ * Reads the first {@link #getMinLength()} bytes from the given stream. If the stream is
+ * shorter, then the entire content of the stream is returned.
+ *
+ * <p>The given stream is never {@link InputStream#close() closed}, {@link InputStream#mark(int)
+ * marked}, or {@link InputStream#reset() reset} by this method.
*
* @param stream stream to be read
* @return first {@link #getMinLength()} (or fewer) bytes of the stream
@@ -336,9 +318,8 @@
}
/**
- * Returns the registered media type with the given name (or alias).
- * The named media type is automatically registered (and returned) if
- * it doesn't already exist.
+ * Returns the registered media type with the given name (or alias). The named media type is
+ * automatically registered (and returned) if it doesn't already exist.
*
* @param name media type name (case-insensitive)
* @return the registered media type with the given name or alias
@@ -371,18 +352,18 @@
/**
* Returns the registered, normalised media type with the given name (or alias).
*
- * <p>Unlike {@link #forName(String)}, this function will <em>not</em> create a
- * new MimeType and register it. Instead, <code>null</code> will be returned if
- * there is no definition available for the given name.
+ * <p>Unlike {@link #forName(String)}, this function will <em>not</em> create a new MimeType and
+ * register it. Instead, <code>null</code> will be returned if there is no definition available
+ * for the given name.
*
- * <p>Also, unlike {@link #forName(String)}, this function may return a
- * mime type that has fewer parameters than were included in the supplied name.
- * If the registered mime type has parameters (e.g.
- * <code>application/dita+xml;format=map</code>), then those will be maintained.
- * However, if the supplied name has paramenters that the <em>registered</em> mime
- * type does not (e.g. <code>application/xml; charset=UTF-8</code> as a name,
- * compared to just <code>application/xml</code> for the type in the registry),
- * then those parameters will not be included in the returned type.
+ * <p>Also, unlike {@link #forName(String)}, this function may return a mime type that has fewer
+ * parameters than were included in the supplied name. If the registered mime type has
+ * parameters (e.g. <code>application/dita+xml;format=map</code>), then those will be
+ * maintained. However, if the supplied name has parameters that the <em>registered</em> mime
+ * type does not (e.g. <code>
+ * application/xml; charset=UTF-8</code> as a name, compared to just <code>application/xml
+ * </code> for the type in the registry), then those parameters will not be included in the
+ * returned type.
*
* @param name media type name (case-insensitive)
* @return the registered media type with the given name or alias, or null if not found
@@ -410,10 +391,10 @@
}
/**
- * Adds an alias for the given media type. This method should only
- * be called from {@link MimeType#addAlias(String)}.
+ * Adds an alias for the given media type. This method should only be called from {@link
+ * MimeType#addAlias(String)}.
*
- * @param type media type
+ * @param type media type
* @param alias media type alias (normalized to lower case)
*/
synchronized void addAlias(MimeType type, MediaType alias) {
@@ -421,10 +402,10 @@
}
/**
- * Adds a file name pattern for the given media type. Assumes that the
- * pattern being added is <b>not</b> a JDK standard regular expression.
+ * Adds a file name pattern for the given media type. Assumes that the pattern being added is
+ * <b>not</b> a JDK standard regular expression.
*
- * @param type media type
+ * @param type media type
* @param pattern file name pattern
* @throws MimeTypeException if the pattern conflicts with existing ones
*/
@@ -433,16 +414,14 @@
}
/**
- * Adds a file name pattern for the given media type. The caller can specify
- * whether the pattern being added <b>is</b> or <b>is not</b> a JDK standard
- * regular expression via the <code>isRegex</code> parameter. If the value
- * is set to true, then a JDK standard regex is assumed, otherwise the
- * freedesktop glob type is assumed.
+ * Adds a file name pattern for the given media type. The caller can specify whether the pattern
+ * being added <b>is</b> or <b>is not</b> a JDK standard regular expression via the <code>
+ * isRegex</code> parameter. If the value is set to true, then a JDK standard regex is assumed,
+ * otherwise the freedesktop glob type is assumed.
*
- * @param type media type
+ * @param type media type
* @param pattern file name pattern
- * @param isRegex set to true if JDK std regexs are desired, otherwise set to
- * false.
+ * @param isRegex set to true if JDK std regexes are desired, otherwise set to false.
* @throws MimeTypeException if the pattern conflicts with existing ones.
*/
public void addPattern(MimeType type, String pattern, boolean isRegex)
@@ -455,8 +434,8 @@
}
/**
- * Return the minimum length of data to provide to analyzing methods based
- * on the document's content in order to check all the known MimeTypes.
+ * Return the minimum length of data to provide to analyzing methods based on the document's
+ * content in order to check all the known MimeTypes.
*
* @return the minimum length of data to provide.
* @see #getMimeType(byte[])
@@ -487,10 +466,7 @@
}
}
- /**
- * Called after all configured types have been loaded.
- * Initializes the magics and xmls sets.
- */
+ /** Called after all configured types have been loaded. Initializes the magics and xmls sets. */
void init() {
for (MimeType type : types.values()) {
magics.addAll(type.getMagics());
@@ -503,14 +479,13 @@
}
/**
- * Automatically detects the MIME type of a document based on magic
- * markers in the stream prefix and any given metadata hints.
- * <p>
- * The given stream is expected to support marks, so that this method
- * can reset the stream to the position it was in before this method
- * was called.
+ * Automatically detects the MIME type of a document based on magic markers in the stream prefix
+ * and any given metadata hints.
*
- * @param input document stream, or <code>null</code>
+ * <p>The given stream is expected to support marks, so that this method can reset the stream to
+ * the position it was in before this method was called.
+ *
+ * @param input document stream, or <code>null</code>
* @param metadata metadata hints
* @return MIME type of the document
* @throws IOException if the document stream could not be read
@@ -585,18 +560,17 @@
}
/**
- * Use the MimeType hint to try to clarify or specialise the current
- * possible types list.
- * If the hint is a specialised form, use that instead
- * If there are multiple possible types, use the hint to select one
+ * Use the MimeType hint to try to clarify or specialise the current possible types list. If the
+ * hint is a specialised form, use that instead. If there are multiple possible types, use the
+ * hint to select one
*/
private List<MimeType> applyHint(List<MimeType> possibleTypes, MimeType hint) {
if (possibleTypes == null || possibleTypes.isEmpty()) {
return Collections.singletonList(hint);
} else {
for (final MimeType type : possibleTypes) {
- if (hint.equals(type) ||
- registry.isSpecializationOf(hint.getType(), type.getType())) {
+ if (hint.equals(type)
+ || registry.isSpecializationOf(hint.getType(), type.getType())) {
// Use just this type
return Collections.singletonList(hint);
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
index afec1f1..936334d 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesFactory.java
@@ -23,22 +23,18 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
-/**
- * Creates instances of MimeTypes.
- */
+/** Creates instances of MimeTypes. */
public class MimeTypesFactory {
private static final Logger LOG = LoggerFactory.getLogger(MimeTypesFactory.class);
-
/**
- * System property to set a path to an additional external custom mimetypes
- * XML file to be loaded.
+ * System property to set a path to an additional external custom mimetypes XML file to be
+ * loaded.
*/
public static final String CUSTOM_MIMES_SYS_PROP = "tika.custom-mimetypes";
@@ -64,10 +60,10 @@
}
/**
- * Creates and returns a MimeTypes instance from the specified input stream.
- * Does not close the input stream(s).
+ * Creates and returns a MimeTypes instance from the specified input stream. Does not close the
+ * input stream(s).
*
- * @throws IOException if the stream can not be read
+ * @throws IOException if the stream can not be read
* @throws MimeTypeException if the type configuration is invalid
*/
public static MimeTypes create(InputStream... inputStreams)
@@ -85,16 +81,15 @@
* @see #create(InputStream...)
*/
public static MimeTypes create(InputStream stream) throws IOException, MimeTypeException {
- return create(new InputStream[]{stream});
+ return create(new InputStream[] {stream});
}
/**
- * Creates and returns a MimeTypes instance from the resource
- * at the location specified by the URL. Opens and closes the
- * InputStream from the URL.
- * If multiple URLs are supplied, then they are loaded in turn.
+ * Creates and returns a MimeTypes instance from the resource at the location specified by the
+ * URL. Opens and closes the InputStream from the URL. If multiple URLs are supplied, then they
+ * are loaded in turn.
*
- * @throws IOException if the URL can not be accessed
+ * @throws IOException if the URL can not be accessed
* @throws MimeTypeException if the type configuration is invalid
*/
public static MimeTypes create(URL... urls) throws IOException, MimeTypeException {
@@ -116,14 +111,14 @@
* @see #create(URL...)
*/
public static MimeTypes create(URL url) throws IOException, MimeTypeException {
- return create(new URL[]{url});
+ return create(new URL[] {url});
}
/**
- * Creates and returns a MimeTypes instance from the specified file path,
- * as interpreted by the class loader in getResource().
+ * Creates and returns a MimeTypes instance from the specified file path, as interpreted by the
+ * class loader in getResource().
*
- * @throws IOException if the file can not be accessed
+ * @throws IOException if the file can not be accessed
* @throws MimeTypeException if the type configuration is invalid
*/
public static MimeTypes create(String filePath) throws IOException, MimeTypeException {
@@ -131,15 +126,13 @@
}
/**
- * Creates and returns a MimeTypes instance. The core mimetypes
- * will be loaded from the specified file path, and any custom
- * override mimetypes found will loaded afterwards.
- * The file paths will be interpreted by the default class loader in
- * getResource().
+ * Creates and returns a MimeTypes instance. The core mimetypes will be loaded from the
+ * specified file path, and any custom override mimetypes found will be loaded afterwards. The file
+ * paths will be interpreted by the default class loader in getResource().
*
- * @param coreFilePath The main MimeTypes file to load
+ * @param coreFilePath The main MimeTypes file to load
* @param extensionFilePath The name of extension MimeType files to load afterwards
- * @throws IOException if the file can not be accessed
+ * @throws IOException if the file can not be accessed
* @throws MimeTypeException if the type configuration is invalid
*/
public static MimeTypes create(String coreFilePath, String extensionFilePath)
@@ -148,21 +141,19 @@
}
/**
- * Creates and returns a MimeTypes instance. The core mimetypes
- * will be loaded from the specified file path, and any custom
- * override mimetypes found will loaded afterwards.
- * The file paths will be interpreted by the specified class
- * loader in getResource().
- * It will also load custom mimetypes from the system property
- * {@link #CUSTOM_MIMES_SYS_PROP}, if specified.
+ * Creates and returns a MimeTypes instance. The core mimetypes will be loaded from the
+ * specified file path, and any custom override mimetypes found will be loaded afterwards. The file
+ * paths will be interpreted by the specified class loader in getResource(). It will also load
+ * custom mimetypes from the system property {@link #CUSTOM_MIMES_SYS_PROP}, if specified.
*
- * @param coreFilePath The main MimeTypes file to load
+ * @param coreFilePath The main MimeTypes file to load
* @param extensionFilePath The name of extension MimeType files to load afterwards
- * @throws IOException if the file can not be accessed
+ * @throws IOException if the file can not be accessed
* @throws MimeTypeException if the type configuration is invalid
*/
- public static MimeTypes create(String coreFilePath, String extensionFilePath,
- ClassLoader classLoader) throws IOException, MimeTypeException {
+ public static MimeTypes create(
+ String coreFilePath, String extensionFilePath, ClassLoader classLoader)
+ throws IOException, MimeTypeException {
// If no specific classloader was requested, use our own class's one
if (classLoader == null) {
classLoader = MimeTypesReader.class.getClassLoader();
@@ -174,17 +165,14 @@
// Get the core URL, and all the extensions URLs
URL coreURL = classLoader.getResource(classPrefix + coreFilePath);
- List<URL> extensionURLs =
- Collections.list(classLoader.getResources(extensionFilePath));
+ List<URL> extensionURLs = Collections.list(classLoader.getResources(extensionFilePath));
// Swap that into an Array, and process
List<URL> urls = new ArrayList<>();
urls.add(coreURL);
urls.addAll(extensionURLs);
if (LOG.isDebugEnabled()) {
- urls.stream().forEach( u ->
- LOG.debug("Loaded custom mimes file: {}", u)
- );
+ urls.stream().forEach(u -> LOG.debug("Loaded custom mimes file: {}", u));
}
String customMimesPath = System.getProperty(CUSTOM_MIMES_SYS_PROP);
@@ -197,7 +185,9 @@
URL externalURL = externalFile.toURI().toURL();
urls.add(externalURL);
if (LOG.isDebugEnabled()) {
- LOG.debug("Loaded external custom mimetypes file: {}", externalFile.getAbsolutePath());
+ LOG.debug(
+ "Loaded external custom mimetypes file: {}",
+ externalFile.getAbsolutePath());
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
index 76bc5c7..a8eacdf 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
@@ -34,8 +34,9 @@
import javax.xml.transform.TransformerException;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.sax.SAXResult;
-
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.utils.XMLReaderUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
@@ -44,9 +45,6 @@
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.utils.XMLReaderUtils;
-
/**
* A reader for XML files compliant with the freedesktop MIME-info DTD.
*
@@ -103,21 +101,21 @@
* type CDATA #REQUIRED>
* ]>
* </pre>
- * <p>
- * In addition to the standard fields, this will also read two Tika specific fields:
- * - link
- * - uti
*
- * @see <a href="https://freedesktop.org/wiki/Specifications/shared-mime-info-spec/">https://freedesktop.org/wiki/Specifications/shared-mime-info-spec/</a>
+ * <p>In addition to the standard fields, this will also read two Tika-specific fields: link and uti
+ *
+ * @see <a
+ * href="https://freedesktop.org/wiki/Specifications/shared-mime-info-spec/">https://freedesktop.org/wiki/Specifications/shared-mime-info-spec/</a>
*/
public class MimeTypesReader extends DefaultHandler implements MimeTypesReaderMetKeys {
private static final ReentrantReadWriteLock READ_WRITE_LOCK = new ReentrantReadWriteLock();
- /**
- * Parser pool size
- */
+
+ /** Parser pool size */
private static int POOL_SIZE = 10;
+
private static ArrayBlockingQueue<SAXParser> SAX_PARSERS = new ArrayBlockingQueue<>(POOL_SIZE);
static Logger LOG = LoggerFactory.getLogger(MimeTypesReader.class);
+
static {
try {
setPoolSize(POOL_SIZE);
@@ -128,9 +126,7 @@
protected final MimeTypes types;
- /**
- * Current type
- */
+ /** Current type */
protected MimeType type = null;
protected int priority;
@@ -143,9 +139,8 @@
}
/**
- * Acquire a SAXParser from the pool; create one if it
- * doesn't exist. Make sure to {@link #releaseParser(SAXParser)} in
- * a <code>finally</code> block every time you call this.
+ * Acquire a SAXParser from the pool; create one if it doesn't exist. Make sure to {@link
+ * #releaseParser(SAXParser)} in a <code>finally</code> block every time you call this.
*
* @return a SAXParser
* @throws TikaException
@@ -160,7 +155,6 @@
throw new TikaException("interrupted while waiting for SAXParser", e);
} finally {
READ_WRITE_LOCK.readLock().unlock();
-
}
if (parser != null) {
return parser;
@@ -177,11 +171,11 @@
try {
parser.reset();
} catch (UnsupportedOperationException e) {
- //ignore
+ // ignore
}
try {
READ_WRITE_LOCK.readLock().lock();
- //if there are extra parsers (e.g. after a reset of the pool to a smaller size),
+ // if there are extra parsers (e.g. after a reset of the pool to a smaller size),
// this parser will not be added and will then be gc'd
SAX_PARSERS.offer(parser);
} finally {
@@ -196,9 +190,9 @@
*/
public static void setPoolSize(int poolSize) throws TikaException {
try {
- //stop the world with a write lock
- //parsers that are currently in use will be offered, but not
- //accepted and will be gc'd
+ // stop the world with a write lock
+ // parsers that are currently in use will be offered, but not
+ // accepted and will be gc'd
READ_WRITE_LOCK.writeLock().lock();
SAX_PARSERS = new ArrayBlockingQueue<>(poolSize);
for (int i = 0; i < poolSize; i++) {
@@ -216,8 +210,10 @@
try {
factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
} catch (ParserConfigurationException | SAXException e) {
- LOG.warn("can't set secure processing feature on: " + factory.getClass() +
- ". User assumes responsibility for consequences.");
+ LOG.warn(
+ "can't set secure processing feature on: "
+ + factory.getClass()
+ + ". User assumes responsibility for consequences.");
}
try {
return factory.newSAXParser();
@@ -278,8 +274,10 @@
} else if (SUB_CLASS_OF_TAG.equals(qName)) {
String parent = attributes.getValue(SUB_CLASS_TYPE_ATTR);
types.setSuperType(type, MediaType.parse(parent));
- } else if (ACRONYM_TAG.equals(qName) || COMMENT_TAG.equals(qName) ||
- TIKA_LINK_TAG.equals(qName) || TIKA_UTI_TAG.equals(qName)) {
+ } else if (ACRONYM_TAG.equals(qName)
+ || COMMENT_TAG.equals(qName)
+ || TIKA_LINK_TAG.equals(qName)
+ || TIKA_UTI_TAG.equals(qName)) {
characters = new StringBuilder();
} else if (GLOB_TAG.equals(qName)) {
String pattern = attributes.getValue(PATTERN_ATTR);
@@ -297,8 +295,11 @@
type.addRootXML(namespace, name);
} else if (MATCH_TAG.equals(qName)) {
if (attributes.getValue(MATCH_MINSHOULDMATCH_ATTR) != null) {
- current = new ClauseRecord(new MinShouldMatchVal(
- Integer.parseInt(attributes.getValue(MATCH_MINSHOULDMATCH_ATTR))));
+ current =
+ new ClauseRecord(
+ new MinShouldMatchVal(
+ Integer.parseInt(
+ attributes.getValue(MATCH_MINSHOULDMATCH_ATTR))));
} else {
String kind = attributes.getValue(MATCH_TYPE_ATTR);
String offset = attributes.getValue(MATCH_OFFSET_ATTR);
@@ -360,20 +361,25 @@
}
}
- protected void handleMimeError(String input, MimeTypeException ex, String qName,
- Attributes attributes) throws SAXException {
+ protected void handleMimeError(
+ String input, MimeTypeException ex, String qName, Attributes attributes)
+ throws SAXException {
throw new SAXException(ex);
}
- protected void handleGlobError(MimeType type, String pattern, MimeTypeException ex,
- String qName, Attributes attributes) throws SAXException {
+ protected void handleGlobError(
+ MimeType type,
+ String pattern,
+ MimeTypeException ex,
+ String qName,
+ Attributes attributes)
+ throws SAXException {
throw new SAXException(ex);
}
/**
- * Shim class used during building of actual classes.
- * This temporarily holds the value of the minShouldMatchClause
- * so that the actual MinShouldMatchClause can have a cleaner/immutable
+ * Shim class used during building of actual classes. This temporarily holds the value of the
+ * minShouldMatchClause so that the actual MinShouldMatchClause can have a cleaner/immutable
* initialization.
*/
private static class MinShouldMatchVal implements Clause {
@@ -442,6 +448,5 @@
public List<Clause> getClauses() {
return subclauses;
}
-
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
index df35134..54fb57f 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypesReaderMetKeys.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.mime;
-/**
- * Met Keys used by the {@link MimeTypesReader}.
- */
+/** Met Keys used by the {@link MimeTypesReader}. */
public interface MimeTypesReaderMetKeys {
String MIME_INFO_TAG = "mime-info";
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java b/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java
index 0a18f4e..97c6423 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MinShouldMatchClause.java
@@ -25,9 +25,9 @@
/**
* Minimum number of clauses that need to match.
- * <p>
- * Throws IllegalArgumentException if min <= 0,
- * if clauses is null or has size == 0, or if min > clauses.size()
+ *
+ * <p>Throws IllegalArgumentException if min <= 0, if clauses is null or has size == 0, or if
+ * min > clauses.size()
*
* @param min
* @param clauses
@@ -72,5 +72,4 @@
public String toString() {
return "minShouldMatch (min: " + min + ") " + clauses;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/OrClause.java b/tika-core/src/main/java/org/apache/tika/mime/OrClause.java
index 6a2f212..8235062 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/OrClause.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/OrClause.java
@@ -46,5 +46,4 @@
public String toString() {
return "or" + clauses;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/Patterns.java b/tika-core/src/main/java/org/apache/tika/mime/Patterns.java
index 48c0329..ea83c14 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/Patterns.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/Patterns.java
@@ -23,32 +23,23 @@
import java.util.SortedMap;
import java.util.TreeMap;
-/**
- * Defines a MimeType pattern.
- */
+/** Defines a MimeType pattern. */
class Patterns implements Serializable {
- /**
- * Serial version UID.
- */
+ /** Serial version UID. */
private static final long serialVersionUID = -5778015347278111140L;
private final MediaTypeRegistry registry;
- /**
- * Index of exact name patterns.
- */
+ /** Index of exact name patterns. */
private final Map<String, MimeType> names = new HashMap<>();
- /**
- * Index of extension patterns of the form "*extension".
- */
+ /** Index of extension patterns of the form "*extension". */
private final Map<String, MimeType> extensions = new HashMap<>();
- /**
- * Index of generic glob patterns, sorted by length.
- */
- private final SortedMap<String, MimeType> globs =
- new TreeMap<>(new LengthComparator());
+
+ /** Index of generic glob patterns, sorted by length. */
+ private final SortedMap<String, MimeType> globs = new TreeMap<>(new LengthComparator());
+
private int minExtensionLength = Integer.MAX_VALUE;
private int maxExtensionLength = 0;
@@ -71,11 +62,14 @@
addGlob(pattern, type);
} else {
- if (pattern.indexOf('*') == -1 && pattern.indexOf('?') == -1 &&
- pattern.indexOf('[') == -1) {
+ if (pattern.indexOf('*') == -1
+ && pattern.indexOf('?') == -1
+ && pattern.indexOf('[') == -1) {
addName(pattern, type);
- } else if (pattern.startsWith("*") && pattern.indexOf('*', 1) == -1 &&
- pattern.indexOf('?') == -1 && pattern.indexOf('[') == -1) {
+ } else if (pattern.startsWith("*")
+ && pattern.indexOf('*', 1) == -1
+ && pattern.indexOf('?') == -1
+ && pattern.indexOf('[') == -1) {
String extension = pattern.substring(1);
addExtension(extension, type);
type.addExtension(extension);
@@ -89,8 +83,8 @@
MimeType previous = names.get(name);
if (previous == null || registry.isSpecializationOf(previous.getType(), type.getType())) {
names.put(name, type);
- } else if (previous == type ||
- registry.isSpecializationOf(type.getType(), previous.getType())) {
+ } else if (previous == type
+ || registry.isSpecializationOf(type.getType(), previous.getType())) {
// do nothing
} else {
throw new MimeTypeException("Conflicting name pattern: " + name);
@@ -104,8 +98,8 @@
int length = extension.length();
minExtensionLength = Math.min(minExtensionLength, length);
maxExtensionLength = Math.max(maxExtensionLength, length);
- } else if (previous == type ||
- registry.isSpecializationOf(type.getType(), previous.getType())) {
+ } else if (previous == type
+ || registry.isSpecializationOf(type.getType(), previous.getType())) {
// do nothing
} else {
throw new MimeTypeException("Conflicting extension pattern: " + extension);
@@ -116,8 +110,8 @@
MimeType previous = globs.get(glob);
if (previous == null || registry.isSpecializationOf(previous.getType(), type.getType())) {
globs.put(glob, type);
- } else if (previous == type ||
- registry.isSpecializationOf(type.getType(), previous.getType())) {
+ } else if (previous == type
+ || registry.isSpecializationOf(type.getType(), previous.getType())) {
// do nothing
} else {
throw new MimeTypeException("Conflicting glob pattern: " + glob);
@@ -126,17 +120,15 @@
/**
* Find the MimeType corresponding to a resource name.
- * <p>
- * It applies the recommendations detailed in FreeDesktop Shared MIME-info
- * Database for guessing MimeType from a resource name: It first tries a
- * case-sensitive match, then try again with the resource name converted to
- * lower-case if that fails. If several patterns match then the longest
- * pattern is used. In particular, files with multiple extensions (such as
- * Data.tar.gz) match the longest sequence of extensions (eg '*.tar.gz' in
- * preference to '*.gz'). Literal patterns (eg, 'Makefile') are matched
- * before all others. Patterns beginning with `*.' and containing no other
- * special characters (`*?[') are matched before other wildcarded patterns
- * (since this covers the majority of the patterns).
+ *
+ * <p>It applies the recommendations detailed in FreeDesktop Shared MIME-info Database for
+ * guessing MimeType from a resource name: It first tries a case-sensitive match, then try again
+ * with the resource name converted to lower-case if that fails. If several patterns match then
+ * the longest pattern is used. In particular, files with multiple extensions (such as
+ * Data.tar.gz) match the longest sequence of extensions (eg '*.tar.gz' in preference to
+ * '*.gz'). Literal patterns (eg, 'Makefile') are matched before all others. Patterns beginning
+ * with `*.' and containing no other special characters (`*?[') are matched before other
+ * wildcarded patterns (since this covers the majority of the patterns).
*/
public MimeType matches(String name) {
if (name == null) {
@@ -189,9 +181,7 @@
private static final class LengthComparator implements Comparator<String>, Serializable {
- /**
- * Serial version UID.
- */
+ /** Serial version UID. */
private static final long serialVersionUID = 8468289702915532359L;
public int compare(String a, String b) {
@@ -201,7 +191,5 @@
}
return diff;
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java b/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java
index 5e33b85..aba6869 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/ProbabilisticMimeDetectionSelector.java
@@ -22,21 +22,17 @@
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
-
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
-/**
- * Selector for combining different mime detection results
- * based on probability
- */
+/** Selector for combining different mime detection results based on probability */
public class ProbabilisticMimeDetectionSelector implements Detector {
private static final long serialVersionUID = 224589862960269260L;
- /**
- * probability parameters default value
- */
+
+ /** probability parameters default value */
private static final float DEFAULT_MAGIC_TRUST = 0.9f;
+
private static final float DEFAULT_META_TRUST = 0.8f;
private static final float DEFAULT_EXTENSION_TRUST = 0.8f;
private final MimeTypes mimeTypes;
@@ -58,10 +54,7 @@
*/
private float threshold;
- /**
- *
- ***********************/
-
+ /** ********************* */
public ProbabilisticMimeDetectionSelector() {
this(MimeTypes.getDefaultMimeTypes(), null);
}
@@ -80,10 +73,14 @@
this.initializeDefaultProbabilityParameters();
this.changeRate = 0.1f;
if (builder != null) {
- priorMagicFileType = builder.priorMagicFileType == 0f ? priorMagicFileType :
- builder.priorMagicFileType;
- priorExtensionFileType = builder.priorExtensionFileType == 0f ? priorExtensionFileType :
- builder.priorExtensionFileType;
+ priorMagicFileType =
+ builder.priorMagicFileType == 0f
+ ? priorMagicFileType
+ : builder.priorMagicFileType;
+ priorExtensionFileType =
+ builder.priorExtensionFileType == 0f
+ ? priorExtensionFileType
+ : builder.priorExtensionFileType;
priorMetaFileType =
builder.priorMetaFileType == 0f ? priorMetaFileType : builder.priorMetaFileType;
@@ -99,9 +96,7 @@
}
}
- /**
- * Initilize probability parameters with default values;
- */
+ /** Initialize probability parameters with default values. */
private void initializeDefaultProbabilityParameters() {
priorMagicFileType = 0.5f;
priorExtensionFileType = 0.5f;
@@ -130,7 +125,7 @@
input.mark(mimeTypes.getMinLength());
try {
byte[] prefix = mimeTypes.readMagicHeader(input);
- //defensive copy
+ // defensive copy
possibleTypes.addAll(mimeTypes.getMimeType(prefix));
} finally {
input.reset();
@@ -186,9 +181,10 @@
return applyProbilities(possibleTypes, extHint, metaHint);
}
- private MediaType applyProbilities(final List<MimeType> possibleTypes,
- final MimeType extMimeType,
- final MimeType metadataMimeType) {
+ private MediaType applyProbilities(
+ final List<MimeType> possibleTypes,
+ final MimeType extMimeType,
+ final MimeType metadataMimeType) {
/* initialize some probability variables */
MediaType extensionMediaType_ = extMimeType == null ? null : extMimeType.getType();
@@ -231,8 +227,8 @@
} else {
// check if each identified type belongs to the same class;
if (extensionMediaType_ != null) {
- if (extensionMediaType_.equals(magictype) ||
- registry.isSpecializationOf(extensionMediaType_, magictype)) {
+ if (extensionMediaType_.equals(magictype)
+ || registry.isSpecializationOf(extensionMediaType_, magictype)) {
// Use just this type
possibleTypes.set(i, extMimeType);
} else if (registry.isSpecializationOf(magictype, extensionMediaType_)) {
@@ -240,8 +236,8 @@
}
}
if (metaMediaType_ != null) {
- if (metaMediaType_.equals(magictype) ||
- registry.isSpecializationOf(metaMediaType_, magictype)) {
+ if (metaMediaType_.equals(magictype)
+ || registry.isSpecializationOf(metaMediaType_, magictype)) {
// Use just this type
possibleTypes.set(i, metadataMimeType);
} else if (registry.isSpecializationOf(magictype, metaMediaType_)) {
@@ -269,7 +265,6 @@
* grow as our trust goes down
*/
mag_neg = mag_neg * (1 + changeRate);
-
}
if (magictype != null && mag_trust != 1) {
@@ -387,7 +382,6 @@
}
pPrime /= (pPrime + deno);
results[0] = pPrime;
-
}
if (maxProb < results[0]) {
maxProb = results[0];
@@ -405,7 +399,6 @@
}
pPrime /= (pPrime + deno);
results[1] = pPrime;
-
}
if (maxProb < results[1]) {
maxProb = results[1];
@@ -429,19 +422,15 @@
bestEstimate = extensionMediaType_;
}
}
-
}
return maxProb < threshold ? this.rootMediaType : bestEstimate;
-
}
public MediaTypeRegistry getMediaTypeRegistry() {
return this.mimeTypes.getMediaTypeRegistry();
}
- /**
- * build class for probability parameters setting
- */
+ /** build class for probability parameters setting */
public static class Builder {
/*
* the following are the prior probabilities for the file type
@@ -512,12 +501,9 @@
return this;
}
- /**
- * Initialize the MimeTypes with this builder instance
- */
+ /** Initialize the MimeTypes with this builder instance */
public ProbabilisticMimeDetectionSelector build2() {
return new ProbabilisticMimeDetectionSelector(this);
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/package-info.java b/tika-core/src/main/java/org/apache/tika/mime/package-info.java
index 104dc3a..2387683 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * Media type information.
- */
+/** Media type information. */
@aQute.bnd.annotation.Version("1.2.0")
package org.apache.tika.mime;
diff --git a/tika-core/src/main/java/org/apache/tika/package-info.java b/tika-core/src/main/java/org/apache/tika/package-info.java
index cf4352d..77ffacc 100644
--- a/tika-core/src/main/java/org/apache/tika/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * Apache Tika.
- */
+/** Apache Tika. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika;
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java
index 2e9f393..19152f2 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractEncodingDetectorParser.java
@@ -20,14 +20,12 @@
import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
-
/**
- * Abstract base class for parsers that use the AutoDetectReader and need
- * to use the {@link EncodingDetector} configured by {@link TikaConfig}
+ * Abstract base class for parsers that use the AutoDetectReader and need to use the {@link
+ * EncodingDetector} configured by {@link TikaConfig}
*/
public abstract class AbstractEncodingDetectorParser implements Parser {
-
private EncodingDetector encodingDetector;
public AbstractEncodingDetectorParser() {
@@ -39,8 +37,8 @@
}
/**
- * Look for an EncodingDetetor in the ParseContext. If it hasn't been
- * passed in, use the original EncodingDetector from initialization.
+ * Look for an EncodingDetector in the ParseContext. If it hasn't been passed in, use the
+ * original EncodingDetector from initialization.
*
* @param parseContext
* @return
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java
index c5c3315..1285800 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractExternalProcessParser.java
@@ -20,28 +20,27 @@
import java.util.concurrent.ConcurrentHashMap;
/**
- * Abstract base class for parsers that call external processes. This
- * adds one more layer of 'hope' that processes won't be orphaned if
- * the jvm has to be restarted. This does not guarantee that the
- * processes won't be orphaned in case of, e.g. kill -9, but this
- * increases the chances that under normal circumstances or if the jvm
- * itself exits, that external processes won't be orphaned.
+ * Abstract base class for parsers that call external processes. This adds one more layer of 'hope'
+ * that processes won't be orphaned if the jvm has to be restarted. This does not guarantee that the
+ * processes won't be orphaned in case of, e.g. kill -9, but this increases the chances that under
+ * normal circumstances or if the jvm itself exits, that external processes won't be orphaned.
*
* @since Apache Tika 1.27
*/
public abstract class AbstractExternalProcessParser implements Parser {
- /**
- * Serial version UID.
- */
+ /** Serial version UID. */
private static final long serialVersionUID = 7186985395903074255L;
private static final ConcurrentHashMap<String, Process> PROCESS_MAP = new ConcurrentHashMap<>();
static {
- Runtime.getRuntime().addShutdownHook(new Thread(() -> {
- PROCESS_MAP.forEachValue(1, Process::destroyForcibly);
- }));
+ Runtime.getRuntime()
+ .addShutdownHook(
+ new Thread(
+ () -> {
+ PROCESS_MAP.forEachValue(1, Process::destroyForcibly);
+ }));
}
protected String register(Process p) {
@@ -54,4 +53,3 @@
return PROCESS_MAP.remove(id);
}
}
-
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
index f6017d6..7913f52 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AbstractParser.java
@@ -18,16 +18,14 @@
import java.io.IOException;
import java.io.InputStream;
-
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-
/**
- * Abstract base class for new parsers. This method implements the old
- * deprecated parse method so subclasses won't have to.
+ * Abstract base class for new parsers. This method implements the old deprecated parse method so
+ * subclasses won't have to.
*
* @deprecated for removal in 4.x
* @since Apache Tika 0.10
@@ -35,27 +33,21 @@
@Deprecated
public abstract class AbstractParser implements Parser {
- /**
- * Serial version UID.
- */
+ /** Serial version UID. */
private static final long serialVersionUID = 7186985395903074255L;
/**
- * Calls the
- * {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)}
- * method with an empty {@link ParseContext}. This method exists as a
- * leftover from Tika 0.x when the three-argument parse() method still
- * existed in the {@link Parser} interface. No new code should call this
- * method anymore, it's only here for backwards compatibility.
+ * Calls the {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)} method
+ * with an empty {@link ParseContext}. This method exists as a leftover from Tika 0.x when the
+ * three-argument parse() method still existed in the {@link Parser} interface. No new code
+ * should call this method anymore, it's only here for backwards compatibility.
*
- * @deprecated use the {@link Parser#parse(InputStream, ContentHandler,
- * Metadata, ParseContext)} method instead
+ * @deprecated use the {@link Parser#parse(InputStream, ContentHandler, Metadata, ParseContext)}
+ * method instead
*/
@Deprecated
public void parse(InputStream stream, ContentHandler handler, Metadata metadata)
throws IOException, SAXException, TikaException {
parse(stream, handler, metadata, new ParseContext());
}
-
}
-
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 86eae69..d8869aa 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -18,10 +18,6 @@
import java.io.IOException;
import java.io.InputStream;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
@@ -38,31 +34,23 @@
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.sax.SecureContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
public class AutoDetectParser extends CompositeParser {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 6110455808615143122L;
- //private final TikaConfig config;
- /**
- * The type detector used by this parser to auto-detect the type
- * of a document.
- */
+ // private final TikaConfig config;
+
+ /** The type detector used by this parser to auto-detect the type of a document. */
private Detector detector; // always set in the constructor
- /**
- * Configuration used when initializing a SecureContentHandler
- * and the TikaInputStream.
- */
+ /** Configuration used when initializing a SecureContentHandler and the TikaInputStream. */
private AutoDetectParserConfig autoDetectParserConfig;
- /**
- * Creates an auto-detecting parser instance using the default Tika
- * configuration.
- */
+ /** Creates an auto-detecting parser instance using the default Tika configuration. */
public AutoDetectParser() {
this(TikaConfig.getDefaultConfig());
}
@@ -73,10 +61,10 @@
}
/**
- * Creates an auto-detecting parser instance using the specified set of parser.
- * This allows one to create a Tika configuration where only a subset of the
- * available parsers have their 3rd party jars included, as otherwise the
- * use of the default TikaConfig will throw various "ClassNotFound" exceptions.
+ * Creates an auto-detecting parser instance using the specified set of parsers. This allows one
+ * to create a Tika configuration where only a subset of the available parsers have their 3rd
+ * party jars included, as otherwise the use of the default TikaConfig will throw various
+ * "ClassNotFound" exceptions.
*
* @param parsers
*/
@@ -95,14 +83,13 @@
setFallback(buildFallbackParser(config));
setDetector(config.getDetector());
setAutoDetectParserConfig(config.getAutoDetectParserConfig());
-
}
private static Parser buildFallbackParser(TikaConfig config) {
Parser fallback = null;
Parser p = config.getParser();
if (p instanceof DefaultParser) {
- fallback = ((DefaultParser)p).getFallback();
+ fallback = ((DefaultParser) p).getFallback();
} else {
fallback = new EmptyParser();
}
@@ -110,25 +97,27 @@
if (config.getAutoDetectParserConfig().getDigesterFactory() == null) {
return fallback;
} else {
- return new DigestingParser(fallback,
+ return new DigestingParser(
+ fallback,
config.getAutoDetectParserConfig().getDigesterFactory().build(),
- config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument());
+ config.getAutoDetectParserConfig()
+ .getDigesterFactory()
+ .isSkipContainerDocument());
}
-
}
private static Parser getParser(TikaConfig config) {
if (config.getAutoDetectParserConfig().getDigesterFactory() == null) {
return config.getParser();
}
- return new DigestingParser(config.getParser(),
+ return new DigestingParser(
+ config.getParser(),
config.getAutoDetectParserConfig().getDigesterFactory().build(),
config.getAutoDetectParserConfig().getDigesterFactory().isSkipContainerDocument());
}
/**
- * Returns the type detector used by this parser to auto-detect the type
- * of a document.
+ * Returns the type detector used by this parser to auto-detect the type of a document.
*
* @return type detector
* @since Apache Tika 0.4
@@ -138,8 +127,7 @@
}
/**
- * Sets the type detector used by this parser to auto-detect the type
- * of a document.
+ * Sets the type detector used by this parser to auto-detect the type of a document.
*
* @param detector type detector
* @since Apache Tika 0.4
@@ -149,8 +137,8 @@
}
/**
- * Sets the configuration that will be used to create SecureContentHandlers
- * that will be used for parsing.
+ * Sets the configuration that will be used to create SecureContentHandlers that will be used
+ * for parsing.
*
* @param autoDetectParserConfig type SecureContentHandlerConfig
* @since Apache Tika 2.1.1
@@ -163,8 +151,9 @@
return this.autoDetectParserConfig;
}
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
if (autoDetectParserConfig.getMetadataWriteFilterFactory() != null) {
metadata.setMetadataWriteFilter(
autoDetectParserConfig.getMetadataWriteFilterFactory().newInstance());
@@ -172,18 +161,18 @@
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
- //figure out if we should spool to disk
+ // figure out if we should spool to disk
maybeSpool(tis, autoDetectParserConfig, metadata);
// Automatically detect the MIME type of the document
MediaType type = detector.detect(tis, metadata);
- //update CONTENT_TYPE as long as it wasn't set by parser override
- if (metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) == null ||
- !metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE)
+ // update CONTENT_TYPE as long as it wasn't set by parser override
+ if (metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) == null
+ || !metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE)
.equals(type.toString())) {
metadata.set(Metadata.CONTENT_TYPE, type.toString());
}
- //check for zero-byte inputstream
+ // check for zero-byte inputstream
if (tis.getOpenContainer() == null) {
if (autoDetectParserConfig.getThrowOnZeroBytes()) {
tis.mark(1);
@@ -195,8 +184,10 @@
}
handler = decorateHandler(handler, metadata, context, autoDetectParserConfig);
// TIKA-216: Zip bomb prevention
- SecureContentHandler sch = handler != null ?
- createSecureContentHandler(handler, tis, autoDetectParserConfig) : null;
+ SecureContentHandler sch =
+ handler != null
+ ? createSecureContentHandler(handler, tis, autoDetectParserConfig)
+ : null;
initializeEmbeddedDocumentExtractor(metadata, context);
try {
@@ -212,33 +203,38 @@
}
}
- private ContentHandler decorateHandler(ContentHandler handler, Metadata metadata,
- ParseContext context,
- AutoDetectParserConfig autoDetectParserConfig) {
+ private ContentHandler decorateHandler(
+ ContentHandler handler,
+ Metadata metadata,
+ ParseContext context,
+ AutoDetectParserConfig autoDetectParserConfig) {
if (context.get(RecursiveParserWrapper.RecursivelySecureContentHandler.class) != null) {
- //using the recursiveparserwrapper. we should decorate this handler
- return autoDetectParserConfig.getContentHandlerDecoratorFactory()
+ // using the recursiveparserwrapper. we should decorate this handler
+ return autoDetectParserConfig
+ .getContentHandlerDecoratorFactory()
.decorate(handler, metadata, context);
}
ParseRecord parseRecord = context.get(ParseRecord.class);
if (parseRecord == null || parseRecord.getDepth() == 0) {
- return autoDetectParserConfig.getContentHandlerDecoratorFactory()
+ return autoDetectParserConfig
+ .getContentHandlerDecoratorFactory()
.decorate(handler, metadata, context);
}
- //else do not decorate
+ // else do not decorate
return handler;
}
- private void maybeSpool(TikaInputStream tis, AutoDetectParserConfig autoDetectParserConfig,
- Metadata metadata) throws IOException {
+ private void maybeSpool(
+ TikaInputStream tis, AutoDetectParserConfig autoDetectParserConfig, Metadata metadata)
+ throws IOException {
if (tis.hasFile()) {
return;
}
if (autoDetectParserConfig.getSpoolToDisk() == null) {
return;
}
- //whether or not a content-length has been sent in,
- //if spoolToDisk == 0, spool it
+ // whether or not a content-length has been sent in,
+ // if spoolToDisk == 0, spool it
if (autoDetectParserConfig.getSpoolToDisk() == 0) {
tis.getPath();
metadata.set(HttpHeaders.CONTENT_LENGTH, Long.toString(tis.getLength()));
@@ -253,7 +249,7 @@
metadata.set(HttpHeaders.CONTENT_LENGTH, Long.toString(tis.getLength()));
}
} catch (NumberFormatException e) {
- //swallow...maybe log?
+ // swallow...maybe log?
}
}
}
@@ -262,8 +258,8 @@
if (context.get(EmbeddedDocumentExtractor.class) != null) {
return;
}
- //pass self to handle embedded documents if
- //the caller hasn't specified one.
+ // pass self to handle embedded documents if
+ // the caller hasn't specified one.
Parser p = context.get(Parser.class);
if (p == null) {
context.set(Parser.class, this);
@@ -284,9 +280,8 @@
parse(stream, handler, metadata, context);
}
- private SecureContentHandler createSecureContentHandler(ContentHandler handler,
- TikaInputStream tis,
- AutoDetectParserConfig config) {
+ private SecureContentHandler createSecureContentHandler(
+ ContentHandler handler, TikaInputStream tis, AutoDetectParserConfig config) {
SecureContentHandler sch = new SecureContentHandler(handler, tis);
if (config == null) {
return sch;
@@ -309,5 +304,4 @@
}
return sch;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index afe65b0..5f295da 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -18,33 +18,33 @@
import java.io.IOException;
import java.io.Serializable;
-
-import org.w3c.dom.Element;
-import org.xml.sax.ContentHandler;
-
import org.apache.tika.config.ConfigBase;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory;
import org.apache.tika.sax.ContentHandlerDecoratorFactory;
+import org.w3c.dom.Element;
+import org.xml.sax.ContentHandler;
/**
- * This config object can be used to tune how conservative we want to be
- * when parsing data that is extremely compressible and resembles a ZIP
- * bomb. Null values will be ignored and will not affect the default values
- * in SecureContentHandler.
- * <p>
- * See <a href="https://cwiki.apache.org/confluence/display/TIKA/ModifyingContentWithHandlersAndMetadataFilters"/>ModifyingContentWithHandlersAndMetadataFilters</a>
- * for documentation and examples for configuring this with a tika-config.xml file.
+ * This config object can be used to tune how conservative we want to be when parsing data that is
+ * extremely compressible and resembles a ZIP bomb. Null values will be ignored and will not affect
+ * the default values in SecureContentHandler.
+ *
+ * <p>See <a
+ * href="https://cwiki.apache.org/confluence/display/TIKA/ModifyingContentWithHandlersAndMetadataFilters">ModifyingContentWithHandlersAndMetadataFilters</a>
+ * for documentation and examples for configuring this with a tika-config.xml file.
*/
public class AutoDetectParserConfig extends ConfigBase implements Serializable {
private static ContentHandlerDecoratorFactory NOOP_CONTENT_HANDLER_DECORATOR_FACTORY =
new ContentHandlerDecoratorFactory() {
@Override
- public ContentHandler decorate(ContentHandler contentHandler, Metadata metadata,
- ParseContext parseContext) {
+ public ContentHandler decorate(
+ ContentHandler contentHandler,
+ Metadata metadata,
+ ParseContext parseContext) {
return contentHandler;
}
};
@@ -53,35 +53,29 @@
public static AutoDetectParserConfig load(Element element)
throws TikaConfigException, IOException {
- return AutoDetectParserConfig.buildSingle("autoDetectParserConfig",
- AutoDetectParserConfig.class, element, AutoDetectParserConfig.DEFAULT);
+ return AutoDetectParserConfig.buildSingle(
+ "autoDetectParserConfig",
+ AutoDetectParserConfig.class,
+ element,
+ AutoDetectParserConfig.DEFAULT);
}
/**
- * If this is not null and greater than -1, the AutoDetectParser
- * will spool the stream to disk if the length of the stream is known
- * ahead of time.
+ * If this is not null and greater than -1, the AutoDetectParser will spool the stream to disk
+ * if the length of the stream is known ahead of time.
*/
private Long spoolToDisk = null;
- /**
- * SecureContentHandler -- Desired output threshold in characters.
- */
+ /** SecureContentHandler -- Desired output threshold in characters. */
private Long outputThreshold = null;
- /**
- * SecureContentHandler -- Desired maximum compression ratio.
- */
+ /** SecureContentHandler -- Desired maximum compression ratio. */
private Long maximumCompressionRatio = null;
- /**
- * SecureContentHandler -- Desired maximum XML nesting level.
- */
+ /** SecureContentHandler -- Desired maximum XML nesting level. */
private Integer maximumDepth = null;
- /**
- * SecureContentHandler -- Desired maximum package entry nesting level.
- */
+ /** SecureContentHandler -- Desired maximum package entry nesting level. */
private Integer maximumPackageEntryDepth = null;
private MetadataWriteFilterFactory metadataWriteFilterFactory = null;
@@ -99,14 +93,17 @@
* Creates a SecureContentHandlerConfig using the passed in parameters.
*
* @param spoolToDisk
- * @param outputThreshold SecureContentHandler - character output threshold.
- * @param maximumCompressionRatio SecureContentHandler - max compression ratio allowed.
- * @param maximumDepth SecureContentHandler - maximum XML element nesting level.
+ * @param outputThreshold SecureContentHandler - character output threshold.
+ * @param maximumCompressionRatio SecureContentHandler - max compression ratio allowed.
+ * @param maximumDepth SecureContentHandler - maximum XML element nesting level.
* @param maximumPackageEntryDepth SecureContentHandler - maximum package entry nesting level.
*/
- public AutoDetectParserConfig(Long spoolToDisk, Long outputThreshold,
- Long maximumCompressionRatio, Integer maximumDepth,
- Integer maximumPackageEntryDepth) {
+ public AutoDetectParserConfig(
+ Long spoolToDisk,
+ Long outputThreshold,
+ Long maximumCompressionRatio,
+ Integer maximumDepth,
+ Integer maximumPackageEntryDepth) {
this.spoolToDisk = spoolToDisk;
this.outputThreshold = outputThreshold;
this.maximumCompressionRatio = maximumCompressionRatio;
@@ -114,9 +111,7 @@
this.maximumPackageEntryDepth = maximumPackageEntryDepth;
}
- public AutoDetectParserConfig() {
-
- }
+ public AutoDetectParserConfig() {}
public Long getSpoolToDisk() {
return spoolToDisk;
@@ -203,14 +198,27 @@
@Override
public String toString() {
- return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ", outputThreshold=" +
- outputThreshold + ", maximumCompressionRatio=" + maximumCompressionRatio +
- ", maximumDepth=" + maximumDepth + ", maximumPackageEntryDepth=" +
- maximumPackageEntryDepth + ", metadataWriteFilterFactory=" +
- metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" +
- embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" +
- contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory +
- ", throwOnZeroBytes=" + throwOnZeroBytes + '}';
+ return "AutoDetectParserConfig{"
+ + "spoolToDisk="
+ + spoolToDisk
+ + ", outputThreshold="
+ + outputThreshold
+ + ", maximumCompressionRatio="
+ + maximumCompressionRatio
+ + ", maximumDepth="
+ + maximumDepth
+ + ", maximumPackageEntryDepth="
+ + maximumPackageEntryDepth
+ + ", metadataWriteFilterFactory="
+ + metadataWriteFilterFactory
+ + ", embeddedDocumentExtractorFactory="
+ + embeddedDocumentExtractorFactory
+ + ", contentHandlerDecoratorFactory="
+ + contentHandlerDecoratorFactory
+ + ", digesterFactory="
+ + digesterFactory
+ + ", throwOnZeroBytes="
+ + throwOnZeroBytes
+ + '}';
}
}
-
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java
index 2365c89..e2a8f9f 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserFactory.java
@@ -22,21 +22,14 @@
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Map;
-
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
+import org.xml.sax.SAXException;
-/**
- * Factory for an AutoDetectParser
- */
+/** Factory for an AutoDetectParser */
public class AutoDetectParserFactory extends ParserFactory {
- /**
- * Path to a tika-config file. This must be a literal
- * file or findable on the classpath.
- */
+ /** Path to a tika-config file. This must be a literal file or findable on the classpath. */
public static final String TIKA_CONFIG_PATH = "tika_config_path";
public AutoDetectParserFactory(Map<String, String> args) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
index 3b50b4d..415814c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CompositeParser.java
@@ -26,10 +26,6 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.io.TemporaryResources;
@@ -41,37 +37,32 @@
import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.ParserUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * Composite parser that delegates parsing tasks to a component parser
- * based on the declared content type of the incoming document. A fallback
- * parser is defined for cases where a parser for the given content type is
- * not available.
+ * Composite parser that delegates parsing tasks to a component parser based on the declared content
+ * type of the incoming document. A fallback parser is defined for cases where a parser for the
+ * given content type is not available.
*/
public class CompositeParser implements Parser {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 2192845797749627824L;
- /**
- * Media type registry.
- */
+ /** Media type registry. */
private MediaTypeRegistry registry;
- /**
- * List of component parsers.
- */
+ /** List of component parsers. */
private List<Parser> parsers;
- /**
- * The fallback parser, used when no better parser is available.
- */
+ /** The fallback parser, used when no better parser is available. */
private Parser fallback = new EmptyParser();
- public CompositeParser(MediaTypeRegistry registry, List<Parser> parsers,
- Collection<Class<? extends Parser>> excludeParsers) {
+ public CompositeParser(
+ MediaTypeRegistry registry,
+ List<Parser> parsers,
+ Collection<Class<? extends Parser>> excludeParsers) {
if (excludeParsers == null || excludeParsers.isEmpty()) {
this.parsers = parsers;
} else {
@@ -107,13 +98,13 @@
return map;
}
- private boolean isExcluded(Collection<Class<? extends Parser>> excludeParsers,
- Class<? extends Parser> p) {
+ private boolean isExcluded(
+ Collection<Class<? extends Parser>> excludeParsers, Class<? extends Parser> p) {
return excludeParsers.contains(p) || assignableFrom(excludeParsers, p);
}
- private boolean assignableFrom(Collection<Class<? extends Parser>> excludeParsers,
- Class<? extends Parser> p) {
+ private boolean assignableFrom(
+ Collection<Class<? extends Parser>> excludeParsers, Class<? extends Parser> p) {
for (Class<? extends Parser> e : excludeParsers) {
if (e.isAssignableFrom(p)) {
return true;
@@ -123,9 +114,9 @@
}
/**
- * Utility method that goes through all the component parsers and finds
- * all media types for which more than one parser declares support. This
- * is useful in tracking down conflicting parser definitions.
+ * Utility method that goes through all the component parsers and finds all media types for
+ * which more than one parser declares support. This is useful in tracking down conflicting
+ * parser definitions.
*
* @param context parsing context
* @return media types that are supported by at least two component parsers
@@ -175,9 +166,8 @@
}
/**
- * Returns all parsers registered with the Composite Parser,
- * including ones which may not currently be active.
- * This won't include the Fallback Parser, if defined
+ * Returns all parsers registered with the Composite Parser, including ones which may not
+ * currently be active. This won't include the Fallback Parser, if defined
*/
public List<Parser> getAllComponentParsers() {
return Collections.unmodifiableList(parsers);
@@ -200,8 +190,9 @@
public void setParsers(Map<MediaType, Parser> parsers) {
this.parsers = new ArrayList<>(parsers.size());
for (Map.Entry<MediaType, Parser> entry : parsers.entrySet()) {
- this.parsers.add(ParserDecorator
- .withTypes(entry.getValue(), Collections.singleton(entry.getKey())));
+ this.parsers.add(
+ ParserDecorator.withTypes(
+ entry.getValue(), Collections.singleton(entry.getKey())));
}
}
@@ -224,14 +215,12 @@
}
/**
- * Returns the parser that best matches the given metadata. By default
- * looks for a parser that matches the content type metadata property,
- * and uses the fallback parser if a better match is not found. The
- * type hierarchy information included in the configured media type
- * registry is used when looking for a matching parser instance.
- * <p>
- * Subclasses can override this method to provide more accurate
- * parser resolution.
+ * Returns the parser that best matches the given metadata. By default looks for a parser that
+ * matches the content type metadata property, and uses the fallback parser if a better match is
+ * not found. The type hierarchy information included in the configured media type registry is
+ * used when looking for a matching parser instance.
+ *
+ * <p>Subclasses can override this method to provide more accurate parser resolution.
*
* @param metadata document metadata
* @return matching parser
@@ -242,7 +231,7 @@
protected Parser getParser(Metadata metadata, ParseContext context) {
Map<MediaType, Parser> map = getParsers(context);
- //check for parser override first
+ // check for parser override first
String contentTypeString = metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE);
if (contentTypeString == null) {
contentTypeString = metadata.get(Metadata.CONTENT_TYPE);
@@ -271,14 +260,14 @@
/**
* Delegates the call to the matching component parser.
- * <p>
- * Potential {@link RuntimeException}s, {@link IOException}s and
- * {@link SAXException}s unrelated to the given input stream and content
- * handler are automatically wrapped into {@link TikaException}s to better
- * honor the {@link Parser} contract.
+ *
+ * <p>Potential {@link RuntimeException}s, {@link IOException}s and {@link SAXException}s
+ * unrelated to the given input stream and content handler are automatically wrapped into {@link
+ * TikaException}s to better honor the {@link Parser} contract.
*/
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
Parser parser = getParser(metadata, context);
TemporaryResources tmp = new TemporaryResources();
ParseRecord parserRecord = context.get(ParseRecord.class);
@@ -297,7 +286,7 @@
try {
parser.parse(taggedStream, taggedHandler, metadata, context);
} catch (SecurityException e) {
- //rethrow security exceptions
+ // rethrow security exceptions
throw e;
} catch (IOException e) {
taggedStream.throwIfCauseOf(e);
@@ -324,7 +313,7 @@
private void recordEmbeddedMetadata(Metadata metadata, ParseContext context) {
ParseRecord record = context.get(ParseRecord.class);
if (record == null) {
- //this should never happen
+ // this should never happen
return;
}
for (Exception e : record.getExceptions()) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java b/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java
index 1ffd851..16656c2 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/CryptoParser.java
@@ -26,27 +26,23 @@
import java.util.Set;
import javax.crypto.Cipher;
import javax.crypto.CipherInputStream;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * Decrypts the incoming document stream and delegates further parsing to
- * another parser instance. The decryption key and other settings as well
- * as the delegate parser are taken from the parsing context.
+ * Decrypts the incoming document stream and delegates further parsing to another parser instance.
+ * The decryption key and other settings as well as the delegate parser are taken from the parsing
+ * context.
*
* @since Apache Tika 0.10
*/
public abstract class CryptoParser extends DelegatingParser {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -3507995752666557731L;
private final String transformation;
@@ -69,8 +65,9 @@
return types;
}
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
try {
Cipher cipher;
if (provider != null) {
@@ -101,5 +98,4 @@
throw new TikaException("Unable to decrypt document stream", e);
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
index 3205ea8..1a2516f 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DefaultParser.java
@@ -21,7 +21,6 @@
import java.util.Collections;
import java.util.List;
import java.util.Map;
-
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.DefaultEncodingDetector;
import org.apache.tika.detect.EncodingDetector;
@@ -32,42 +31,56 @@
import org.apache.tika.utils.ServiceLoaderUtils;
/**
- * A composite parser based on all the {@link Parser} implementations
- * available through the
- * {@link javax.imageio.spi.ServiceRegistry service provider mechanism}.
+ * A composite parser based on all the {@link Parser} implementations available through the {@link
+ * javax.imageio.spi.ServiceRegistry service provider mechanism}.
*
* @since Apache Tika 0.8
*/
public class DefaultParser extends CompositeParser {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = 3612324825403757520L;
- private transient final ServiceLoader loader;
- public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
- Collection<Class<? extends Parser>> excludeParsers,
- EncodingDetector encodingDetector, Renderer renderer) {
+ private final transient ServiceLoader loader;
+
+ public DefaultParser(
+ MediaTypeRegistry registry,
+ ServiceLoader loader,
+ Collection<Class<? extends Parser>> excludeParsers,
+ EncodingDetector encodingDetector,
+ Renderer renderer) {
super(registry, getDefaultParsers(loader, encodingDetector, renderer, excludeParsers));
this.loader = loader;
}
- public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
- Collection<Class<? extends Parser>> excludeParsers) {
- super(registry,
- getDefaultParsers(loader, new DefaultEncodingDetector(loader),
- new CompositeRenderer(loader), excludeParsers));
+ public DefaultParser(
+ MediaTypeRegistry registry,
+ ServiceLoader loader,
+ Collection<Class<? extends Parser>> excludeParsers) {
+ super(
+ registry,
+ getDefaultParsers(
+ loader,
+ new DefaultEncodingDetector(loader),
+ new CompositeRenderer(loader),
+ excludeParsers));
this.loader = loader;
}
- public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader,
- EncodingDetector encodingDetector, Renderer renderer) {
+ public DefaultParser(
+ MediaTypeRegistry registry,
+ ServiceLoader loader,
+ EncodingDetector encodingDetector,
+ Renderer renderer) {
this(registry, loader, Collections.EMPTY_SET, encodingDetector, renderer);
}
public DefaultParser(MediaTypeRegistry registry, ServiceLoader loader) {
- this(registry, loader, Collections.EMPTY_SET, new DefaultEncodingDetector(loader),
+ this(
+ registry,
+ loader,
+ Collections.EMPTY_SET,
+ new DefaultEncodingDetector(loader),
new CompositeRenderer(loader));
}
@@ -88,21 +101,19 @@
}
/**
- * Finds all statically loadable parsers and sort the list by name,
- * rather than discovery order. CompositeParser takes the last
- * parser for any given media type, so put the Tika parsers first
+ * Finds all statically loadable parsers and sort the list by name, rather than discovery order.
+ * CompositeParser takes the last parser for any given media type, so put the Tika parsers first
* so that non-Tika (user supplied) parsers can take precedence.
*
* @param loader service loader
* @return ordered list of statically loadable parsers
*/
- private static List<Parser> getDefaultParsers(ServiceLoader loader,
- EncodingDetector encodingDetector,
- Renderer renderer,
- Collection<Class<? extends Parser>>
- excludeParsers) {
- List<Parser> parsers =
- loader.loadStaticServiceProviders(Parser.class, excludeParsers);
+ private static List<Parser> getDefaultParsers(
+ ServiceLoader loader,
+ EncodingDetector encodingDetector,
+ Renderer renderer,
+ Collection<Class<? extends Parser>> excludeParsers) {
+ List<Parser> parsers = loader.loadStaticServiceProviders(Parser.class, excludeParsers);
if (encodingDetector != null) {
for (Parser p : parsers) {
@@ -115,14 +126,14 @@
}
}
ServiceLoaderUtils.sortLoadedClasses(parsers);
- //reverse the order of parsers so that custom ones come last
- //this will prevent them from being overwritten in getParsers(ParseContext ..)
+ // reverse the order of parsers so that custom ones come last
+ // this will prevent them from being overwritten in getParsers(ParseContext ..)
Collections.reverse(parsers);
return parsers;
}
- //recursively go through the parsers and set the encoding detector
- //as configured in the config file
+ // recursively go through the parsers and set the encoding detector
+ // as configured in the config file
private static void setEncodingDetector(Parser p, EncodingDetector encodingDetector) {
if (p instanceof AbstractEncodingDetectorParser) {
((AbstractEncodingDetectorParser) p).setEncodingDetector(encodingDetector);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
index f2e007c..e7ddcc7 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DelegatingParser.java
@@ -19,30 +19,26 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * Base class for parser implementations that want to delegate parts of the
- * task of parsing an input document to another parser. The delegate parser
- * is looked up from the parsing context using the {@link Parser} class as
- * the key.
+ * Base class for parser implementations that want to delegate parts of the task of parsing an input
+ * document to another parser. The delegate parser is looked up from the parsing context using the
+ * {@link Parser} class as the key.
*
* @since Apache Tika 0.4, major changes in Tika 0.5
*/
public class DelegatingParser implements Parser {
/**
- * Returns the parser instance to which parsing tasks should be delegated.
- * The default implementation looks up the delegate parser from the given
- * parse context, and uses an {@link EmptyParser} instance as a fallback.
- * Subclasses can override this method to implement alternative delegation
- * strategies.
+ * Returns the parser instance to which parsing tasks should be delegated. The default
+ * implementation looks up the delegate parser from the given parse context, and uses an {@link
+ * EmptyParser} instance as a fallback. Subclasses can override this method to implement
+ * alternative delegation strategies.
*
* @param context parse context
* @return delegate parser
@@ -57,18 +53,16 @@
}
/**
- * Looks up the delegate parser from the parsing context and
- * delegates the parse operation to it. If a delegate parser is not
- * found, then an empty XHTML document is returned.
- * <p>
- * Subclasses should override this method to parse the top level
- * structure of the given document stream. Parsed sub-streams can
- * be passed to this base class method to be parsed by the configured
- * delegate parser.
+ * Looks up the delegate parser from the parsing context and delegates the parse operation to
+ * it. If a delegate parser is not found, then an empty XHTML document is returned.
+ *
+ * <p>Subclasses should override this method to parse the top level structure of the given
+ * document stream. Parsed sub-streams can be passed to this base class method to be parsed by
+ * the configured delegate parser.
*/
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws SAXException, IOException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws SAXException, IOException, TikaException {
getDelegateParser(context).parse(stream, handler, metadata, context);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
index 8c0358d..65d0e6e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/DigestingParser.java
@@ -17,23 +17,21 @@
package org.apache.tika.parser;
-
import java.io.IOException;
import java.io.InputStream;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
public class DigestingParser extends ParserDecorator {
private final Digester digester;
private final boolean skipContainerDocument;
+
/**
* Creates a decorator for the given parser.
*
@@ -46,8 +44,9 @@
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
try {
@@ -64,7 +63,7 @@
if (digester == null) {
return false;
}
- if (! skipContainerDocument) {
+ if (!skipContainerDocument) {
return true;
}
Integer parseDepth = metadata.getInt(TikaCoreProperties.EMBEDDED_DEPTH);
@@ -75,43 +74,40 @@
}
/**
- * This is used in {@link AutoDetectParserConfig} to (optionally)
- * wrap the parser in a digesting parser.
+ * This is used in {@link AutoDetectParserConfig} to (optionally) wrap the parser in a digesting
+ * parser.
*/
public interface DigesterFactory {
Digester build();
+
void setSkipContainerDocument(boolean skipContainerDocument);
+
boolean isSkipContainerDocument();
}
- /**
- * Interface for digester. See
- * org.apache.parser.utils.CommonsDigester in tika-parsers for an
+ /**
+     * Interface for digester. See org.apache.tika.parser.utils.CommonsDigester in tika-parsers for an
* implementation.
*/
public interface Digester {
/**
- * Digests an InputStream and sets the appropriate value(s) in the metadata.
- * The Digester is also responsible for marking and resetting the stream.
- * <p>
- * The given stream is guaranteed to support the
- * {@link InputStream#markSupported() mark feature} and the detector
- * is expected to {@link InputStream#mark(int) mark} the stream before
- * reading any bytes from it, and to {@link InputStream#reset() reset}
- * the stream before returning. The stream must not be closed by the
- * detector.
+ * Digests an InputStream and sets the appropriate value(s) in the metadata. The Digester is
+ * also responsible for marking and resetting the stream.
*
- * @param is InputStream to digest
- * @param m Metadata to set the values for
+ * <p>The given stream is guaranteed to support the {@link InputStream#markSupported() mark
+ * feature} and the detector is expected to {@link InputStream#mark(int) mark} the stream
+ * before reading any bytes from it, and to {@link InputStream#reset() reset} the stream
+ * before returning. The stream must not be closed by the detector.
+ *
+ * @param is InputStream to digest
+ * @param m Metadata to set the values for
* @param parseContext ParseContext
* @throws IOException
*/
void digest(InputStream is, Metadata m, ParseContext parseContext) throws IOException;
}
- /**
- * Encodes byte array from a MessageDigest to String
- */
+ /** Encodes byte array from a MessageDigest to String */
public interface Encoder {
String encode(byte[] bytes);
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java b/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java
index 546d0c2..774da24 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/EmptyParser.java
@@ -19,35 +19,30 @@
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * Dummy parser that always produces an empty XHTML document without even
- * attempting to parse the given document stream. Useful as a sentinel parser
- * for unknown document types.
+ * Dummy parser that always produces an empty XHTML document without even attempting to parse the
+ * given document stream. Useful as a sentinel parser for unknown document types.
*/
public class EmptyParser implements Parser {
- /**
- * Singleton instance of this class.
- */
+ /** Singleton instance of this class. */
public static final EmptyParser INSTANCE = new EmptyParser();
- /**
- * Serial version UID.
- */
+
+ /** Serial version UID. */
private static final long serialVersionUID = -4218649699095732123L;
public Set<MediaType> getSupportedTypes(ParseContext context) {
return Collections.emptySet();
}
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws SAXException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws SAXException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
xhtml.endDocument();
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java b/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java
index b8071cb..37b946c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ErrorParser.java
@@ -19,31 +19,28 @@
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
/**
- * Dummy parser that always throws a {@link TikaException} without even
- * attempting to parse the given document stream. Useful as a sentinel parser
- * for unknown document types.
+ * Dummy parser that always throws a {@link TikaException} without even attempting to parse the
+ * given document stream. Useful as a sentinel parser for unknown document types.
*/
public class ErrorParser implements Parser {
- /**
- * Singleton instance of this class.
- */
+ /** Singleton instance of this class. */
public static final ErrorParser INSTANCE = new ErrorParser();
+
private static final long serialVersionUID = 7727423956957641824L;
public Set<MediaType> getSupportedTypes(ParseContext context) {
return Collections.emptySet();
}
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws TikaException {
throw new TikaException("Parse error");
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java b/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
index 822512d..51e1ca6 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/NetworkParser.java
@@ -26,14 +26,8 @@
import java.net.URLConnection;
import java.util.Collections;
import java.util.Set;
-
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -42,7 +36,10 @@
import org.apache.tika.sax.TaggedContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
-
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
public class NetworkParser implements Parser {
@@ -63,8 +60,9 @@
return supportedTypes;
}
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
@@ -74,16 +72,20 @@
}
}
- private void parse(TikaInputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ private void parse(
+ TikaInputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
if ("telnet".equals(uri.getScheme())) {
try (Socket socket = new Socket(uri.getHost(), uri.getPort())) {
- new ParsingTask(stream, new FilterOutputStream(socket.getOutputStream()) {
- @Override
- public void close() throws IOException {
- socket.shutdownOutput();
- }
- }).parse(socket.getInputStream(), handler, metadata, context);
+ new ParsingTask(
+ stream,
+ new FilterOutputStream(socket.getOutputStream()) {
+ @Override
+ public void close() throws IOException {
+ socket.shutdownOutput();
+ }
+ })
+ .parse(socket.getInputStream(), handler, metadata, context);
}
} else {
URL url = uri.toURL();
@@ -95,7 +97,6 @@
.parse(CloseShieldInputStream.wrap(input), handler, metadata, context);
}
}
-
}
private static class ParsingTask implements Runnable {
@@ -111,17 +112,16 @@
this.output = output;
}
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
Thread thread = new Thread(this, "Tika network parser");
thread.start();
- TaggedContentHandler tagged =
- new TaggedContentHandler(handler);
+ TaggedContentHandler tagged = new TaggedContentHandler(handler);
try {
- XMLReaderUtils
- .parseSAX(stream, new TeeContentHandler(tagged, new MetaHandler(metadata)),
- context);
+ XMLReaderUtils.parseSAX(
+ stream, new TeeContentHandler(tagged, new MetaHandler(metadata)), context);
} catch (SAXException e) {
tagged.throwIfCauseOf(e);
throw new TikaException("Invalid network parser output", e);
@@ -141,7 +141,7 @@
}
}
- //----------------------------------------------------------<Runnable>
+ // ----------------------------------------------------------<Runnable>
public void run() {
try {
@@ -154,7 +154,6 @@
exception = e;
}
}
-
}
private static class MetaHandler extends DefaultHandler {
@@ -176,7 +175,5 @@
}
}
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index 531f1da..ba4781b 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -28,15 +28,13 @@
import javax.xml.parsers.SAXParserFactory;
import javax.xml.stream.XMLInputFactory;
import javax.xml.transform.Transformer;
-
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.utils.XMLReaderUtils;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.utils.XMLReaderUtils;
-
/**
* Parse context. Used to pass context information to Tika parsers.
*
@@ -45,21 +43,16 @@
*/
public class ParseContext implements Serializable {
- /**
- * Serial version UID.
- */
+ /** Serial version UID. */
private static final long serialVersionUID = -5921436862145826534L;
- /**
- * Map of objects in this context
- */
+ /** Map of objects in this context */
private final Map<String, Object> context = new HashMap<>();
/**
- * Adds the given value to the context as an implementation of the given
- * interface.
+ * Adds the given value to the context as an implementation of the given interface.
*
- * @param key the interface implemented by the given value
+ * @param key the interface implemented by the given value
* @param value the value to be added, or <code>null</code> to remove
*/
public <T> void set(Class<T> key, T value) {
@@ -74,8 +67,7 @@
* Returns the object in this context that implements the given interface.
*
* @param key the interface implemented by the requested object
- * @return the object that implements the given interface,
- * or <code>null</code> if not found
+ * @return the object that implements the given interface, or <code>null</code> if not found
*/
@SuppressWarnings("unchecked")
public <T> T get(Class<T> key) {
@@ -83,13 +75,13 @@
}
/**
- * Returns the object in this context that implements the given interface,
- * or the given default value if such an object is not found.
+ * Returns the object in this context that implements the given interface, or the given default
+ * value if such an object is not found.
*
- * @param key the interface implemented by the requested object
+ * @param key the interface implemented by the requested object
* @param defaultValue value to return if the requested object is not found
- * @return the object that implements the given interface,
- * or the given default value if not found
+ * @return the object that implements the given interface, or the given default value if not
+ * found
*/
public <T> T get(Class<T> key, T defaultValue) {
T value = get(key);
@@ -101,9 +93,8 @@
}
/**
- * Returns the XMLReader specified in this parsing context. If a reader
- * is not explicitly specified, then one is created using the specified
- * or the default SAX parser.
+ * Returns the XMLReader specified in this parsing context. If a reader is not explicitly
+ * specified, then one is created using the specified or the default SAX parser.
*
* @return XMLReader
* @throws TikaException
@@ -119,11 +110,10 @@
}
/**
- * Returns the SAX parser specified in this parsing context. If a parser
- * is not explicitly specified, then one is created using the specified
- * or the default SAX parser factory. Consider using
- * {@link XMLReaderUtils#parseSAX(InputStream, DefaultHandler, ParseContext)}
- * for more efficient reuse of SAXParsers.
+ * Returns the SAX parser specified in this parsing context. If a parser is not explicitly
+ * specified, then one is created using the specified or the default SAX parser factory.
+ * Consider using {@link XMLReaderUtils#parseSAX(InputStream, DefaultHandler, ParseContext)} for
+ * more efficient reuse of SAXParsers.
*
* @return SAX parser
* @throws TikaException if a SAX parser could not be created
@@ -140,11 +130,10 @@
}
/**
- * Returns the SAX parser factory specified in this parsing context.
- * If a factory is not explicitly specified, then a default factory
- * instance is created and returned. The default factory instance is
- * configured to be namespace-aware, not validating, and to use
- * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
+ * Returns the SAX parser factory specified in this parsing context. If a factory is not
+ * explicitly specified, then a default factory instance is created and returned. The default
+ * factory instance is configured to be namespace-aware, not validating, and to use {@link
+ * XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
*
* @return SAX parser factory
* @since Apache Tika 0.8
@@ -158,7 +147,7 @@
try {
factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
} catch (ParserConfigurationException | SAXNotSupportedException e) {
- //swallow
+ // swallow
} catch (SAXNotRecognizedException e) {
// TIKA-271: Some XML parsers do not support the
// secure-processing feature, even though it's required by
@@ -171,17 +160,16 @@
}
/**
- * Returns the DOM builder factory specified in this parsing context.
- * If a factory is not explicitly specified, then a default factory
- * instance is created and returned. The default factory instance is
- * configured to be namespace-aware and to apply reasonable security
+ * Returns the DOM builder factory specified in this parsing context. If a factory is not
+ * explicitly specified, then a default factory instance is created and returned. The default
+ * factory instance is configured to be namespace-aware and to apply reasonable security
* features.
*
* @return DOM parser factory
* @since Apache Tika 1.13
*/
private DocumentBuilderFactory getDocumentBuilderFactory() {
- //borrowed from Apache POI
+ // borrowed from Apache POI
DocumentBuilderFactory documentBuilderFactory = get(DocumentBuilderFactory.class);
if (documentBuilderFactory != null) {
return documentBuilderFactory;
@@ -191,13 +179,11 @@
}
/**
- * Returns the DOM builder specified in this parsing context.
- * If a builder is not explicitly specified, then a builder
- * instance is created and returned. The builder instance is
- * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER},
- * and it sets the ErrorHandler to <code>null</code>.
- * Consider using {@link XMLReaderUtils#buildDOM(InputStream, ParseContext)}
- * instead for more efficient reuse of document builders.
+ * Returns the DOM builder specified in this parsing context. If a builder is not explicitly
+ * specified, then a builder instance is created and returned. The builder instance is
+ * configured to apply an {@link XMLReaderUtils#IGNORING_SAX_ENTITY_RESOLVER}, and it sets the
+ * ErrorHandler to <code>null</code>. Consider using {@link XMLReaderUtils#buildDOM(InputStream,
+ * ParseContext)} instead for more efficient reuse of document builders.
*
* @return DOM Builder
* @since Apache Tika 1.13
@@ -212,11 +198,10 @@
}
/**
- * Returns the StAX input factory specified in this parsing context.
- * If a factory is not explicitly specified, then a default factory
- * instance is created and returned. The default factory instance is
- * configured to be namespace-aware and to apply reasonable security
- * using the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}.
+ * Returns the StAX input factory specified in this parsing context. If a factory is not
+ * explicitly specified, then a default factory instance is created and returned. The default
+ * factory instance is configured to be namespace-aware and to apply reasonable security using
+ * the {@link XMLReaderUtils#IGNORING_STAX_ENTITY_RESOLVER}.
*
* @return StAX input factory
* @since Apache Tika 1.13
@@ -229,14 +214,12 @@
return XMLReaderUtils.getXMLInputFactory();
}
-
/**
* Returns the transformer specified in this parsing context.
- * <p>
- * If a transformer is not explicitly specified, then a default transformer
- * instance is created and returned. The default transformer instance is
- * configured to to use
- * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
+ *
+ * <p>If a transformer is not explicitly specified, then a default transformer instance is
+     * created and returned. The default transformer instance is configured to use {@link
+ * XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
*
* @return Transformer
* @throws TikaException when the transformer can not be created
@@ -251,5 +234,4 @@
return XMLReaderUtils.getTransformer();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
index ca0edc5..267f099 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
@@ -20,18 +20,16 @@
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;
-
import org.apache.tika.metadata.Metadata;
/**
- * Use this class to store exceptions, warnings and other information
- * during the parse. This information is added to the parent's metadata
- * after the parse by the {@link CompositeParser}.
+ * Use this class to store exceptions, warnings and other information during the parse. This
+ * information is added to the parent's metadata after the parse by the {@link CompositeParser}.
*/
public class ParseRecord {
- //hard limits so that specially crafted files
- //don't cause an OOM
+ // hard limits so that specially crafted files
+ // don't cause an OOM
private static int MAX_PARSERS = 100;
private static final int MAX_EXCEPTIONS = 100;
@@ -103,7 +101,6 @@
return warnings;
}
-
public boolean isWriteLimitReached() {
return writeLimitReached;
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/Parser.java b/tika-core/src/main/java/org/apache/tika/parser/Parser.java
index 4488288..6aa9dfa 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/Parser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/Parser.java
@@ -20,22 +20,18 @@
import java.io.InputStream;
import java.io.Serializable;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
-/**
- * Tika parser interface.
- */
+/** Tika parser interface. */
public interface Parser extends Serializable {
/**
- * Returns the set of media types supported by this parser when used
- * with the given parse context.
+ * Returns the set of media types supported by this parser when used with the given parse
+ * context.
*
* @param context parse context
* @return immutable set of media types
@@ -44,26 +40,24 @@
Set<MediaType> getSupportedTypes(ParseContext context);
/**
- * Parses a document stream into a sequence of XHTML SAX events.
- * Fills in related document metadata in the given metadata object.
- * <p>
- * The given document stream is consumed but not closed by this method.
- * The responsibility to close the stream remains on the caller.
- * <p>
- * Information about the parsing context can be passed in the context
- * parameter. See the parser implementations for the kinds of context
- * information they expect.
+ * Parses a document stream into a sequence of XHTML SAX events. Fills in related document
+ * metadata in the given metadata object.
*
- * @param stream the document stream (input)
- * @param handler handler for the XHTML SAX events (output)
+ * <p>The given document stream is consumed but not closed by this method. The responsibility to
+ * close the stream remains on the caller.
+ *
+ * <p>Information about the parsing context can be passed in the context parameter. See the
+ * parser implementations for the kinds of context information they expect.
+ *
+ * @param stream the document stream (input)
+ * @param handler handler for the XHTML SAX events (output)
* @param metadata document metadata (input and output)
- * @param context parse context
- * @throws IOException if the document stream could not be read
- * @throws SAXException if the SAX events could not be processed
+ * @param context parse context
+ * @throws IOException if the document stream could not be read
+ * @throws SAXException if the SAX events could not be processed
* @throws TikaException if the document could not be parsed
* @since Apache Tika 0.5
*/
void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException;
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java b/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
index 32d6661..7e0f655 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParserDecorator.java
@@ -21,34 +21,30 @@
import java.util.Collection;
import java.util.HashSet;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
import org.apache.tika.parser.multiple.FallbackParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* Decorator base class for the {@link Parser} interface.
- * <p>This class simply delegates all parsing calls to an underlying decorated
- * parser instance. Subclasses can provide extra decoration by overriding the
- * parse method.
- * <p>To decorate several different parsers at the same time, wrap them in
- * a {@link CompositeParser} instance first.
+ *
+ * <p>This class simply delegates all parsing calls to an underlying decorated parser instance.
+ * Subclasses can provide extra decoration by overriding the parse method.
+ *
+ * <p>To decorate several different parsers at the same time, wrap them in a {@link CompositeParser}
+ * instance first.
*/
public class ParserDecorator implements Parser {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -3861669115439125268L;
- /**
- * The decorated parser instance.
- */
+
+ /** The decorated parser instance. */
private final Parser parser;
/**
@@ -61,11 +57,11 @@
}
/**
- * Decorates the given parser so that it always claims to support
- * parsing of the given media types.
+ * Decorates the given parser so that it always claims to support parsing of the given media
+ * types.
*
* @param parser the parser to be decorated
- * @param types supported media types
+ * @param types supported media types
* @return the decorated parser
*/
public static final Parser withTypes(Parser parser, final Set<MediaType> types) {
@@ -85,10 +81,10 @@
}
/**
- * Decorates the given parser so that it never claims to support
- * parsing of the given media types, but will work for all others.
+ * Decorates the given parser so that it never claims to support parsing of the given media
+ * types, but will work for all others.
*
- * @param parser the parser to be decorated
+ * @param parser the parser to be decorated
* @param excludeTypes excluded/ignored media types
* @return the decorated parser
*/
@@ -99,8 +95,7 @@
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
// Get our own, writable copy of the types the parser supports
- Set<MediaType> parserTypes =
- new HashSet<>(super.getSupportedTypes(context));
+ Set<MediaType> parserTypes = new HashSet<>(super.getSupportedTypes(context));
// Remove anything on our excludes list
parserTypes.removeAll(excludeTypes);
// Return whatever is left
@@ -115,14 +110,14 @@
}
/**
- * Decorates the given parsers into a virtual parser, where they'll
- * be tried in preference order until one works without error.
+ * Decorates the given parsers into a virtual parser, where they'll be tried in preference order
+ * until one works without error.
*
* @deprecated This has been replaced by {@link FallbackParser}
*/
@Deprecated
- public static final Parser withFallbacks(final Collection<? extends Parser> parsers,
- final Set<MediaType> types) {
+ public static final Parser withFallbacks(
+ final Collection<? extends Parser> parsers, final Set<MediaType> types) {
// Delegate to the new FallbackParser for now, until people upgrade
// Keep old behaviour on metadata, which was to preseve all
MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry();
@@ -135,21 +130,22 @@
}
/**
- * Delegates the method call to the decorated parser. Subclasses should
- * override this method (and use <code>super.getSupportedTypes()</code>
- * to invoke the decorated parser) to implement extra decoration.
+ * Delegates the method call to the decorated parser. Subclasses should override this method
+ * (and use <code>super.getSupportedTypes()</code> to invoke the decorated parser) to implement
+ * extra decoration.
*/
public Set<MediaType> getSupportedTypes(ParseContext context) {
return parser.getSupportedTypes(context);
}
/**
- * Delegates the method call to the decorated parser. Subclasses should
- * override this method (and use <code>super.parse()</code> to invoke
- * the decorated parser) to implement extra decoration.
+ * Delegates the method call to the decorated parser. Subclasses should override this method
+ * (and use <code>super.parse()</code> to invoke the decorated parser) to implement extra
+ * decoration.
*/
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
parser.parse(stream, handler, metadata, context);
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java b/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java
index af541b0..4df3a13 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParserFactory.java
@@ -17,13 +17,10 @@
package org.apache.tika.parser;
-
import java.io.IOException;
import java.util.Map;
-
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
+import org.xml.sax.SAXException;
public abstract class ParserFactory {
@@ -34,5 +31,4 @@
}
public abstract Parser build() throws IOException, SAXException, TikaException;
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java b/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
index 308fa7e..7d3b55d 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParserPostProcessor.java
@@ -18,21 +18,18 @@
import java.io.IOException;
import java.io.InputStream;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.utils.RegexUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * Parser decorator that post-processes the results from a decorated parser.
- * The post-processing takes care of filling in the "fulltext", "summary",
- * and "outlinks" metadata entries based on the full text content returned by
- * the decorated parser.
+ * Parser decorator that post-processes the results from a decorated parser. The post-processing
+ * takes care of filling in the "fulltext", "summary", and "outlinks" metadata entries based on the
+ * full text content returned by the decorated parser.
*/
public class ParserPostProcessor extends ParserDecorator {
@@ -46,11 +43,11 @@
}
/**
- * Forwards the call to the delegated parser and post-processes the
- * results as described above.
+ * Forwards the call to the delegated parser and post-processes the results as described above.
*/
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
ContentHandler body = new BodyContentHandler();
ContentHandler tee = new TeeContentHandler(handler, body);
super.parse(stream, tee, metadata, context);
@@ -65,5 +62,4 @@
metadata.add("outlinks", link);
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java b/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java
index fe98e74..b8f560c 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParsingReader.java
@@ -29,59 +29,42 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.Executor;
-
-import org.xml.sax.ContentHandler;
-
import org.apache.tika.exception.ZeroByteFileException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.ContentHandler;
/**
- * Reader for the text content from a given binary stream. This class
- * uses a background parsing task with a {@link Parser}
- * ({@link AutoDetectParser} by default) to parse the text content from
- * a given input stream. The {@link BodyContentHandler} class and a pipe
- * is used to convert the push-based SAX event stream to the pull-based
- * character stream defined by the {@link Reader} interface.
+ * Reader for the text content from a given binary stream. This class uses a background parsing task
+ * with a {@link Parser} ({@link AutoDetectParser} by default) to parse the text content from a
+ * given input stream. The {@link BodyContentHandler} class and a pipe is used to convert the
+ * push-based SAX event stream to the pull-based character stream defined by the {@link Reader}
+ * interface.
*
* @since Apache Tika 0.2
*/
public class ParsingReader extends Reader {
- /**
- * Parser instance used for parsing the given binary stream.
- */
+ /** Parser instance used for parsing the given binary stream. */
private final Parser parser;
- /**
- * Buffered read end of the pipe.
- */
+ /** Buffered read end of the pipe. */
private final Reader reader;
- /**
- * Write end of the pipe.
- */
+ /** Write end of the pipe. */
private final Writer writer;
- /**
- * The binary stream being parsed.
- */
+ /** The binary stream being parsed. */
private final InputStream stream;
- /**
- * Metadata associated with the document being parsed.
- */
+ /** Metadata associated with the document being parsed. */
private final Metadata metadata;
- /**
- * The parse context.
- */
+ /** The parse context. */
private final ParseContext context;
- /**
- * An exception (if any) thrown by the parsing thread.
- */
+ /** An exception (if any) thrown by the parsing thread. */
private transient Throwable throwable;
/**
@@ -96,11 +79,10 @@
}
/**
- * Creates a reader for the text content of the given binary stream
- * with the given name.
+ * Creates a reader for the text content of the given binary stream with the given name.
*
* @param stream binary stream
- * @param name document name
+ * @param name document name
* @throws IOException if the document can not be parsed
*/
public ParsingReader(InputStream stream, String name) throws IOException {
@@ -113,7 +95,7 @@
*
* @param path path
* @throws FileNotFoundException if the given file does not exist
- * @throws IOException if the document can not be parsed
+ * @throws IOException if the document can not be parsed
*/
public ParsingReader(Path path) throws IOException {
this(Files.newInputStream(path), path.getFileName().toString());
@@ -124,7 +106,7 @@
*
* @param file file
* @throws FileNotFoundException if the given file does not exist
- * @throws IOException if the document can not be parsed
+ * @throws IOException if the document can not be parsed
* @see #ParsingReader(Path)
*/
public ParsingReader(File file) throws FileNotFoundException, IOException {
@@ -132,56 +114,66 @@
}
/**
- * Creates a reader for the text content of the given binary stream
- * with the given document metadata. The given parser is used for
- * parsing. A new background thread is started for the parsing task.
- * <p>
- * The created reader will be responsible for closing the given stream.
- * The stream and any associated resources will be closed at or before
- * the time when the {@link #close()} method is called on this reader.
+ * Creates a reader for the text content of the given binary stream with the given document
+ * metadata. The given parser is used for parsing. A new background thread is started for the
+ * parsing task.
*
- * @param parser parser instance
- * @param stream binary stream
+ * <p>The created reader will be responsible for closing the given stream. The stream and any
+ * associated resources will be closed at or before the time when the {@link #close()} method is
+ * called on this reader.
+ *
+ * @param parser parser instance
+ * @param stream binary stream
* @param metadata document metadata
* @throws IOException if the document can not be parsed
*/
- public ParsingReader(Parser parser, InputStream stream, final Metadata metadata,
- ParseContext context) throws IOException {
- this(parser, stream, metadata, context, command -> {
- String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
- if (name != null) {
- name = "Apache Tika: " + name;
- } else {
- name = "Apache Tika";
- }
- Thread thread = new Thread(command, name);
- thread.setDaemon(true);
- thread.start();
- });
+ public ParsingReader(
+ Parser parser, InputStream stream, final Metadata metadata, ParseContext context)
+ throws IOException {
+ this(
+ parser,
+ stream,
+ metadata,
+ context,
+ command -> {
+ String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+ if (name != null) {
+ name = "Apache Tika: " + name;
+ } else {
+ name = "Apache Tika";
+ }
+ Thread thread = new Thread(command, name);
+ thread.setDaemon(true);
+ thread.start();
+ });
}
/**
- * Creates a reader for the text content of the given binary stream
- * with the given document metadata. The given parser is used for the
- * parsing task that is run with the given executor. The given executor
- * <em>must</em> run the parsing task asynchronously in a separate thread,
- * since the current thread must return to the caller that can then
- * consume the parsed text through the {@link Reader} interface.
- * <p>
- * The created reader will be responsible for closing the given stream.
- * The stream and any associated resources will be closed at or before
- * the time when the {@link #close()} method is called on this reader.
+ * Creates a reader for the text content of the given binary stream with the given document
+ * metadata. The given parser is used for the parsing task that is run with the given executor.
+ * The given executor <em>must</em> run the parsing task asynchronously in a separate thread,
+ * since the current thread must return to the caller that can then consume the parsed text
+ * through the {@link Reader} interface.
*
- * @param parser parser instance
- * @param stream binary stream
+ * <p>The created reader will be responsible for closing the given stream. The stream and any
+ * associated resources will be closed at or before the time when the {@link #close()} method is
+ * called on this reader.
+ *
+ * @param parser parser instance
+ * @param stream binary stream
* @param metadata document metadata
- * @param context parsing context
+ * @param context parsing context
* @param executor executor for the parsing task
* @throws IOException if the document can not be parsed
* @since Apache Tika 0.4
*/
- public ParsingReader(Parser parser, InputStream stream, Metadata metadata, ParseContext context,
- Executor executor) throws IOException {
+ public ParsingReader(
+ Parser parser,
+ InputStream stream,
+ Metadata metadata,
+ ParseContext context,
+ Executor executor)
+ throws IOException {
this.parser = parser;
PipedReader pipedReader = new PipedReader();
this.reader = new BufferedReader(pipedReader);
@@ -203,8 +195,7 @@
}
/**
- * Utility method that returns a {@link Metadata} instance
- * for a document with the given name.
+ * Utility method that returns a {@link Metadata} instance for a document with the given name.
*
* @param name resource name (or <code>null</code>)
* @return metadata instance
@@ -218,14 +209,14 @@
}
/**
- * Reads parsed text from the pipe connected to the parsing thread.
- * Fails if the parsing thread has thrown an exception.
+ * Reads parsed text from the pipe connected to the parsing thread. Fails if the parsing thread
+ * has thrown an exception.
*
* @param cbuf character buffer
- * @param off start offset within the buffer
- * @param len maximum number of characters to read
- * @throws IOException if the parsing thread has failed or
- * if for some reason the pipe does not work properly
+ * @param off start offset within the buffer
+ * @param len maximum number of characters to read
+ * @throws IOException if the parsing thread has failed or if for some reason the pipe does not
+ * work properly
*/
@Override
public int read(char[] cbuf, int off, int len) throws IOException {
@@ -240,9 +231,9 @@
}
/**
- * Closes the read end of the pipe. If the parsing thread is still
- * running, next write to the pipe will fail and cause the thread
- * to stop. Thus there is no need to explicitly terminate the thread.
+ * Closes the read end of the pipe. If the parsing thread is still running, next write to the
+ * pipe will fail and cause the thread to stop. Thus there is no need to explicitly terminate
+ * the thread.
*
* @throws IOException if the pipe can not be closed
*/
@@ -251,16 +242,13 @@
reader.close();
}
- /**
- * The background parsing task.
- */
+ /** The background parsing task. */
private class ParsingTask implements Runnable {
/**
- * Parses the given binary stream and writes the text content
- * to the write end of the pipe. Potential exceptions (including
- * the one caused if the read end is closed unexpectedly) are
- * stored before the input stream is closed and processing is stopped.
+ * Parses the given binary stream and writes the text content to the write end of the pipe.
+ * Potential exceptions (including the one caused if the read end is closed unexpectedly)
+ * are stored before the input stream is closed and processing is stopped.
*/
public void run() {
try {
@@ -286,7 +274,5 @@
}
}
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java b/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java
index b14badd..61dd2b3 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/PasswordProvider.java
@@ -19,23 +19,19 @@
import org.apache.tika.metadata.Metadata;
/**
- * Interface for providing a password to a Parser for handling Encrypted
- * and Password Protected Documents.
- * An implementation of this should be set on the {@link ParseContext}
- * supplied to {@link Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler,
- * Metadata, ParseContext)}
- * to provide a way to get the document password.
- * An implementation of this interface defines some specific selection
- * or lookup criteria, to be applied against the document metadata passed
- * to the {@link #getPassword(Metadata)} method.
+ * Interface for providing a password to a Parser for handling Encrypted and Password Protected
+ * Documents. An implementation of this should be set on the {@link ParseContext} supplied to {@link
+ * Parser#parse(java.io.InputStream, org.xml.sax.ContentHandler, Metadata, ParseContext)} to provide
+ * a way to get the document password. An implementation of this interface defines some specific
+ * selection or lookup criteria, to be applied against the document metadata passed to the {@link
+ * #getPassword(Metadata)} method.
*
* @since Apache Tika 1.1
*/
public interface PasswordProvider {
/**
- * Looks up the password for a document with the given metadata,
- * and returns it for the Parser. If no password is available
- * for the document, will return null.
+ * Looks up the password for a document with the given metadata, and returns it for the Parser.
+ * If no password is available for the document, will return null.
*
* @param metadata document metadata
* @return The document decryption password, or <code>null</code> if not known
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
index 3cb78d5..6037893 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java
@@ -19,11 +19,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.CorruptedFileException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
@@ -42,49 +37,44 @@
import org.apache.tika.sax.WriteLimiter;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.ParserUtils;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * This is a helper class that wraps a parser in a recursive handler.
- * It takes care of setting the embedded parser in the ParseContext
- * and handling the embedded path calculations.
- * <p>
- * After parsing a document, call getMetadata() to retrieve a list of
- * Metadata objects, one for each embedded resource. The first item
- * in the list will contain the Metadata for the outer container file.
- * <p>
- * Content can also be extracted and stored in the {@link TikaCoreProperties#TIKA_CONTENT} field
- * of a Metadata object. Select the type of content to be stored
- * at initialization.
- * <p>
- * If a WriteLimitReachedException is encountered, the wrapper will stop
- * processing the current resource, and it will not process
- * any of the child resources for the given resource. However, it will try to
- * parse as much as it can. If a WLRE is reached in the parent document,
- * no child resources will be parsed.
- * <p>
- * The implementation is based on Jukka's RecursiveMetadataParser
- * and Nick's additions. See:
- * <a href="http://wiki.apache.org/tika/RecursiveMetadata#Jukka.27s_RecursiveMetadata_Parser">RecursiveMetadataParser</a>.
- * <p>
- * Note that this wrapper holds all data in memory and is not appropriate
- * for files with content too large to be held in memory.
- * <p>
- * The unit tests for this class are in the tika-parsers module.
- * </p>
+ * This is a helper class that wraps a parser in a recursive handler. It takes care of setting the
+ * embedded parser in the ParseContext and handling the embedded path calculations.
+ *
+ * <p>After parsing a document, call getMetadata() to retrieve a list of Metadata objects, one for
+ * each embedded resource. The first item in the list will contain the Metadata for the outer
+ * container file.
+ *
+ * <p>Content can also be extracted and stored in the {@link TikaCoreProperties#TIKA_CONTENT} field
+ * of a Metadata object. Select the type of content to be stored at initialization.
+ *
+ * <p>If a WriteLimitReachedException is encountered, the wrapper will stop processing the current
+ * resource, and it will not process any of the child resources for the given resource. However, it
+ * will try to parse as much as it can. If a WLRE is reached in the parent document, no child
+ * resources will be parsed.
+ *
+ * <p>The implementation is based on Jukka's RecursiveMetadataParser and Nick's additions. See: <a
+ * href="http://wiki.apache.org/tika/RecursiveMetadata#Jukka.27s_RecursiveMetadata_Parser">RecursiveMetadataParser</a>.
+ *
+ * <p>Note that this wrapper holds all data in memory and is not appropriate for files with content
+ * too large to be held in memory.
+ *
+ * <p>The unit tests for this class are in the tika-parsers module.
*/
public class RecursiveParserWrapper extends ParserDecorator {
- /**
- * Generated serial version
- */
+ /** Generated serial version */
private static final long serialVersionUID = 9086536568120690938L;
-
private final boolean catchEmbeddedExceptions;
/**
- * Initialize the wrapper with {@link #catchEmbeddedExceptions} set
- * to <code>true</code> as default.
+ * Initialize the wrapper with {@link #catchEmbeddedExceptions} set to <code>true</code> as
+ * default.
*
* @param wrappedParser parser to use for the container documents and the embedded documents
*/
@@ -93,29 +83,26 @@
}
/**
- * @param wrappedParser parser to wrap
- * @param catchEmbeddedExceptions whether or not to catch+record embedded exceptions.
- * If set to <code>false</code>, embedded exceptions will be
- * thrown and the rest of the file will not be parsed. The
- * following will not be ignored:
- * {@link CorruptedFileException}, {@link RuntimeException}
+ * @param wrappedParser parser to wrap
+ * @param catchEmbeddedExceptions whether or not to catch+record embedded exceptions. If set to
+ * <code>false</code>, embedded exceptions will be thrown and the rest of the file will not
+ * be parsed. The following will not be ignored: {@link CorruptedFileException}, {@link
+ * RuntimeException}
*/
public RecursiveParserWrapper(Parser wrappedParser, boolean catchEmbeddedExceptions) {
super(wrappedParser);
this.catchEmbeddedExceptions = catchEmbeddedExceptions;
}
-
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return getWrappedParser().getSupportedTypes(context);
}
-
/**
* @param stream
- * @param recursiveParserWrapperHandler -- handler must implement
- * {@link RecursiveParserWrapperHandler}
+ * @param recursiveParserWrapperHandler -- handler must implement {@link
+ * RecursiveParserWrapperHandler}
* @param metadata
* @param context
* @throws IOException
@@ -124,14 +111,18 @@
* @throws IllegalStateException if the handler is not a {@link RecursiveParserWrapperHandler}
*/
@Override
- public void parse(InputStream stream, ContentHandler recursiveParserWrapperHandler,
- Metadata metadata, ParseContext context)
+ public void parse(
+ InputStream stream,
+ ContentHandler recursiveParserWrapperHandler,
+ Metadata metadata,
+ ParseContext context)
throws IOException, SAXException, TikaException {
- //this tracks the state of the parent parser, per call to #parse
+ // this tracks the state of the parent parser, per call to #parse
ParserState parserState;
if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) {
- parserState = new ParserState(
- (AbstractRecursiveParserWrapperHandler) recursiveParserWrapperHandler);
+ parserState =
+ new ParserState(
+ (AbstractRecursiveParserWrapperHandler) recursiveParserWrapperHandler);
} else {
throw new IllegalStateException(
"ContentHandler must implement RecursiveParserWrapperHandler");
@@ -149,17 +140,18 @@
if (recursiveParserWrapperHandler instanceof AbstractRecursiveParserWrapperHandler) {
ContentHandlerFactory factory =
- ((AbstractRecursiveParserWrapperHandler)recursiveParserWrapperHandler).getContentHandlerFactory();
+ ((AbstractRecursiveParserWrapperHandler) recursiveParserWrapperHandler)
+ .getContentHandlerFactory();
if (factory instanceof WriteLimiter) {
- writeLimit = ((WriteLimiter)factory).getWriteLimit();
- throwOnWriteLimitReached = ((WriteLimiter)factory).isThrowOnWriteLimitReached();
+ writeLimit = ((WriteLimiter) factory).getWriteLimit();
+ throwOnWriteLimitReached = ((WriteLimiter) factory).isThrowOnWriteLimitReached();
}
}
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
RecursivelySecureContentHandler secureContentHandler =
- new RecursivelySecureContentHandler(localHandler, tis, writeLimit,
- throwOnWriteLimitReached, context);
+ new RecursivelySecureContentHandler(
+ localHandler, tis, writeLimit, throwOnWriteLimitReached, context);
context.set(RecursivelySecureContentHandler.class, secureContentHandler);
getWrappedParser().parse(tis, secureContentHandler, metadata, context);
} catch (Throwable e) {
@@ -193,13 +185,12 @@
} else {
objectName = "embedded-" + (++state.unknownCount);
}
- //make sure that there isn't any path info in the objectName
- //some parsers can return paths, not just file names
+ // make sure that there isn't any path info in the objectName
+ // some parsers can return paths, not just file names
objectName = FilenameUtils.getName(objectName);
return objectName;
}
-
private class EmbeddedParserDecorator extends StatefulParser {
private static final long serialVersionUID = 207648200464263337L;
@@ -208,9 +199,8 @@
private String embeddedIdPath = null;
-
- private EmbeddedParserDecorator(Parser parser, String location,
- String embeddedIdPath, ParserState parseState) {
+ private EmbeddedParserDecorator(
+ Parser parser, String location, String embeddedIdPath, ParserState parseState) {
super(parser);
this.location = location;
if (!this.location.endsWith("/")) {
@@ -221,10 +211,11 @@
}
@Override
- public void parse(InputStream stream, ContentHandler ignore, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler ignore, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
- //Test to see if we should avoid parsing
+ // Test to see if we should avoid parsing
if (parserState.recursiveParserWrapperHandler.hasHitMaximumEmbeddedResources()) {
return;
}
@@ -235,25 +226,26 @@
metadata.add(TikaCoreProperties.EMBEDDED_RESOURCE_PATH, objectLocation);
String idPath =
- this.embeddedIdPath.equals("/") ?
- this.embeddedIdPath + ++parserState.embeddedCount :
- this.embeddedIdPath + "/" + ++parserState.embeddedCount;
+ this.embeddedIdPath.equals("/")
+ ? this.embeddedIdPath + ++parserState.embeddedCount
+ : this.embeddedIdPath + "/" + ++parserState.embeddedCount;
metadata.add(TikaCoreProperties.EMBEDDED_ID_PATH, idPath);
metadata.set(TikaCoreProperties.EMBEDDED_ID, parserState.embeddedCount);
- //get a fresh handler
+ // get a fresh handler
ContentHandler localHandler =
parserState.recursiveParserWrapperHandler.getNewContentHandler();
parserState.recursiveParserWrapperHandler.startEmbeddedDocument(localHandler, metadata);
Parser preContextParser = context.get(Parser.class);
- context.set(Parser.class,
- new EmbeddedParserDecorator(getWrappedParser(), objectLocation,
- idPath, parserState));
+ context.set(
+ Parser.class,
+ new EmbeddedParserDecorator(
+ getWrappedParser(), objectLocation, idPath, parserState));
long started = System.currentTimeMillis();
RecursivelySecureContentHandler secureContentHandler =
context.get(RecursivelySecureContentHandler.class);
- //store the handler that was used before this parse
- //so that you can return it back to its state at the end of this parse
+ // store the handler that was used before this parse
+ // so that you can return it back to its state at the end of this parse
ContentHandler preContextHandler = secureContentHandler.handler;
secureContentHandler.updateContentHandler(localHandler);
@@ -276,9 +268,9 @@
if (e instanceof EncryptedDocumentException) {
metadata.set(TikaCoreProperties.IS_ENCRYPTED, true);
}
- if (context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null &&
- e instanceof ZeroByteFileException) {
- //do nothing
+ if (context.get(ZeroByteFileException.IgnoreZeroByteFileException.class) != null
+ && e instanceof ZeroByteFileException) {
+ // do nothing
} else if (catchEmbeddedExceptions) {
ParserUtils.recordParserFailure(this, e, metadata);
} else {
@@ -289,20 +281,21 @@
secureContentHandler.updateContentHandler(preContextHandler);
long elapsedMillis = System.currentTimeMillis() - started;
metadata.set(TikaCoreProperties.PARSE_TIME_MILLIS, Long.toString(elapsedMillis));
- parserState.recursiveParserWrapperHandler
- .endEmbeddedDocument(localHandler, metadata);
+ parserState.recursiveParserWrapperHandler.endEmbeddedDocument(
+ localHandler, metadata);
}
}
}
/**
- * This tracks the state of the parse of a single document.
- * In future versions, this will allow the RecursiveParserWrapper to be thread safe.
+ * This tracks the state of the parse of a single document. In future versions, this will allow
+ * the RecursiveParserWrapper to be thread safe.
*/
private static class ParserState {
private final AbstractRecursiveParserWrapperHandler recursiveParserWrapperHandler;
private int unknownCount = 0;
- private int embeddedCount = 0;//this is effectively 1-indexed
+ private int embeddedCount = 0; // this is effectively 1-indexed
+
private ParserState(AbstractRecursiveParserWrapperHandler handler) {
this.recursiveParserWrapperHandler = handler;
}
@@ -311,7 +304,7 @@
static class RecursivelySecureContentHandler extends SecureContentHandler {
private ContentHandler handler;
- //total allowable chars across all handlers
+ // total allowable chars across all handlers
private final int totalWriteLimit;
private final boolean throwOnWriteLimitReached;
@@ -320,11 +313,15 @@
private boolean writeLimitReached = false;
- //total chars written to all handlers
+ // total chars written to all handlers
private int totalChars = 0;
- public RecursivelySecureContentHandler(ContentHandler handler, TikaInputStream stream,
- int totalWriteLimit,
- boolean throwOnWriteLimitReached, ParseContext parseContext) {
+
+ public RecursivelySecureContentHandler(
+ ContentHandler handler,
+ TikaInputStream stream,
+ int totalWriteLimit,
+ boolean throwOnWriteLimitReached,
+ ParseContext parseContext) {
super(handler, stream);
this.handler = handler;
this.totalWriteLimit = totalWriteLimit;
@@ -339,11 +336,10 @@
/**
* Bypass the SecureContentHandler...
- * <p>
- * This handler only looks at zip bomb via zip expansion.
- * Users should be protected within entries against nested
- * nested xml entities. We don't want to carry
- * those stats _across_ entries.
+ *
+ * <p>This handler only looks at zip bomb via zip expansion. Users should be protected
+ * within entries against nested xml entities. We don't want to carry those stats
+ * _across_ entries.
*
* @param uri
* @param localName
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java b/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java
index 412673b..c6bbabf 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RegexCaptureParser.java
@@ -28,10 +28,6 @@
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -40,6 +36,8 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
public class RegexCaptureParser implements Parser, Initializable {
@@ -50,15 +48,11 @@
private Map<String, Pattern> matchMap = new HashMap<>();
@Override
- public void initialize(Map<String, Param> params) throws TikaConfigException {
-
- }
+ public void initialize(Map<String, Param> params) throws TikaConfigException {}
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
- throws TikaConfigException {
-
- }
+ throws TikaConfigException {}
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
@@ -68,10 +62,11 @@
private boolean writeContent = false;
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
- try (BufferedReader reader = new BufferedReader(new InputStreamReader(stream,
- StandardCharsets.UTF_8))) {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ try (BufferedReader reader =
+ new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8))) {
String line = reader.readLine();
Map<String, Matcher> localCaptureMap = new HashMap();
for (Map.Entry<String, Pattern> e : captureMap.entrySet()) {
diff --git a/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java b/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java
index 0daae6b..7babf88 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/RenderingParser.java
@@ -13,12 +13,11 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */package org.apache.tika.parser;
+ */ package org.apache.tika.parser;
import org.apache.tika.renderer.Renderer;
public interface RenderingParser {
void setRenderer(Renderer renderer);
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java b/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java
index 0fb657b..2428950 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/StatefulParser.java
@@ -17,15 +17,12 @@
package org.apache.tika.parser;
/**
- * The RecursiveParserWrapper wraps the parser sent
- * into the parsecontext and then uses that parser
+ * The RecursiveParserWrapper wraps the parser sent into the parsecontext and then uses that parser
* to store state (among many other things).
- * <p>
- * There are some use cases where regular parsers
- * want to parse content inline (e.g. OCR), and their
- * output should not be treated as coming from an embedded
- * object.
- **/
+ *
+ * <p>There are some use cases where regular parsers want to parse content inline (e.g. OCR), and
+ * their output should not be treated as coming from an embedded object.
+ */
public class StatefulParser extends ParserDecorator {
/**
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java
index ee4dfe2..b3b6ca5 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digest/CompositeDigester.java
@@ -19,7 +19,6 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -27,7 +26,6 @@
import org.apache.tika.parser.DigestingParser;
import org.apache.tika.parser.ParseContext;
-
public class CompositeDigester implements DigestingParser.Digester {
private final DigestingParser.Digester[] digesters;
diff --git a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
index c3e4fde..e170795 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/digest/InputStreamDigester.java
@@ -24,7 +24,6 @@
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.security.Provider;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.BoundedInputStream;
import org.apache.tika.io.TemporaryResources;
@@ -47,19 +46,19 @@
}
/**
- * @param markLimit limit in bytes to allow for mark/reset. If the inputstream is longer
- * than this limit, the stream will be reset and then spooled to a
- * temporary file.
- * Throws IllegalArgumentException if < 0.
- * @param algorithm name of the digest algorithm to retrieve from the Provider
- * @param algorithmKeyName name of the algorithm to store
- * as part of the key in the metadata
- * when {@link #digest(InputStream, Metadata, ParseContext)} is called
- * @param encoder encoder to convert the byte array returned from the digester to a
- * string
+ * @param markLimit limit in bytes to allow for mark/reset. If the inputstream is longer than
+ * this limit, the stream will be reset and then spooled to a temporary file. Throws
+ * IllegalArgumentException if &lt; 0.
+ * @param algorithm name of the digest algorithm to retrieve from the Provider
+ * @param algorithmKeyName name of the algorithm to store as part of the key in the metadata
+ * when {@link #digest(InputStream, Metadata, ParseContext)} is called
+ * @param encoder encoder to convert the byte array returned from the digester to a string
*/
- public InputStreamDigester(int markLimit, String algorithm, String algorithmKeyName,
- DigestingParser.Encoder encoder) {
+ public InputStreamDigester(
+ int markLimit,
+ String algorithm,
+ String algorithmKeyName,
+ DigestingParser.Encoder encoder) {
this.algorithm = algorithm;
this.algorithmKeyName = algorithmKeyName;
this.encoder = encoder;
@@ -70,11 +69,9 @@
}
}
- /**
- * Copied from commons-codec
- */
- private static MessageDigest updateDigest(MessageDigest digest, InputStream data, Metadata metadata)
- throws IOException {
+ /** Copied from commons-codec */
+ private static MessageDigest updateDigest(
+ MessageDigest digest, InputStream data, Metadata metadata) throws IOException {
byte[] buffer = new byte[1024];
long total = 0;
for (int read = data.read(buffer, 0, 1024); read > -1; read = data.read(buffer, 0, 1024)) {
@@ -87,7 +84,7 @@
private static void setContentLength(long length, Metadata metadata) {
if (StringUtils.isBlank(metadata.get(Metadata.CONTENT_LENGTH))) {
- //only add it if it hasn't been populated already
+ // only add it if it hasn't been populated already
metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
}
}
@@ -106,21 +103,20 @@
}
/**
- * When subclassing this, becare to ensure that your provider is
- * thread-safe (not likely) or return a new provider with each call.
+ * When subclassing this, be careful to ensure that your provider is thread-safe (not likely) or
+ * return a new provider with each call.
*
- * @return provider to use to get the MessageDigest from the algorithm name.
- * Default is to return null.
+ * @return provider to use to get the MessageDigest from the algorithm name. Default is to
+ * return null.
*/
protected Provider getProvider() {
return null;
}
/**
- * @param is InputStream to digest. Best to use a TikaInputStream because
- * of potential need to spool to disk. InputStream must
- * support mark/reset.
- * @param metadata metadata in which to store the digest information
+ * @param is InputStream to digest. Best to use a TikaInputStream because of potential need to
+ * spool to disk. InputStream must support mark/reset.
+ * @param metadata metadata in which to store the digest information
* @param parseContext ParseContext -- not actually used yet, but there for future expansion
* @throws IOException on IO problem or IllegalArgumentException if algorithm couldn't be found
*/
@@ -133,19 +129,18 @@
if (tis.hasFile()) {
sz = tis.getLength();
}
- //if the inputstream has a file,
- //and its size is greater than its mark limit,
- //just digest the underlying file.
+ // if the inputstream has a file,
+ // and its size is greater than its mark limit,
+ // just digest the underlying file.
if (sz > markLimit) {
digestFile(tis.getFile(), sz, metadata);
return;
}
}
-
- //try the usual mark/reset stuff.
- //however, if you actually hit the bound,
- //then stop and spool to file via TikaInputStream
+ // try the usual mark/reset stuff.
+ // however, if you actually hit the bound,
+ // then stop and spool to file via TikaInputStream
BoundedInputStream bis = new BoundedInputStream(markLimit, is);
boolean finishedStream = false;
bis.mark(markLimit + 1);
@@ -154,8 +149,8 @@
if (finishedStream) {
return;
}
- //if the stream wasn't finished -- if the stream was longer than the mark limit --
- //spool to File and digest that.
+ // if the stream wasn't finished -- if the stream was longer than the mark limit --
+ // spool to File and digest that.
if (tis != null) {
digestFile(tis.getFile(), -1, metadata);
} else {
@@ -174,12 +169,14 @@
}
private String getMetadataKey() {
- return TikaCoreProperties.TIKA_META_PREFIX + "digest" +
- TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + algorithmKeyName;
+ return TikaCoreProperties.TIKA_META_PREFIX
+ + "digest"
+ + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER
+ + algorithmKeyName;
}
private void digestFile(File f, long sz, Metadata m) throws IOException {
- //only add it if it hasn't been populated already
+ // only add it if it hasn't been populated already
if (StringUtils.isBlank(m.get(Metadata.CONTENT_LENGTH))) {
if (sz < 0) {
sz = f.length();
@@ -192,7 +189,7 @@
}
/**
- * @param is input stream to read from
+ * @param is input stream to read from
* @param metadata metadata for reporting the digest
* @return whether or not this finished the input stream
* @throws IOException
@@ -212,5 +209,4 @@
metadata.set(getMetadataKey(), encoder.encode(digestBytes));
return true;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
index 53cb7b7..da4cb6e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java
@@ -18,17 +18,15 @@
import java.io.IOException;
import java.util.List;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
/**
- * A Composite Parser that wraps up all the available External Parsers,
- * and provides an easy way to access them.
- * Parser that uses an external program (like catdoc or pdf2txt) to extract
- * text content and metadata from a given document.
+ * A Composite Parser that wraps up all the available External Parsers, and provides an easy way to
+ * access them. Parser that uses an external program (like catdoc or pdf2txt) to extract text
+ * content and metadata from a given document.
*/
public class CompositeExternalParser extends CompositeParser {
private static final long serialVersionUID = 6962436916649024024L;
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
index beeed1f..5282f79 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
@@ -36,13 +36,7 @@
import java.util.concurrent.TimeoutException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
import org.apache.commons.io.IOUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
@@ -51,57 +45,56 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * Parser that uses an external program (like catdoc or pdf2txt) to extract
- * text content and metadata from a given document.
+ * Parser that uses an external program (like catdoc or pdf2txt) to extract text content and
+ * metadata from a given document.
*/
public class ExternalParser implements Parser {
private static final Logger LOG = LoggerFactory.getLogger(ExternalParser.class);
/**
- * The token, which if present in the Command string, will
- * be replaced with the input filename.
+ * The token, which if present in the Command string, will be replaced with the input filename.
* Alternately, the input data can be streamed over STDIN.
*/
public static final String INPUT_FILE_TOKEN = "${INPUT}";
+
/**
- * The token, which if present in the Command string, will
- * be replaced with the output filename.
+ * The token, which if present in the Command string, will be replaced with the output filename.
* Alternately, the output data can be collected on STDOUT.
*/
public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}";
+
private static final long serialVersionUID = -1079128990650687037L;
- //make this parameterizable
+ // make this parameterizable
private final long timeoutMs = 60000;
- /**
- * Media types supported by the external program.
- */
+
+ /** Media types supported by the external program. */
private Set<MediaType> supportedTypes = Collections.emptySet();
- /**
- * Regular Expressions to run over STDOUT to
- * extract Metadata.
- */
+ /** Regular Expressions to run over STDOUT to extract Metadata. */
private Map<Pattern, String> metadataPatterns = null;
+
/**
* The external command to invoke.
*
* @see Runtime#exec(String[])
*/
- private String[] command = new String[]{"cat"};
- /**
- * A consumer for ignored Lines
- */
+ private String[] command = new String[] {"cat"};
+
+ /** A consumer for ignored Lines */
private LineConsumer ignoredLineConsumer = LineConsumer.NULL;
/**
- * Starts a thread that reads and discards the contents of the
- * standard stream of the given process. Potential exceptions
- * are ignored, and the stream is closed once fully processed.
- * Note: calling this starts a new thread and blocks the current(caller)
- * thread until the new thread dies
+ * Starts a thread that reads and discards the contents of the standard stream of the given
+ * process. Potential exceptions are ignored, and the stream is closed once fully processed.
+ * Note: calling this starts a new thread and blocks the current(caller) thread until the new
+ * thread dies
*
* @param stream stream to be ignored
*/
@@ -110,25 +103,26 @@
}
/**
- * Starts a thread that reads and discards the contents of the
- * standard stream of the given process. Potential exceptions
- * are ignored, and the stream is closed once fully processed.
+ * Starts a thread that reads and discards the contents of the standard stream of the given
+ * process. Potential exceptions are ignored, and the stream is closed once fully processed.
*
- * @param stream stream to sent to black hole (a k a null)
- * @param waitForDeath when {@code true} the caller thread will be
- * blocked till the death of new thread.
+ * @param stream stream to send to a black hole (a.k.a. null)
+ * @param waitForDeath when {@code true} the caller thread will be blocked till the death of new
+ * thread.
* @return The thread that is created and started
*/
private static Thread ignoreStream(final InputStream stream, boolean waitForDeath) {
- Thread t = new Thread(() -> {
- try {
- IOUtils.copy(stream, NULL_OUTPUT_STREAM);
- } catch (IOException e) {
- //swallow
- } finally {
- IOUtils.closeQuietly(stream);
- }
- });
+ Thread t =
+ new Thread(
+ () -> {
+ try {
+ IOUtils.copy(stream, NULL_OUTPUT_STREAM);
+ } catch (IOException e) {
+ // swallow
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+ });
t.start();
if (waitForDeath) {
try {
@@ -140,20 +134,19 @@
}
/**
- * Checks to see if the command can be run. Typically used with
- * something like "myapp --version" to check to see if "myapp"
- * is installed and on the path.
+ * Checks to see if the command can be run. Typically used with something like "myapp --version"
+ * to check to see if "myapp" is installed and on the path.
*
- * @param checkCmd The check command to run
+ * @param checkCmd The check command to run
* @param errorValue What is considered an error value?
*/
public static boolean check(String checkCmd, int... errorValue) {
- return check(new String[]{checkCmd}, errorValue);
+ return check(new String[] {checkCmd}, errorValue);
}
public static boolean check(String[] checkCmd, int... errorValue) {
if (errorValue.length == 0) {
- errorValue = new int[]{127};
+ errorValue = new int[] {127};
}
Process process = null;
@@ -163,7 +156,7 @@
Thread stdOutSuckerThread = ignoreStream(process.getInputStream(), false);
stdErrSuckerThread.join();
stdOutSuckerThread.join();
- //make the timeout parameterizable
+ // make the timeout parameterizable
boolean finished = process.waitFor(60000, TimeUnit.MILLISECONDS);
if (!finished) {
throw new TimeoutException();
@@ -184,14 +177,15 @@
// External process execution is banned by the security manager
throw se;
} catch (Error err) {
- if (err.getMessage() != null && (err.getMessage().contains("posix_spawn") ||
- err.getMessage().contains("UNIXProcess"))) {
+ if (err.getMessage() != null
+ && (err.getMessage().contains("posix_spawn")
+ || err.getMessage().contains("UNIXProcess"))) {
LOG.debug("(TIKA-1526): exception trying to run: " + checkCmd[0], err);
- //"Error forking command due to JVM locale bug
- //(see TIKA-1526 and SOLR-6387)"
+ // "Error forking command due to JVM locale bug
+ // (see TIKA-1526 and SOLR-6387)"
return false;
}
- //throw if a different kind of error
+ // throw if a different kind of error
throw err;
} finally {
if (process != null) {
@@ -217,9 +211,8 @@
}
/**
- * Sets the command to be run. This can include either of
- * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN}
- * if the command needs filenames.
+ * Sets the command to be run. This can include either of {@link #INPUT_FILE_TOKEN} or {@link
+ * #OUTPUT_FILE_TOKEN} if the command needs filenames.
*
* @see Runtime#exec(String[])
*/
@@ -250,23 +243,21 @@
}
/**
- * Sets the map of regular expression patterns and Metadata
- * keys. Any matching patterns will have the matching
- * metadata entries set.
- * Set this to null to disable Metadata extraction.
+ * Sets the map of regular expression patterns and Metadata keys. Any matching patterns will
+ * have the matching metadata entries set. Set this to null to disable Metadata extraction.
*/
public void setMetadataExtractionPatterns(Map<Pattern, String> patterns) {
this.metadataPatterns = patterns;
}
/**
- * Executes the configured external command and passes the given document
- * stream as a simple XHTML document to the given SAX content handler.
- * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)}
- * has been called to set patterns.
+ * Executes the configured external command and passes the given document stream as a simple
+ * XHTML document to the given SAX content handler. Metadata is only extracted if {@link
+ * #setMetadataExtractionPatterns(Map)} has been called to set patterns.
*/
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
TemporaryResources tmp = new TemporaryResources();
@@ -277,8 +268,12 @@
}
}
- private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata,
- TemporaryResources tmp) throws IOException, SAXException, TikaException {
+ private void parse(
+ TikaInputStream stream,
+ XHTMLContentHandler xhtml,
+ Metadata metadata,
+ TemporaryResources tmp)
+ throws IOException, SAXException, TikaException {
boolean inputToStdIn = true;
boolean outputFromStdOut = true;
boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
@@ -360,14 +355,14 @@
}
/**
- * Starts a thread that extracts the contents of the standard output
- * stream of the given process to the given XHTML content handler.
- * The standard output stream is closed once fully processed.
+ * Starts a thread that extracts the contents of the standard output stream of the given process
+ * to the given XHTML content handler. The standard output stream is closed once fully
+ * processed.
*
* @param stream
- * @param xhtml XHTML content handler
+ * @param xhtml XHTML content handler
* @throws SAXException if the XHTML SAX events could not be handled
- * @throws IOException if an input error occurred
+ * @throws IOException if an input error occurred
*/
private void extractOutput(InputStream stream, XHTMLContentHandler xhtml)
throws SAXException, IOException {
@@ -384,24 +379,25 @@
}
/**
- * Starts a thread that sends the contents of the given input stream
- * to the standard input stream of the given process. Potential
- * exceptions are ignored, and the standard input stream is closed
- * once fully processed. Note that the given input stream is <em>not</em>
- * closed by this method.
+ * Starts a thread that sends the contents of the given input stream to the standard input
+ * stream of the given process. Potential exceptions are ignored, and the standard input stream
+ * is closed once fully processed. Note that the given input stream is <em>not</em> closed by
+ * this method.
*
* @param process process
- * @param stream input stream
+ * @param stream input stream
*/
private void sendInput(final Process process, final InputStream stream) {
- Thread t = new Thread(() -> {
- OutputStream stdin = process.getOutputStream();
- try {
- IOUtils.copy(stream, stdin);
- } catch (IOException e) {
- //swallow
- }
- });
+ Thread t =
+ new Thread(
+ () -> {
+ OutputStream stdin = process.getOutputStream();
+ try {
+ IOUtils.copy(stream, stdin);
+ } catch (IOException e) {
+ // swallow
+ }
+ });
t.start();
try {
t.join();
@@ -410,36 +406,39 @@
}
private void extractMetadata(final InputStream stream, final Metadata metadata) {
- Thread t = new Thread(() -> {
- BufferedReader reader;
- reader = new BufferedReader(new InputStreamReader(stream, UTF_8));
- try {
- String line;
- while ((line = reader.readLine()) != null) {
- boolean consumed = false;
- for (Map.Entry<Pattern, String> entry : metadataPatterns.entrySet()) {
- Matcher m = entry.getKey().matcher(line);
- if (m.find()) {
- consumed = true;
- if (entry.getValue() != null &&
- !entry.getValue().equals("")) {
- metadata.add(entry.getValue(), m.group(1));
- } else {
- metadata.add(m.group(1), m.group(2));
+ Thread t =
+ new Thread(
+ () -> {
+ BufferedReader reader;
+ reader = new BufferedReader(new InputStreamReader(stream, UTF_8));
+ try {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ boolean consumed = false;
+ for (Map.Entry<Pattern, String> entry :
+ metadataPatterns.entrySet()) {
+ Matcher m = entry.getKey().matcher(line);
+ if (m.find()) {
+ consumed = true;
+ if (entry.getValue() != null
+ && !entry.getValue().equals("")) {
+ metadata.add(entry.getValue(), m.group(1));
+ } else {
+ metadata.add(m.group(1), m.group(2));
+ }
+ }
+ }
+ if (!consumed) {
+ ignoredLineConsumer.consume(line);
+ }
+ }
+ } catch (IOException e) {
+ // Ignore
+ } finally {
+ IOUtils.closeQuietly(reader);
+ IOUtils.closeQuietly(stream);
}
- }
- }
- if (!consumed) {
- ignoredLineConsumer.consume(line);
- }
- }
- } catch (IOException e) {
- // Ignore
- } finally {
- IOUtils.closeQuietly(reader);
- IOUtils.closeQuietly(stream);
- }
- });
+ });
t.start();
try {
t.join();
@@ -453,12 +452,11 @@
* @since Apache Tika 1.14
*/
public interface LineConsumer extends Serializable {
- /**
- * A null consumer
- */
- LineConsumer NULL = line -> {
- // ignores
- };
+ /** A null consumer */
+ LineConsumer NULL =
+ line -> {
+ // ignores
+ };
/**
* Consume a line
@@ -467,6 +465,4 @@
*/
void consume(String line);
}
-
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
index 3c79fd3..0d14d2f 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java
@@ -27,7 +27,10 @@
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
-
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.mime.MimeTypeException;
+import org.apache.tika.utils.XMLReaderUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
@@ -35,16 +38,9 @@
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.mime.MimeTypeException;
-import org.apache.tika.utils.XMLReaderUtils;
-
/**
- * Builds up ExternalParser instances based on XML file(s)
- * which define what to run, for what, and how to process
- * any output metadata.
- * Typically used to configure up a series of external programs
+ * Builds up ExternalParser instances based on XML file(s) which define what to run, for what, and
+ * how to process any output metadata. Typically used to configure up a series of external programs
* (like catdoc or pdf2txt) to extract text content from documents.
*
* <pre>
@@ -86,16 +82,17 @@
}
} else {
throw new MimeTypeException(
- "Not a <" + EXTERNAL_PARSERS_TAG + "/> configuration document: " +
- (element != null ? element.getTagName() : "n/a"));
+ "Not a <"
+ + EXTERNAL_PARSERS_TAG
+ + "/> configuration document: "
+ + (element != null ? element.getTagName() : "n/a"));
}
return parsers;
}
/**
- * Builds and Returns an ExternalParser, or null if a check
- * command was given that didn't match.
+ * Builds and Returns an ExternalParser, or null if a check command was given that didn't match.
*/
private static ExternalParser readParser(Element parserDef) throws TikaException {
ExternalParser parser = new ExternalParser();
@@ -122,7 +119,8 @@
parser.setMetadataExtractionPatterns(readMetadataPatterns(child));
break;
default:
- throw new IllegalArgumentException("reaction not defined for " + child.getTagName());
+ throw new IllegalArgumentException(
+ "reaction not defined for " + child.getTagName());
}
}
}
@@ -186,7 +184,7 @@
String s = st.nextToken();
errorVals.add(Integer.parseInt(s));
} catch (NumberFormatException e) {
- //swallow
+ // swallow
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
index 86369c6..dadb5ba 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.parser.external;
-/**
- * Met Keys used by the {@link ExternalParsersConfigReader}.
- */
+/** Met Keys used by the {@link ExternalParsersConfigReader}. */
public interface ExternalParsersConfigReaderMetKeys {
String EXTERNAL_PARSERS_TAG = "external-parsers";
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
index 561cbe7..f4137ac 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java
@@ -24,7 +24,6 @@
import java.util.Enumeration;
import java.util.List;
import java.util.Map;
-
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
@@ -33,8 +32,7 @@
import org.apache.tika.parser.Parser;
/**
- * Creates instances of ExternalParser based on XML
- * configuration files.
+ * Creates instances of ExternalParser based on XML configuration files.
*
* @see ExternalParsersConfigReader
*/
@@ -52,8 +50,9 @@
public static List<ExternalParser> create(String filename, ServiceLoader loader)
throws IOException, TikaException {
String filepath =
- ExternalParsersFactory.class.getPackage().getName().replace('.', '/') + "/" +
- filename;
+ ExternalParsersFactory.class.getPackage().getName().replace('.', '/')
+ + "/"
+ + filename;
Enumeration<URL> files = loader.findServiceResources(filepath);
ArrayList<URL> list = Collections.list(files);
URL[] urls = list.toArray(new URL[0]);
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java b/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
index 4ee27b9..8c9e2ae 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * External parser process.
- */
+/** External parser process. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.parser.external;
diff --git a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
index 5dbea57..273f6f1 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java
@@ -29,12 +29,6 @@
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -54,14 +48,17 @@
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.utils.FileProcessResult;
import org.apache.tika.utils.ProcessUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * This is a next generation external parser that uses some of the more
- * recent additions to Tika. This is an experimental alternative to the
- * {@link org.apache.tika.parser.external.ExternalParser}.
- * Specifically, it relies more on configuration than the SPI model.
- * Further, users can specify a parser to handle the output
- * of the external process.
+ * This is a next generation external parser that uses some of the more recent additions to Tika.
+ * This is an experimental alternative to the {@link
+ * org.apache.tika.parser.external.ExternalParser}. Specifically, it relies more on configuration
+ * than the SPI model. Further, users can specify a parser to handle the output of the external
+ * process.
*/
public class ExternalParser implements Parser, Initializable {
@@ -98,9 +95,10 @@
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
- //this may remain null, depending on whether the external parser writes to a file
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ // this may remain null, depending on whether the external parser writes to a file
Path outFile = null;
try (TemporaryResources tmp = new TemporaryResources()) {
TikaInputStream tis = TikaInputStream.get(stream, tmp, metadata);
@@ -111,13 +109,18 @@
boolean outputFileInCommandline = false;
for (String c : commandLine) {
if (inputMatcher.reset(c).find()) {
- String updated = c.replace(INPUT_FILE_TOKEN,
- ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()));
+ String updated =
+ c.replace(
+ INPUT_FILE_TOKEN,
+ ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()));
thisCommandLine.add(updated);
} else if (outputMatcher.reset(c).find()) {
outFile = Files.createTempFile("tika-external2-", "");
- String updated = c.replace(OUTPUT_FILE_TOKEN,
- ProcessUtils.escapeCommandLine(outFile.toAbsolutePath().toString()));
+ String updated =
+ c.replace(
+ OUTPUT_FILE_TOKEN,
+ ProcessUtils.escapeCommandLine(
+ outFile.toAbsolutePath().toString()));
thisCommandLine.add(updated);
outputFileInCommandline = true;
} else {
@@ -127,21 +130,27 @@
FileProcessResult result = null;
long localTimeoutMillis = TikaTaskTimeout.getTimeoutMillis(context, timeoutMs);
if (outputFileInCommandline) {
- result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine),
- localTimeoutMillis, maxStdOut, maxStdErr);
+ result =
+ ProcessUtils.execute(
+ new ProcessBuilder(thisCommandLine),
+ localTimeoutMillis,
+ maxStdOut,
+ maxStdErr);
} else {
outFile = Files.createTempFile("tika-external2-", "");
- result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine),
- localTimeoutMillis, outFile, maxStdErr);
+ result =
+ ProcessUtils.execute(
+ new ProcessBuilder(thisCommandLine),
+ localTimeoutMillis,
+ outFile,
+ maxStdErr);
}
metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout());
metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue());
metadata.set(ExternalProcess.STD_OUT_LENGTH, result.getStdoutLength());
- metadata.set(ExternalProcess.STD_OUT_IS_TRUNCATED,
- result.isStdoutTruncated());
+ metadata.set(ExternalProcess.STD_OUT_IS_TRUNCATED, result.isStdoutTruncated());
metadata.set(ExternalProcess.STD_ERR_LENGTH, result.getStderrLength());
- metadata.set(ExternalProcess.STD_ERR_IS_TRUNCATED,
- result.isStderrTruncated());
+ metadata.set(ExternalProcess.STD_ERR_IS_TRUNCATED, result.isStderrTruncated());
if (returnStdout) {
metadata.set(ExternalProcess.STD_OUT, result.getStdout());
@@ -160,23 +169,26 @@
}
}
- private void handleOutput(FileProcessResult result, Path outFile,
- XHTMLContentHandler xhtml, Metadata metadata,
- ParseContext parseContext) throws SAXException, TikaException,
- IOException {
+ private void handleOutput(
+ FileProcessResult result,
+ Path outFile,
+ XHTMLContentHandler xhtml,
+ Metadata metadata,
+ ParseContext parseContext)
+ throws SAXException, TikaException, IOException {
if (outputParser == EmptyParser.INSTANCE) {
if (outFile != null) {
try (BufferedReader reader = Files.newBufferedReader(outFile)) {
String line = reader.readLine();
while (line != null) {
- //do we want to wrap this in <p></p> elements?
+ // do we want to wrap this in <p></p> elements?
xhtml.characters(line);
xhtml.newline();
line = reader.readLine();
}
}
} else {
- //read this in line by line and wrap <p></p> elements?
+ // read this in line by line and wrap <p></p> elements?
xhtml.characters(result.getStdout());
}
} else {
@@ -185,18 +197,17 @@
outputParser.parse(is, new BodyContentHandler(xhtml), metadata, parseContext);
}
} else {
- try (InputStream is = TikaInputStream.get(
- result.getStdout().getBytes(StandardCharsets.UTF_8))) {
+ try (InputStream is =
+ TikaInputStream.get(result.getStdout().getBytes(StandardCharsets.UTF_8))) {
outputParser.parse(is, new BodyContentHandler(xhtml), metadata, parseContext);
}
}
}
-
}
/**
- * This is set during initialization from a tika-config.
- * Any calls after initialization will result in a {@link IllegalStateException}.
+ * This is set during initialization from a tika-config. Any calls after initialization will
+ * result in a {@link IllegalStateException}.
*
* @param supportedTypes
*/
@@ -226,9 +237,8 @@
}
/**
- * Use this to specify the full commandLine. The commandline must
- * include at least {@link ExternalParser#INPUT_FILE_TOKEN}.
- * If the external process writes to an output file, specify
+ * Use this to specify the full commandLine. The commandline must include at least {@link
+ * ExternalParser#INPUT_FILE_TOKEN}. If the external process writes to an output file, specify
* {@link ExternalParser#OUTPUT_FILE_TOKEN}.
*
* @param commandLine
@@ -238,12 +248,10 @@
this.commandLine = commandLine;
}
-
/**
- * If set to true, this will return the stdout in the metadata
- * via {@link org.apache.tika.metadata.ExternalProcess#STD_OUT}.
- * Default is <code>false</code> because this should normally
- * be handled by the outputParser
+ * If set to true, this will return the stdout in the metadata via {@link
+ * org.apache.tika.metadata.ExternalProcess#STD_OUT}. Default is <code>false</code> because this
+ * should normally be handled by the outputParser
*
* @param returnStdout
*/
@@ -253,9 +261,9 @@
}
/**
- * If set to true, this will return the stderr in the metadata
- * via {@link org.apache.tika.metadata.ExternalProcess#STD_ERR}.
- * Default is <code>true</code>
+ * If set to true, this will return the stderr in the metadata via {@link
+ * org.apache.tika.metadata.ExternalProcess#STD_ERR}. Default is <code>true</code>
+ *
* @param returnStderr
*/
@Field
@@ -264,10 +272,10 @@
}
/**
- * This parser is called on the output of the process.
- * If the process writes to an output file, specified by
- * {@link ExternalParser#OUTPUT_FILE_TOKEN}, this parser will parse that file,
+ * This parser is called on the output of the process. If the process writes to an output file,
+ * specified by {@link ExternalParser#OUTPUT_FILE_TOKEN}, this parser will parse that file,
* otherwise it will parse the UTF-8 encoded bytes from the process' STD_OUT.
+ *
* @param parser
*/
@Field
@@ -281,7 +289,7 @@
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
- //no-op
+ // no-op
}
@Override
@@ -295,9 +303,9 @@
}
if (outputParser == EmptyParser.INSTANCE) {
- LOG.debug("no parser selected for the output; contents will be " +
- "written to the content handler");
+ LOG.debug(
+ "no parser selected for the output; contents will be "
+ + "written to the content handler");
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
index 9f2ea8a..c2a9125 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/AbstractMultipleParser.java
@@ -30,10 +30,6 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.Param;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
@@ -46,56 +42,55 @@
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.utils.ParserUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * Abstract base class for parser wrappers which may / will
- * process a given stream multiple times, merging the results
- * of the various parsers used.
- * End users should normally use {@link FallbackParser} or
- * {@link SupplementingParser} along with a Strategy.
- * Note that unless you give a {@link ContentHandlerFactory},
- * you'll get content from every parser tried mushed together!
+ * Abstract base class for parser wrappers which may / will process a given stream multiple times,
+ * merging the results of the various parsers used. End users should normally use {@link
+ * FallbackParser} or {@link SupplementingParser} along with a Strategy. Note that unless you give a
+ * {@link ContentHandlerFactory}, you'll get content from every parser tried mushed together!
*
* @since Apache Tika 1.18
*/
public abstract class AbstractMultipleParser implements Parser {
protected static final String METADATA_POLICY_CONFIG_KEY = "metadataPolicy";
- /**
- * Serial version UID.
- */
+
+ /** Serial version UID. */
private static final long serialVersionUID = 5383668090329836559L;
- /**
- * How we should handle metadata clashes
- */
+
+ /** How we should handle metadata clashes */
private final MetadataPolicy policy;
- /**
- * List of the multiple parsers to try.
- */
+
+ /** List of the multiple parsers to try. */
private final Collection<? extends Parser> parsers;
+
/**
- * Computed list of Mime Types to offer, which is all
- * those in common between the parsers.
- * For explicit mimetypes only, use a {@link ParserDecorator}
+ * Computed list of Mime Types to offer, which is all those in common between the parsers. For
+ * explicit mimetypes only, use a {@link ParserDecorator}
*/
private final Set<MediaType> offeredTypes;
- /**
- * Media type registry.
- */
+
+ /** Media type registry. */
private MediaTypeRegistry registry;
@SuppressWarnings("rawtypes")
- public AbstractMultipleParser(MediaTypeRegistry registry, Collection<? extends Parser> parsers,
- Map<String, Param> params) {
+ public AbstractMultipleParser(
+ MediaTypeRegistry registry,
+ Collection<? extends Parser> parsers,
+ Map<String, Param> params) {
this(registry, getMetadataPolicy(params), parsers);
}
- public AbstractMultipleParser(MediaTypeRegistry registry, MetadataPolicy policy,
- Parser... parsers) {
+ public AbstractMultipleParser(
+ MediaTypeRegistry registry, MetadataPolicy policy, Parser... parsers) {
this(registry, policy, Arrays.asList(parsers));
}
- public AbstractMultipleParser(MediaTypeRegistry registry, MetadataPolicy policy,
- Collection<? extends Parser> parsers) {
+ public AbstractMultipleParser(
+ MediaTypeRegistry registry,
+ MetadataPolicy policy,
+ Collection<? extends Parser> parsers) {
this.policy = policy;
this.parsers = parsers;
this.registry = registry;
@@ -117,8 +112,8 @@
"Required parameter '" + METADATA_POLICY_CONFIG_KEY + "' not supplied");
}
- protected static Metadata mergeMetadata(Metadata newMetadata, Metadata lastMetadata,
- MetadataPolicy policy) {
+ protected static Metadata mergeMetadata(
+ Metadata newMetadata, Metadata lastMetadata, MetadataPolicy policy) {
if (policy == MetadataPolicy.DISCARD_ALL) {
return newMetadata;
}
@@ -211,56 +206,60 @@
return Collections.unmodifiableList(new ArrayList<>(parsers));
}
- /**
- * Used to allow implementations to prepare or change things
- * before parsing occurs
- */
- protected void parserPrepare(Parser parser, Metadata metadata, ParseContext context) {
- }
+ /** Used to allow implementations to prepare or change things before parsing occurs */
+ protected void parserPrepare(Parser parser, Metadata metadata, ParseContext context) {}
/**
- * Used to notify implementations that a Parser has Finished
- * or Failed, and to allow them to decide to continue or
- * abort further parsing
+ * Used to notify implementations that a Parser has Finished or Failed, and to allow them to
+ * decide to continue or abort further parsing
*/
- protected abstract boolean parserCompleted(Parser parser, Metadata metadata,
- ContentHandler handler, ParseContext context,
- Exception exception);
+ protected abstract boolean parserCompleted(
+ Parser parser,
+ Metadata metadata,
+ ContentHandler handler,
+ ParseContext context,
+ Exception exception);
/**
- * Processes the given Stream through one or more parsers,
- * resetting things between parsers as requested by policy.
- * The actual processing is delegated to one or more {@link Parser}s.
- * <p>
- * Note that you'll get text from every parser this way, to have
- * control of which content is from which parser you need to
- * call the method with a {@link ContentHandlerFactory} instead.
+ * Processes the given Stream through one or more parsers, resetting things between parsers as
+ * requested by policy. The actual processing is delegated to one or more {@link Parser}s.
+ *
+ * <p>Note that you'll get text from every parser this way, to have control of which content is
+ * from which parser you need to call the method with a {@link ContentHandlerFactory} instead.
*/
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
parse(stream, handler, null, metadata, context);
}
/**
- * Processes the given Stream through one or more parsers,
- * resetting things between parsers as requested by policy.
- * The actual processing is delegated to one or more {@link Parser}s.
- * You will get one ContentHandler fetched for each Parser used.
- * TODO Do we need to return all the ContentHandler instances we created?
+ * Processes the given Stream through one or more parsers, resetting things between parsers as
+ * requested by policy. The actual processing is delegated to one or more {@link Parser}s. You
+ * will get one ContentHandler fetched for each Parser used. TODO Do we need to return all the
+ * ContentHandler instances we created?
*
- * @deprecated The {@link ContentHandlerFactory} override is still experimental
- * and the method signature is subject to change before Tika 2.0
+ * @deprecated The {@link ContentHandlerFactory} override is still experimental and the method
+ * signature is subject to change before Tika 2.0
*/
@Deprecated
- public void parse(InputStream stream, ContentHandlerFactory handlers, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream,
+ ContentHandlerFactory handlers,
+ Metadata metadata,
+ ParseContext context)
+ throws IOException, SAXException, TikaException {
parse(stream, null, handlers, metadata, context);
}
- private void parse(InputStream stream, ContentHandler handler,
- ContentHandlerFactory handlerFactory, Metadata originalMetadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ private void parse(
+ InputStream stream,
+ ContentHandler handler,
+ ContentHandlerFactory handlerFactory,
+ Metadata originalMetadata,
+ ParseContext context)
+ throws IOException, SAXException, TikaException {
// Track the metadata between parsers, so we can apply our policy
Metadata lastMetadata = cloneMetadata(originalMetadata);
Metadata metadata = lastMetadata;
@@ -270,7 +269,8 @@
try {
// Ensure we'll be able to re-read safely, buffering to disk if so,
// to permit Parsers 2+ to be able to read the same data
- InputStream taggedStream = ParserUtils.ensureStreamReReadable(stream, tmp, originalMetadata);
+ InputStream taggedStream =
+ ParserUtils.ensureStreamReReadable(stream, tmp, originalMetadata);
for (Parser p : parsers) {
// Get a new handler for this parser, if we can
@@ -342,31 +342,20 @@
}
/**
- * The various strategies for handling metadata emitted by
- * multiple parsers.
- * Note that not all will be supported by all subclasses.
+ * The various strategies for handling metadata emitted by multiple parsers. Note that not all
+ * will be supported by all subclasses.
*/
public enum MetadataPolicy {
- /**
- * Before moving onto another parser, throw away
- * all previously seen metadata
- */
+ /** Before moving onto another parser, throw away all previously seen metadata */
DISCARD_ALL,
- /**
- * The first parser to output a given key wins,
- * merge in non-clashing other keys
- */
+ /** The first parser to output a given key wins, merge in non-clashing other keys */
FIRST_WINS,
/**
- * The last parser to output a given key wins,
- * overriding previous parser values for a
+ * The last parser to output a given key wins, overriding previous parser values for a
* clashing key.
*/
LAST_WINS,
- /**
- * Where multiple parsers output a given key,
- * store all their different (unique) values
- */
+ /** Where multiple parsers output a given key, store all their different (unique) values */
KEEP_ALL
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
index e538e59..78cd108 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/FallbackParser.java
@@ -20,43 +20,41 @@
import java.util.Collection;
import java.util.List;
import java.util.Map;
-
-import org.xml.sax.ContentHandler;
-
import org.apache.tika.config.Param;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
/**
* Tries multiple parsers in turn, until one succeeds.
- * <p>
- * Can optionally keep Metadata from failed parsers when
- * trying the next one, depending on the {@link AbstractMultipleParser.MetadataPolicy}
- * chosen.
+ *
+ * <p>Can optionally keep Metadata from failed parsers when trying the next one, depending on the
+ * {@link AbstractMultipleParser.MetadataPolicy} chosen.
*
* @since Apache Tika 1.18
*/
public class FallbackParser extends AbstractMultipleParser {
- /**
- * The different Metadata Policies we support (all)
- */
+ /** The different Metadata Policies we support (all) */
public static final List<MetadataPolicy> allowedPolicies =
Arrays.asList(MetadataPolicy.values());
- /**
- * Serial version UID.
- */
+
+ /** Serial version UID. */
private static final long serialVersionUID = 5844409020977206167L;
@SuppressWarnings("rawtypes")
- public FallbackParser(MediaTypeRegistry registry, Collection<? extends Parser> parsers,
- Map<String, Param> params) {
+ public FallbackParser(
+ MediaTypeRegistry registry,
+ Collection<? extends Parser> parsers,
+ Map<String, Param> params) {
super(registry, parsers, params);
}
- public FallbackParser(MediaTypeRegistry registry, MetadataPolicy policy,
- Collection<? extends Parser> parsers) {
+ public FallbackParser(
+ MediaTypeRegistry registry,
+ MetadataPolicy policy,
+ Collection<? extends Parser> parsers) {
super(registry, policy, parsers);
}
@@ -65,12 +63,15 @@
}
@Override
- protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandler handler,
- ParseContext context, Exception exception) {
+ protected boolean parserCompleted(
+ Parser parser,
+ Metadata metadata,
+ ContentHandler handler,
+ ParseContext context,
+ Exception exception) {
// If there was no exception, abort further parsers
return exception != null;
// Have the next parser tried
}
}
-
diff --git a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
index 8cf83c0..213d8c5 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/multiple/SupplementingParser.java
@@ -20,52 +20,49 @@
import java.util.Collection;
import java.util.List;
import java.util.Map;
-
-import org.xml.sax.ContentHandler;
-
import org.apache.tika.config.Param;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
+import org.xml.sax.ContentHandler;
/**
- * Runs the input stream through all available parsers,
- * merging the metadata from them based on the
+ * Runs the input stream through all available parsers, merging the metadata from them based on the
* {@link AbstractMultipleParser.MetadataPolicy} chosen.
- * <p>
- * Warning - currently only one Parser should output
- * any Content to the {@link ContentHandler}, the rest
- * should only output {@link Metadata}. A solution to
- * multiple-content is still being worked on...
+ *
+ * <p>Warning - currently only one Parser should output any Content to the {@link ContentHandler},
+ * the rest should only output {@link Metadata}. A solution to multiple-content is still being
+ * worked on...
*
* @since Apache Tika 1.18
*/
public class SupplementingParser extends AbstractMultipleParser {
- /**
- * The different Metadata Policies we support (not discard)
- */
+ /** The different Metadata Policies we support (not discard) */
public static final List<MetadataPolicy> allowedPolicies =
- Arrays.asList(MetadataPolicy.FIRST_WINS, MetadataPolicy.LAST_WINS,
- MetadataPolicy.KEEP_ALL);
- /**
- * Serial version UID.
- */
+ Arrays.asList(
+ MetadataPolicy.FIRST_WINS, MetadataPolicy.LAST_WINS, MetadataPolicy.KEEP_ALL);
+
+ /** Serial version UID. */
private static final long serialVersionUID = 313179254565350994L;
@SuppressWarnings("rawtypes")
- public SupplementingParser(MediaTypeRegistry registry, Collection<? extends Parser> parsers,
- Map<String, Param> params) {
+ public SupplementingParser(
+ MediaTypeRegistry registry,
+ Collection<? extends Parser> parsers,
+ Map<String, Param> params) {
super(registry, parsers, params);
}
- public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy,
- Parser... parsers) {
+ public SupplementingParser(
+ MediaTypeRegistry registry, MetadataPolicy policy, Parser... parsers) {
this(registry, policy, Arrays.asList(parsers));
}
- public SupplementingParser(MediaTypeRegistry registry, MetadataPolicy policy,
- Collection<? extends Parser> parsers) {
+ public SupplementingParser(
+ MediaTypeRegistry registry,
+ MetadataPolicy policy,
+ Collection<? extends Parser> parsers) {
super(registry, policy, parsers);
// Ensure it's a supported policy
@@ -76,8 +73,12 @@
}
@Override
- protected boolean parserCompleted(Parser parser, Metadata metadata, ContentHandler handler,
- ParseContext context, Exception exception) {
+ protected boolean parserCompleted(
+ Parser parser,
+ Metadata metadata,
+ ContentHandler handler,
+ ParseContext context,
+ Exception exception) {
// If there was no exception, just carry on to the next
if (exception == null) {
return true;
@@ -87,4 +88,3 @@
return true;
}
}
-
diff --git a/tika-core/src/main/java/org/apache/tika/parser/package-info.java b/tika-core/src/main/java/org/apache/tika/parser/package-info.java
index 10df69e..f536df4 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * Tika parsers.
- */
+/** Tika parsers. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.parser;
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java b/tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java
index f8dcffb..1e01008 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/CompositePipesReporter.java
@@ -20,7 +20,6 @@
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -37,7 +36,6 @@
for (PipesReporter reporter : pipesReporters) {
reporter.report(t, result, elapsed);
}
-
}
@Override
@@ -82,7 +80,7 @@
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
- //no-op
+ // no-op
}
@Override
@@ -97,8 +95,8 @@
}
/**
- * Tries to close all resources. Throws the last encountered IOException
- * if any are thrown by the component reporters.
+ * Tries to close all resources. Throws the last encountered IOException if any are thrown by
+ * the component reporters.
*
* @throws IOException
*/
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java b/tika-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java
index fd49927..f34bd0e 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/FailedToStartClientException.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.pipes;
-/**
- * This should be catastrophic
- */
+/** This should be catastrophic */
public class FailedToStartClientException extends RuntimeException {
public FailedToStartClientException(Throwable t) {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java b/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java
index 0c0334f..3d4f79d 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/FetchEmitTuple.java
@@ -18,7 +18,6 @@
import java.io.Serializable;
import java.util.Objects;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.pipes.extractor.EmbeddedDocumentBytesConfig;
@@ -29,7 +28,8 @@
public static final ON_PARSE_EXCEPTION DEFAULT_ON_PARSE_EXCEPTION = ON_PARSE_EXCEPTION.EMIT;
public enum ON_PARSE_EXCEPTION {
- SKIP, EMIT
+ SKIP,
+ EMIT
}
private final String id;
@@ -42,28 +42,61 @@
private EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig;
public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey) {
- this(id, fetchKey, emitKey, new Metadata(), HandlerConfig.DEFAULT_HANDLER_CONFIG,
+ this(
+ id,
+ fetchKey,
+ emitKey,
+ new Metadata(),
+ HandlerConfig.DEFAULT_HANDLER_CONFIG,
DEFAULT_ON_PARSE_EXCEPTION);
}
- public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey, ON_PARSE_EXCEPTION onParseException) {
- this(id, fetchKey, emitKey, new Metadata(), HandlerConfig.DEFAULT_HANDLER_CONFIG,
+
+ public FetchEmitTuple(
+ String id, FetchKey fetchKey, EmitKey emitKey, ON_PARSE_EXCEPTION onParseException) {
+ this(
+ id,
+ fetchKey,
+ emitKey,
+ new Metadata(),
+ HandlerConfig.DEFAULT_HANDLER_CONFIG,
onParseException);
}
public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey, Metadata metadata) {
- this(id, fetchKey, emitKey, metadata, HandlerConfig.DEFAULT_HANDLER_CONFIG,
+ this(
+ id,
+ fetchKey,
+ emitKey,
+ metadata,
+ HandlerConfig.DEFAULT_HANDLER_CONFIG,
DEFAULT_ON_PARSE_EXCEPTION);
}
- public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey, Metadata metadata,
- HandlerConfig handlerConfig, ON_PARSE_EXCEPTION onParseException) {
- this(id, fetchKey, emitKey, metadata, handlerConfig, onParseException,
+ public FetchEmitTuple(
+ String id,
+ FetchKey fetchKey,
+ EmitKey emitKey,
+ Metadata metadata,
+ HandlerConfig handlerConfig,
+ ON_PARSE_EXCEPTION onParseException) {
+ this(
+ id,
+ fetchKey,
+ emitKey,
+ metadata,
+ handlerConfig,
+ onParseException,
EmbeddedDocumentBytesConfig.SKIP);
}
- public FetchEmitTuple(String id, FetchKey fetchKey, EmitKey emitKey, Metadata metadata,
- HandlerConfig handlerConfig, ON_PARSE_EXCEPTION onParseException,
- EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig) {
+ public FetchEmitTuple(
+ String id,
+ FetchKey fetchKey,
+ EmitKey emitKey,
+ Metadata metadata,
+ HandlerConfig handlerConfig,
+ ON_PARSE_EXCEPTION onParseException,
+ EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig) {
this.id = id;
this.fetchKey = fetchKey;
this.emitKey = emitKey;
@@ -76,6 +109,7 @@
public String getId() {
return id;
}
+
public FetchKey getFetchKey() {
return fetchKey;
}
@@ -148,16 +182,32 @@
result = 31 * result + (metadata != null ? metadata.hashCode() : 0);
result = 31 * result + (onParseException != null ? onParseException.hashCode() : 0);
result = 31 * result + (handlerConfig != null ? handlerConfig.hashCode() : 0);
- result = 31 * result +
- (embeddedDocumentBytesConfig != null ? embeddedDocumentBytesConfig.hashCode() : 0);
+ result =
+ 31 * result
+ + (embeddedDocumentBytesConfig != null
+ ? embeddedDocumentBytesConfig.hashCode()
+ : 0);
return result;
}
@Override
public String toString() {
- return "FetchEmitTuple{" + "id='" + id + '\'' + ", fetchKey=" + fetchKey + ", emitKey=" +
- emitKey + ", metadata=" + metadata + ", onParseException=" + onParseException +
- ", handlerConfig=" + handlerConfig + ", embeddedDocumentBytesConfig=" +
- embeddedDocumentBytesConfig + '}';
+ return "FetchEmitTuple{"
+ + "id='"
+ + id
+ + '\''
+ + ", fetchKey="
+ + fetchKey
+ + ", emitKey="
+ + emitKey
+ + ", metadata="
+ + metadata
+ + ", onParseException="
+ + onParseException
+ + ", handlerConfig="
+ + handlerConfig
+ + ", embeddedDocumentBytesConfig="
+ + embeddedDocumentBytesConfig
+ + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java
index d128dcb..af77bdc 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/HandlerConfig.java
@@ -19,31 +19,27 @@
import java.io.Serializable;
import java.util.Locale;
import java.util.Objects;
-
import org.apache.tika.sax.BasicContentHandlerFactory;
public class HandlerConfig implements Serializable {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -3861669115439125268L;
public static final HandlerConfig DEFAULT_HANDLER_CONFIG =
- new HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, PARSE_MODE.RMETA,
- -1, -1, true);
+ new HandlerConfig(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, PARSE_MODE.RMETA, -1, -1, true);
/**
- * {@link PARSE_MODE#RMETA} "recursive metadata" is the same as the -J option
- * in tika-app and the /rmeta endpoint in tika-server. Each embedded file is represented as
- * its own metadata object.
+ * {@link PARSE_MODE#RMETA} "recursive metadata" is the same as the -J option in tika-app and
+ * the /rmeta endpoint in tika-server. Each embedded file is represented as its own metadata
+ * object.
*
- * {@link PARSE_MODE#CONCATENATE} is similar
- * to the legacy tika-app behavior and the /tika endpoint (accept: application/json) in
- * tika-server. This concatenates the
- * contents of embedded files and returns a single metadata object for the file no
- * matter how many embedded objects there are; this option throws away metadata from
- * embedded objects and silently skips exceptions in embedded objects.
+ * <p>{@link PARSE_MODE#CONCATENATE} is similar to the legacy tika-app behavior and the /tika
+ * endpoint (accept: application/json) in tika-server. This concatenates the contents of
+ * embedded files and returns a single metadata object for the file no matter how many embedded
+ * objects there are; this option throws away metadata from embedded objects and silently skips
+ * exceptions in embedded objects.
*/
public enum PARSE_MODE {
RMETA,
@@ -63,8 +59,11 @@
}
sb.append(m.name().toLowerCase(Locale.US));
}
- throw new IllegalArgumentException("mode must be one of: (" + sb +
- "). I regret I do not understand: " + modeString);
+ throw new IllegalArgumentException(
+ "mode must be one of: ("
+ + sb
+ + "). I regret I do not understand: "
+ + modeString);
}
}
@@ -77,10 +76,12 @@
boolean throwOnWriteLimitReached = true;
PARSE_MODE parseMode = PARSE_MODE.RMETA;
-
- public HandlerConfig(BasicContentHandlerFactory.HANDLER_TYPE type, PARSE_MODE parseMode,
- int writeLimit,
- int maxEmbeddedResources, boolean throwOnWriteLimitReached) {
+ public HandlerConfig(
+ BasicContentHandlerFactory.HANDLER_TYPE type,
+ PARSE_MODE parseMode,
+ int writeLimit,
+ int maxEmbeddedResources,
+ boolean throwOnWriteLimitReached) {
this.type = type;
this.parseMode = parseMode;
this.writeLimit = writeLimit;
@@ -117,21 +118,32 @@
return false;
}
HandlerConfig that = (HandlerConfig) o;
- return writeLimit == that.writeLimit && maxEmbeddedResources == that.maxEmbeddedResources &&
- throwOnWriteLimitReached == that.throwOnWriteLimitReached && type == that.type &&
- parseMode == that.parseMode;
+ return writeLimit == that.writeLimit
+ && maxEmbeddedResources == that.maxEmbeddedResources
+ && throwOnWriteLimitReached == that.throwOnWriteLimitReached
+ && type == that.type
+ && parseMode == that.parseMode;
}
@Override
public int hashCode() {
- return Objects.hash(type, writeLimit, maxEmbeddedResources, throwOnWriteLimitReached,
- parseMode);
+ return Objects.hash(
+ type, writeLimit, maxEmbeddedResources, throwOnWriteLimitReached, parseMode);
}
@Override
public String toString() {
- return "HandlerConfig{" + "type=" + type + ", writeLimit=" + writeLimit +
- ", maxEmbeddedResources=" + maxEmbeddedResources + ", throwOnWriteLimitReached=" +
- throwOnWriteLimitReached + ", parseMode=" + parseMode + '}';
+ return "HandlerConfig{"
+ + "type="
+ + type
+ + ", writeLimit="
+ + writeLimit
+ + ", maxEmbeddedResources="
+ + maxEmbeddedResources
+ + ", throwOnWriteLimitReached="
+ + throwOnWriteLimitReached
+ + ", parseMode="
+ + parseMode
+ + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java b/tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java
index 5f00880..5262abb 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/LoggingPipesReporter.java
@@ -16,13 +16,10 @@
*/
package org.apache.tika.pipes;
-
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-/**
- * Simple PipesReporter that logs everything at the debug level.
- */
+/** Simple PipesReporter that logs everything at the debug level. */
public class LoggingPipesReporter extends PipesReporter {
Logger LOGGER = LoggerFactory.getLogger(LoggingPipesReporter.class);
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
index 52e72df..64ea871 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesClient.java
@@ -40,24 +40,21 @@
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicInteger;
-
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.pipes.emitter.EmitData;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.utils.ProcessUtils;
import org.apache.tika.utils.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * The PipesClient is designed to be single-threaded. It only allots
- * a single thread for {@link #process(FetchEmitTuple)} processing.
- * See {@link org.apache.tika.pipes.async.AsyncProcessor} for handling
- * multiple PipesClients.
+ * The PipesClient is designed to be single-threaded. It only allots a single thread for {@link
+ * #process(FetchEmitTuple)} processing. See {@link org.apache.tika.pipes.async.AsyncProcessor} for
+ * handling multiple PipesClients.
*/
public class PipesClient implements Closeable {
@@ -65,9 +62,9 @@
private static final int MAX_BYTES_BEFORE_READY = 20000;
private static AtomicInteger CLIENT_COUNTER = new AtomicInteger(0);
private static final long WAIT_ON_DESTROY_MS = 10000;
- //this synchronizes the creation and/or closing of the executorService
- //there are a number of assumptions throughout that PipesClient is run
- //single threaded
+ // this synchronizes the creation and/or closing of the executorService
+ // there are a number of assumptions throughout that PipesClient is run
+ // single threaded
private final Object[] executorServiceLock = new Object[0];
private final PipesConfigBase pipesConfig;
private final int pipesClientId;
@@ -110,7 +107,7 @@
try {
destroyForcibly();
} catch (InterruptedException e) {
- //swallow
+ // swallow
}
}
synchronized (executorServiceLock) {
@@ -125,10 +122,12 @@
boolean restart = false;
if (!ping()) {
restart = true;
- } else if (pipesConfig.getMaxFilesProcessedPerProcess() > 0 &&
- filesProcessed >= pipesConfig.getMaxFilesProcessedPerProcess()) {
- LOG.info("pipesClientId={}: restarting server after hitting max files: {}",
- pipesClientId, filesProcessed);
+ } else if (pipesConfig.getMaxFilesProcessedPerProcess() > 0
+ && filesProcessed >= pipesConfig.getMaxFilesProcessedPerProcess()) {
+ LOG.info(
+ "pipesClientId={}: restarting server after hitting max files: {}",
+ pipesClientId,
+ filesProcessed);
restart = true;
}
if (restart) {
@@ -138,8 +137,10 @@
restart();
successfulRestart = true;
} catch (TimeoutException e) {
- LOG.warn("pipesClientId={}: couldn't restart within {} ms (startupTimeoutMillis)",
- pipesClientId, pipesConfig.getStartupTimeoutMillis());
+ LOG.warn(
+ "pipesClientId={}: couldn't restart within {} ms (startupTimeoutMillis)",
+ pipesClientId,
+ pipesConfig.getStartupTimeoutMillis());
Thread.sleep(pipesConfig.getSleepOnStartupTimeoutMillis());
}
}
@@ -150,52 +151,58 @@
private PipesResult actuallyProcess(FetchEmitTuple t) throws InterruptedException {
long start = System.currentTimeMillis();
final PipesResult[] intermediateResult = new PipesResult[1];
- FutureTask<PipesResult> futureTask = new FutureTask<>(() -> {
+ FutureTask<PipesResult> futureTask =
+ new FutureTask<>(
+ () -> {
+ UnsynchronizedByteArrayOutputStream bos =
+ UnsynchronizedByteArrayOutputStream.builder().get();
+ try (ObjectOutputStream objectOutputStream =
+ new ObjectOutputStream(bos)) {
+ objectOutputStream.writeObject(t);
+ }
- UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get();
- try (ObjectOutputStream objectOutputStream = new ObjectOutputStream(bos)) {
- objectOutputStream.writeObject(t);
- }
+ byte[] bytes = bos.toByteArray();
+ output.write(CALL.getByte());
+ output.writeInt(bytes.length);
+ output.write(bytes);
+ output.flush();
+ if (LOG.isTraceEnabled()) {
+ LOG.trace(
+ "pipesClientId={}: timer -- write tuple: {} ms",
+ pipesClientId,
+ System.currentTimeMillis() - start);
+ }
+ long readStart = System.currentTimeMillis();
+ if (Thread.currentThread().isInterrupted()) {
+ throw new InterruptedException("thread interrupt");
+ }
+ PipesResult result = readResults(t, start);
+ while (result.getStatus()
+ .equals(PipesResult.STATUS.INTERMEDIATE_RESULT)) {
+ intermediateResult[0] = result;
+ result = readResults(t, start);
+ }
+ if (LOG.isDebugEnabled()) {
+ long elapsed = System.currentTimeMillis() - readStart;
+ LOG.debug("finished reading result in {} ms", elapsed);
+ }
- byte[] bytes = bos.toByteArray();
- output.write(CALL.getByte());
- output.writeInt(bytes.length);
- output.write(bytes);
- output.flush();
- if (LOG.isTraceEnabled()) {
- LOG.trace("pipesClientId={}: timer -- write tuple: {} ms",
- pipesClientId,
- System.currentTimeMillis() - start);
- }
- long readStart = System.currentTimeMillis();
- if (Thread.currentThread().isInterrupted()) {
- throw new InterruptedException("thread interrupt");
- }
- PipesResult result = readResults(t, start);
- while (result.getStatus().equals(PipesResult.STATUS.INTERMEDIATE_RESULT)) {
- intermediateResult[0] = result;
- result = readResults(t, start);
- }
- if (LOG.isDebugEnabled()) {
- long elapsed = System.currentTimeMillis() - readStart;
- LOG.debug("finished reading result in {} ms", elapsed);
- }
-
- if (LOG.isTraceEnabled()) {
- LOG.trace("pipesClientId={}: timer -- read result: {} ms",
- pipesClientId,
- System.currentTimeMillis() - readStart);
- }
- if (result.getStatus() == PipesResult.STATUS.OOM) {
- return buildFatalResult(result, intermediateResult);
- }
- return result;
- });
+ if (LOG.isTraceEnabled()) {
+ LOG.trace(
+ "pipesClientId={}: timer -- read result: {} ms",
+ pipesClientId,
+ System.currentTimeMillis() - readStart);
+ }
+ if (result.getStatus() == PipesResult.STATUS.OOM) {
+ return buildFatalResult(result, intermediateResult);
+ }
+ return result;
+ });
try {
if (closed) {
- throw new IllegalArgumentException("pipesClientId=" + pipesClientId +
- ": PipesClient closed");
+ throw new IllegalArgumentException(
+ "pipesClientId=" + pipesClientId + ": PipesClient closed");
}
executorService.execute(futureTask);
return futureTask.get(pipesConfig.getTimeoutMillis(), TimeUnit.MILLISECONDS);
@@ -207,23 +214,36 @@
long elapsed = System.currentTimeMillis() - start;
pauseThenDestroy();
if (!process.isAlive() && TIMEOUT_EXIT_CODE == process.exitValue()) {
- LOG.warn("pipesClientId={} server timeout: {} in {} ms", pipesClientId, t.getId(),
+ LOG.warn(
+ "pipesClientId={} server timeout: {} in {} ms",
+ pipesClientId,
+ t.getId(),
elapsed);
return buildFatalResult(PipesResult.TIMEOUT, intermediateResult);
}
process.waitFor(500, TimeUnit.MILLISECONDS);
if (process.isAlive()) {
- LOG.warn("pipesClientId={} crash: {} in {} ms with no exit code available",
- pipesClientId, t.getId(), elapsed);
+ LOG.warn(
+ "pipesClientId={} crash: {} in {} ms with no exit code available",
+ pipesClientId,
+ t.getId(),
+ elapsed);
} else {
- LOG.warn("pipesClientId={} crash: {} in {} ms with exit code {}", pipesClientId,
- t.getId(), elapsed, process.exitValue());
+ LOG.warn(
+ "pipesClientId={} crash: {} in {} ms with exit code {}",
+ pipesClientId,
+ t.getId(),
+ elapsed,
+ process.exitValue());
}
return buildFatalResult(PipesResult.UNSPECIFIED_CRASH, intermediateResult);
} catch (TimeoutException e) {
long elapsed = System.currentTimeMillis() - start;
destroyForcibly();
- LOG.warn("pipesClientId={} client timeout: {} in {} ms", pipesClientId, t.getId(),
+ LOG.warn(
+ "pipesClientId={} client timeout: {} in {} ms",
+ pipesClientId,
+ t.getId(),
elapsed);
return buildFatalResult(PipesResult.TIMEOUT, intermediateResult);
} finally {
@@ -231,8 +251,7 @@
}
}
- private PipesResult buildFatalResult(PipesResult result,
- PipesResult[] intermediateResult) {
+ private PipesResult buildFatalResult(PipesResult result, PipesResult[] intermediateResult) {
if (intermediateResult[0] == null) {
return result;
@@ -240,16 +259,18 @@
if (LOG.isTraceEnabled()) {
LOG.trace("intermediate result: {}", intermediateResult[0].getEmitData());
}
- intermediateResult[0].getEmitData().getMetadataList().get(0).set(
- TikaCoreProperties.PIPES_RESULT, result.getStatus().toString());
- return new PipesResult(result.getStatus(),
- intermediateResult[0].getEmitData(), true);
+ intermediateResult[0]
+ .getEmitData()
+ .getMetadataList()
+ .get(0)
+ .set(TikaCoreProperties.PIPES_RESULT, result.getStatus().toString());
+ return new PipesResult(result.getStatus(), intermediateResult[0].getEmitData(), true);
}
}
private void pauseThenDestroy() throws InterruptedException {
- //wait just a little bit to let process end to get exit value
- //if there's a timeout on the server side
+ // wait just a little bit to let process end to get exit value
+ // if there's a timeout on the server side
try {
process.waitFor(200, TimeUnit.MILLISECONDS);
} finally {
@@ -260,19 +281,19 @@
private void destroyForcibly() throws InterruptedException {
process.destroyForcibly();
process.waitFor(WAIT_ON_DESTROY_MS, TimeUnit.MILLISECONDS);
- //important to close streams so that threads running in this
- //process receive notice that they really ought to stop.
- //TIKA-3588 showed that we can't trust that forcibly destroying
- //the process caused the actuallyProcess thread in this process to stop.
+ // important to close streams so that threads running in this
+ // process receive notice that they really ought to stop.
+ // TIKA-3588 showed that we can't trust that forcibly destroying
+ // the process caused the actuallyProcess thread in this process to stop.
try {
input.close();
} catch (IOException closeException) {
- //swallow
+ // swallow
}
try {
output.close();
} catch (IOException closeException) {
- //swallow
+ // swallow
}
if (process.isAlive()) {
LOG.error("Process still alive after {}ms", WAIT_ON_DESTROY_MS);
@@ -289,7 +310,7 @@
} catch (IllegalArgumentException e) {
String byteString = "-1";
if (statusByte > -1) {
- byteString = String.format(Locale.US, "%02x", (byte)statusByte);
+ byteString = String.format(Locale.US, "%02x", (byte) statusByte);
}
throw new IOException("problem reading response from server: " + byteString, e);
}
@@ -299,49 +320,76 @@
LOG.warn("pipesClientId={} oom: {} in {} ms", pipesClientId, t.getId(), millis);
return PipesResult.OOM;
case TIMEOUT:
- LOG.warn("pipesClientId={} server response timeout: {} in {} ms", pipesClientId,
- t.getId(), millis);
+ LOG.warn(
+ "pipesClientId={} server response timeout: {} in {} ms",
+ pipesClientId,
+ t.getId(),
+ millis);
return PipesResult.TIMEOUT;
case EMIT_EXCEPTION:
- LOG.warn("pipesClientId={} emit exception: {} in {} ms", pipesClientId, t.getId(),
+ LOG.warn(
+ "pipesClientId={} emit exception: {} in {} ms",
+ pipesClientId,
+ t.getId(),
millis);
return readMessage(PipesResult.STATUS.EMIT_EXCEPTION);
case EMITTER_NOT_FOUND:
- LOG.warn("pipesClientId={} emitter not found: {} in {} ms", pipesClientId,
- t.getId(), millis);
+ LOG.warn(
+ "pipesClientId={} emitter not found: {} in {} ms",
+ pipesClientId,
+ t.getId(),
+ millis);
return readMessage(PipesResult.STATUS.NO_EMITTER_FOUND);
case FETCHER_NOT_FOUND:
- LOG.warn("pipesClientId={} fetcher not found: {} in {} ms", pipesClientId,
- t.getId(), millis);
+ LOG.warn(
+ "pipesClientId={} fetcher not found: {} in {} ms",
+ pipesClientId,
+ t.getId(),
+ millis);
return readMessage(PipesResult.STATUS.NO_FETCHER_FOUND);
case FETCHER_INITIALIZATION_EXCEPTION:
- LOG.warn("pipesClientId={} fetcher initialization exception: {} in {} ms",
- pipesClientId, t.getId(), millis);
+ LOG.warn(
+ "pipesClientId={} fetcher initialization exception: {} in {} ms",
+ pipesClientId,
+ t.getId(),
+ millis);
return readMessage(PipesResult.STATUS.FETCHER_INITIALIZATION_EXCEPTION);
case FETCH_EXCEPTION:
- LOG.warn("pipesClientId={} fetch exception: {} in {} ms", pipesClientId, t.getId(),
+ LOG.warn(
+ "pipesClientId={} fetch exception: {} in {} ms",
+ pipesClientId,
+ t.getId(),
millis);
return readMessage(PipesResult.STATUS.FETCH_EXCEPTION);
case INTERMEDIATE_RESULT:
- LOG.debug("pipesClientId={} intermediate success: {} in {} ms", pipesClientId,
- t.getId(), millis);
+ LOG.debug(
+ "pipesClientId={} intermediate success: {} in {} ms",
+ pipesClientId,
+ t.getId(),
+ millis);
return deserializeIntermediateResult(t.getEmitKey());
case PARSE_SUCCESS:
- //there may have been a parse exception, but the parse didn't crash
- LOG.debug("pipesClientId={} parse success: {} in {} ms", pipesClientId, t.getId(),
+ // there may have been a parse exception, but the parse didn't crash
+ LOG.debug(
+ "pipesClientId={} parse success: {} in {} ms",
+ pipesClientId,
+ t.getId(),
millis);
return deserializeEmitData();
case PARSE_EXCEPTION_NO_EMIT:
return readMessage(PipesResult.STATUS.PARSE_EXCEPTION_NO_EMIT);
case EMIT_SUCCESS:
- LOG.debug("pipesClientId={} emit success: {} in {} ms", pipesClientId, t.getId(),
+ LOG.debug(
+ "pipesClientId={} emit success: {} in {} ms",
+ pipesClientId,
+ t.getId(),
millis);
return PipesResult.EMIT_SUCCESS;
case EMIT_SUCCESS_PARSE_EXCEPTION:
return readMessage(PipesResult.STATUS.EMIT_SUCCESS_PARSE_EXCEPTION);
case EMPTY_OUTPUT:
return PipesResult.EMPTY_OUTPUT;
- //fall through
+ // fall through
case READY:
case CALL:
case PING:
@@ -350,11 +398,10 @@
default:
throw new IOException("Need to handle procesing for: " + status);
}
-
}
private PipesResult readMessage(PipesResult.STATUS status) throws IOException {
- //readInt checks for EOF
+ // readInt checks for EOF
int length = input.readInt();
byte[] bytes = new byte[length];
input.readFully(bytes);
@@ -366,8 +413,8 @@
int length = input.readInt();
byte[] bytes = new byte[length];
input.readFully(bytes);
- try (ObjectInputStream objectInputStream = new ObjectInputStream(
- new UnsynchronizedByteArrayInputStream(bytes))) {
+ try (ObjectInputStream objectInputStream =
+ new ObjectInputStream(new UnsynchronizedByteArrayInputStream(bytes))) {
EmitData emitData = (EmitData) objectInputStream.readObject();
String stack = emitData.getContainerStackTrace();
@@ -378,7 +425,7 @@
}
} catch (ClassNotFoundException e) {
LOG.error("class not found exception deserializing data", e);
- //this should be catastrophic
+ // this should be catastrophic
throw new RuntimeException(e);
}
}
@@ -388,14 +435,14 @@
int length = input.readInt();
byte[] bytes = new byte[length];
input.readFully(bytes);
- try (ObjectInputStream objectInputStream = new ObjectInputStream(
- new UnsynchronizedByteArrayInputStream(bytes))) {
+ try (ObjectInputStream objectInputStream =
+ new ObjectInputStream(new UnsynchronizedByteArrayInputStream(bytes))) {
Metadata metadata = (Metadata) objectInputStream.readObject();
EmitData emitData = new EmitData(emitKey, Collections.singletonList(metadata));
return new PipesResult(PipesResult.STATUS.INTERMEDIATE_RESULT, emitData, true);
} catch (ClassNotFoundException e) {
LOG.error("class not found exception deserializing data", e);
- //this should be catastrophic
+ // this should be catastrophic
throw new RuntimeException(e);
}
}
@@ -405,18 +452,18 @@
LOG.debug("process still alive; trying to destroy it");
destroyForcibly();
boolean processEnded = process.waitFor(30, TimeUnit.SECONDS);
- if (! processEnded) {
+ if (!processEnded) {
LOG.warn("pipesClientId={}: process has not yet ended", pipesClientId);
}
executorService.shutdownNow();
boolean shutdown = executorService.awaitTermination(30, TimeUnit.SECONDS);
- if (! shutdown) {
+ if (!shutdown) {
LOG.warn("pipesClientId={}: executorService has not yet shutdown", pipesClientId);
}
synchronized (executorServiceLock) {
if (closed) {
- throw new IllegalArgumentException("pipesClientId=" + pipesClientId +
- ": PipesClient closed");
+ throw new IllegalArgumentException(
+ "pipesClientId=" + pipesClientId + ": PipesClient closed");
}
executorService = Executors.newFixedThreadPool(1);
}
@@ -430,41 +477,57 @@
try {
process = pb.start();
} catch (Exception e) {
- //Do we ever want this to be not fatal?!
+ // Do we ever want this to be not fatal?!
LOG.error("failed to start client", e);
throw new FailedToStartClientException(e);
}
input = new DataInputStream(process.getInputStream());
output = new DataOutputStream(process.getOutputStream());
- //wait for ready signal
- final UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get();
- FutureTask<Integer> futureTask = new FutureTask<>(() -> {
- int b = input.read();
- int read = 1;
- while (read < MAX_BYTES_BEFORE_READY && b != READY.getByte()) {
+ // wait for ready signal
+ final UnsynchronizedByteArrayOutputStream bos =
+ UnsynchronizedByteArrayOutputStream.builder().get();
+ FutureTask<Integer> futureTask =
+ new FutureTask<>(
+ () -> {
+ int b = input.read();
+ int read = 1;
+ while (read < MAX_BYTES_BEFORE_READY && b != READY.getByte()) {
- if (b == -1) {
- throw new RuntimeException(getMsg("pipesClientId=" + pipesClientId + ": " +
- "Couldn't start server -- read EOF before 'ready' byte.\n" +
- " process isAlive=" + process.isAlive(), bos));
- }
- bos.write(b);
- b = input.read();
- read++;
- }
- if (read >= MAX_BYTES_BEFORE_READY) {
- throw new RuntimeException(getMsg("pipesClientId=" + pipesClientId + ": " +
- "Couldn't start server: read too many bytes before 'ready' byte.\n" +
- " Make absolutely certain that your logger is not writing to " +
- "stdout.\n", bos));
- }
- if (bos.size() > 0) {
- LOG.warn("pipesClientId={}: From forked process before start byte: {}",
- pipesClientId, bos.toString(StandardCharsets.UTF_8));
- }
- return 1;
- });
+ if (b == -1) {
+ throw new RuntimeException(
+ getMsg(
+ "pipesClientId="
+ + pipesClientId
+ + ": "
+ + "Couldn't start server -- read EOF before 'ready' byte.\n"
+ + " process isAlive="
+ + process.isAlive(),
+ bos));
+ }
+ bos.write(b);
+ b = input.read();
+ read++;
+ }
+ if (read >= MAX_BYTES_BEFORE_READY) {
+ throw new RuntimeException(
+ getMsg(
+ "pipesClientId="
+ + pipesClientId
+ + ": "
+ + "Couldn't start server: read too many bytes before 'ready' byte.\n"
+ + " Make absolutely certain that your logger is not writing to "
+ + "stdout.\n",
+ bos));
+ }
+ if (bos.size() > 0) {
+ LOG.warn(
+ "pipesClientId={}: From forked process before start byte: {}",
+ pipesClientId,
+ bos.toString(StandardCharsets.UTF_8));
+ }
+ return 1;
+ });
long start = System.currentTimeMillis();
executorService.submit(futureTask);
try {
@@ -478,10 +541,13 @@
throw new RuntimeException(e);
} catch (TimeoutException e) {
long elapsed = System.currentTimeMillis() - start;
- LOG.error("pipesClientId={} didn't receive ready byte from server within " +
- "StartupTimeoutMillis {}; ms elapsed {}; did read >{}<",
- pipesClientId, pipesConfig.getStartupTimeoutMillis(),
- elapsed, bos.toString(StandardCharsets.UTF_8));
+ LOG.error(
+ "pipesClientId={} didn't receive ready byte from server within "
+ + "StartupTimeoutMillis {}; ms elapsed {}; did read >{}<",
+ pipesClientId,
+ pipesConfig.getStartupTimeoutMillis(),
+ elapsed,
+ bos.toString(StandardCharsets.UTF_8));
destroyForcibly();
throw e;
} finally {
@@ -513,8 +579,8 @@
if (arg.equals("-cp") || arg.equals("--classpath")) {
hasClassPath = true;
}
- if (arg.equals("-XX:+ExitOnOutOfMemoryError") ||
- arg.equals("-XX:+CrashOnOutOfMemoryError")) {
+ if (arg.equals("-XX:+ExitOnOutOfMemoryError")
+ || arg.equals("-XX:+CrashOnOutOfMemoryError")) {
hasExitOnOOM = true;
}
if (arg.startsWith("-Dlog4j.configuration")) {
@@ -543,9 +609,9 @@
}
if (hasExitOnOOM) {
LOG.warn(
- "I notice that you have an exit/crash on OOM. If you run heavy external processes " +
- "like tesseract, this setting may result in orphaned processes which could be disastrous" +
- " for performance.");
+ "I notice that you have an exit/crash on OOM. If you run heavy external processes "
+ + "like tesseract, this setting may result in orphaned processes which could be disastrous"
+ + " for performance.");
}
if (!hasLog4j) {
commandLine.add(
@@ -554,8 +620,9 @@
commandLine.add("-DpipesClientId=" + pipesClientId);
commandLine.addAll(configArgs);
commandLine.add("org.apache.tika.pipes.PipesServer");
- commandLine.add(ProcessUtils.escapeCommandLine(
- pipesConfig.getTikaConfig().toAbsolutePath().toString()));
+ commandLine.add(
+ ProcessUtils.escapeCommandLine(
+ pipesConfig.getTikaConfig().toAbsolutePath().toString()));
commandLine.add(Long.toString(pipesConfig.getMaxForEmitBatchBytes()));
commandLine.add(Long.toString(pipesConfig.getTimeoutMillis()));
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java
index 06783d6..4bfcbed 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfig.java
@@ -21,12 +21,10 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Set;
-
+import org.apache.tika.exception.TikaConfigException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import org.apache.tika.exception.TikaConfigException;
-
public class PipesConfig extends PipesConfigBase {
private static final Logger LOG = LoggerFactory.getLogger(PipesClient.class);
@@ -39,16 +37,16 @@
Set<String> settings = pipesConfig.configure("pipes", is);
}
if (pipesConfig.getTikaConfig() == null) {
- LOG.debug("A separate tikaConfig was not specified in the <pipes/> element in the " +
- "config file; will use {} for pipes", tikaConfig);
+ LOG.debug(
+ "A separate tikaConfig was not specified in the <pipes/> element in the "
+ + "config file; will use {} for pipes",
+ tikaConfig);
pipesConfig.setTikaConfig(tikaConfig);
}
return pipesConfig;
}
- private PipesConfig() {
-
- }
+ private PipesConfig() {}
public long getMaxWaitForClientMillis() {
return maxWaitForClientMillis;
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java
index bf6a6bb..2e19c64 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesConfigBase.java
@@ -21,15 +21,13 @@
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
-
import org.apache.tika.config.ConfigBase;
public class PipesConfigBase extends ConfigBase {
/**
- * default size to send back to the PipesClient for batch
- * emitting. If an extract is larger than this, it will be emitted
- * directly from the forked PipesServer.
+ * default size to send back to the PipesClient for batch emitting. If an extract is larger than
+ * this, it will be emitted directly from the forked PipesServer.
*/
public static final long DEFAULT_MAX_FOR_EMIT_BATCH = 100000;
@@ -43,8 +41,8 @@
public static final int DEFAULT_MAX_FILES_PROCESSED_PER_PROCESS = 10000;
- //if an extract is larger than this, the forked PipesServer should
- //emit the extract directly and not send the contents back to the PipesClient
+ // if an extract is larger than this, the forked PipesServer should
+ // emit the extract directly and not send the contents back to the PipesClient
private long maxForEmitBatchBytes = DEFAULT_MAX_FOR_EMIT_BATCH;
private long timeoutMillis = DEFAULT_TIMEOUT_MILLIS;
private long startupTimeoutMillis = DEFAULT_STARTUP_TIMEOUT_MILLIS;
@@ -65,6 +63,7 @@
/**
* How long to wait in milliseconds before timing out the forked process.
+ *
* @param timeoutMillis
*/
public void setTimeoutMillis(long timeoutMillis) {
@@ -76,8 +75,7 @@
}
/**
- * If the client has been inactive after this many milliseconds,
- * shut it down.
+ * If the client has been inactive after this many milliseconds, shut it down.
*
* @param shutdownClientAfterMillis
*/
@@ -94,7 +92,7 @@
}
public List<String> getForkedJvmArgs() {
- //defensive copy
+ // defensive copy
List<String> ret = new ArrayList<>();
ret.addAll(forkedJvmArgs);
return ret;
@@ -109,8 +107,9 @@
}
/**
- * Restart the forked PipesServer after it has processed this many files to avoid
- * slow-building memory leaks.
+ * Restart the forked PipesServer after it has processed this many files to avoid slow-building
+ * memory leaks.
+ *
* @return
*/
public int getMaxFilesProcessedPerProcess() {
@@ -146,13 +145,12 @@
}
/**
- * What is the maximum bytes size per extract that
- * will be allowed to be shipped back to the emit queue in the forking process.
- * If an extract is too big, skip the emit queue and forward it directly from the
- * forked PipesServer.
- * If set to <code>0</code>, this will never send an extract back for batch emitting,
- * but will always emit the extract directly from the forked PipeServer.
- * If set to <code>-1</code>, this will always send the extract back for batch emitting.
+ * What is the maximum bytes size per extract that will be allowed to be shipped back to the
+ * emit queue in the forking process. If an extract is too big, skip the emit queue and forward
+ * it directly from the forked PipesServer. If set to <code>0</code>, this will never send an
+ * extract back for batch emitting, but will always emit the extract directly from the forked
+ * PipesServer. If set to <code>-1</code>, this will always send the extract back for batch
+ * emitting.
*
* @return the threshold extract size at which to emit directly from the forked PipeServer
*/
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesException.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesException.java
index ee9545f..e6066f5 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesException.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesException.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.pipes;
-/**
- * Fatal exception that means that something went seriously wrong.
- */
+/** Fatal exception that means that something went seriously wrong. */
public class PipesException extends Exception {
public PipesException(Throwable t) {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesParser.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesParser.java
index 8446983..72bfe21 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesParser.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesParser.java
@@ -25,11 +25,9 @@
public class PipesParser implements Closeable {
-
private final PipesConfig pipesConfig;
private final List<PipesClient> clients = new ArrayList<>();
- private final ArrayBlockingQueue<PipesClient> clientQueue ;
-
+ private final ArrayBlockingQueue<PipesClient> clientQueue;
public PipesParser(PipesConfig pipesConfig) {
this.pipesConfig = pipesConfig;
@@ -41,12 +39,13 @@
}
}
- public PipesResult parse(FetchEmitTuple t) throws InterruptedException,
- PipesException, IOException {
+ public PipesResult parse(FetchEmitTuple t)
+ throws InterruptedException, PipesException, IOException {
PipesClient client = null;
try {
- client = clientQueue.poll(pipesConfig.getMaxWaitForClientMillis(),
- TimeUnit.MILLISECONDS);
+ client =
+ clientQueue.poll(
+ pipesConfig.getMaxWaitForClientMillis(), TimeUnit.MILLISECONDS);
if (client == null) {
return PipesResult.CLIENT_UNAVAILABLE_WITHIN_MS;
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java
index 3978039..69f63f2 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesReporter.java
@@ -18,81 +18,76 @@
import java.io.Closeable;
import java.io.IOException;
-
import org.apache.tika.pipes.pipesiterator.TotalCountResult;
/**
- * This is called asynchronously by the AsyncProcessor. This
- * is not thread safe, and implementers must be careful to implement
- * {@link #report(FetchEmitTuple, PipesResult, long)} in a thread safe
+ * This is called asynchronously by the AsyncProcessor. This is not thread safe, and implementers
+ * must be careful to implement {@link #report(FetchEmitTuple, PipesResult, long)} in a thread safe
* way.
- * <p/>
- * Note, however, that this is not called in the forked processes.
- * Implementers do not have to worry about synchronizing across processes;
- * for example, one could use an in-memory h2 database as a target.
+ *
+ * <p>Note, however, that this is not called in the forked processes. Implementers do not have to
+ * worry about synchronizing across processes; for example, one could use an in-memory h2 database
+ * as a target.
*/
public abstract class PipesReporter implements Closeable {
- public static final PipesReporter NO_OP_REPORTER = new PipesReporter() {
+ public static final PipesReporter NO_OP_REPORTER =
+ new PipesReporter() {
- @Override
- public void report(FetchEmitTuple t, PipesResult result, long elapsed) {
+ @Override
+ public void report(FetchEmitTuple t, PipesResult result, long elapsed) {}
- }
+ @Override
+ public void error(Throwable t) {}
- @Override
- public void error(Throwable t) {
+ @Override
+ public void error(String msg) {}
+ };
- }
-
- @Override
- public void error(String msg) {
-
- }
- };
-
- //Implementers are responsible for preventing reporting after
- //crashes if that is the desired behavior.
+ // Implementers are responsible for preventing reporting after
+ // crashes if that is the desired behavior.
public abstract void report(FetchEmitTuple t, PipesResult result, long elapsed);
-
/**
- * No-op implementation. Override for custom behavior
- * and make sure to override {@link #supportsTotalCount()}
- * to return <code>true</code>
+ * No-op implementation. Override for custom behavior and make sure to override {@link
+ * #supportsTotalCount()} to return <code>true</code>
+ *
* @param totalCountResult
*/
- public void report(TotalCountResult totalCountResult) {
-
- }
+ public void report(TotalCountResult totalCountResult) {}
/**
* Override this if your reporter supports total count.
+ *
* @return <code>false</code> as the baseline implementation
*/
public boolean supportsTotalCount() {
return false;
}
+
/**
- * No-op implementation. Override for custom behavior
+ * No-op implementation. Override for custom behavior
+ *
* @throws IOException
*/
@Override
public void close() throws IOException {
- //no-op
+ // no-op
}
/**
- * This is called if the process has crashed.
- * Implementers should not rely on close() to be called after this.
+ * This is called if the process has crashed. Implementers should not rely on close() to be
+ * called after this.
+ *
* @param t
*/
public abstract void error(Throwable t);
+
/**
- * This is called if the process has crashed.
- * Implementers should not rely on close() to be called after this.
+ * This is called if the process has crashed. Implementers should not rely on close() to be
+ * called after this.
+ *
* @param msg
*/
public abstract void error(String msg);
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java
index 3dcddfa..8a52060 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesReporterBase.java
@@ -20,16 +20,13 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
import org.apache.tika.exception.TikaConfigException;
-/**
- * Base class that includes filtering by {@link PipesResult.STATUS}
- */
+/** Base class that includes filtering by {@link PipesResult.STATUS} */
public abstract class PipesReporterBase extends PipesReporter implements Initializable {
private final Set<PipesResult.STATUS> includes = new HashSet<>();
@@ -42,11 +39,12 @@
statusFilter = buildStatusFilter(includes, excludes);
}
- private StatusFilter buildStatusFilter(Set<PipesResult.STATUS> includes,
- Set<PipesResult.STATUS> excludes) throws TikaConfigException {
+ private StatusFilter buildStatusFilter(
+ Set<PipesResult.STATUS> includes, Set<PipesResult.STATUS> excludes)
+ throws TikaConfigException {
if (includes.size() > 0 && excludes.size() > 0) {
- throw new TikaConfigException("Only one of includes and excludes may have any " +
- "contents");
+ throw new TikaConfigException(
+ "Only one of includes and excludes may have any " + "contents");
}
if (includes.size() > 0) {
return new IncludesFilter(includes);
@@ -58,12 +56,11 @@
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
- throws TikaConfigException {
-
- }
+ throws TikaConfigException {}
/**
* Implementations must call this for the includes/excludes filters to work!
+ *
* @param status
* @return
*/
@@ -150,6 +147,4 @@
return true;
}
}
-
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesResult.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesResult.java
index 639bfc4..29fe3ed 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesResult.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesResult.java
@@ -27,14 +27,19 @@
FETCHER_INITIALIZATION_EXCEPTION,
FETCH_EXCEPTION,
EMPTY_OUTPUT,
- PARSE_EXCEPTION_NO_EMIT, //within the pipes server
- PARSE_EXCEPTION_EMIT, //within the pipes server
- PARSE_SUCCESS, //when passed back to the async processor for emit
- PARSE_SUCCESS_WITH_EXCEPTION,//when passed back to the async processor for emit
- OOM, TIMEOUT, UNSPECIFIED_CRASH,
+ PARSE_EXCEPTION_NO_EMIT, // within the pipes server
+ PARSE_EXCEPTION_EMIT, // within the pipes server
+ PARSE_SUCCESS, // when passed back to the async processor for emit
+ PARSE_SUCCESS_WITH_EXCEPTION, // when passed back to the async processor for emit
+ OOM,
+ TIMEOUT,
+ UNSPECIFIED_CRASH,
NO_EMITTER_FOUND,
- EMIT_SUCCESS, EMIT_SUCCESS_PARSE_EXCEPTION, EMIT_EXCEPTION,
- INTERRUPTED_EXCEPTION, NO_FETCHER_FOUND,
+ EMIT_SUCCESS,
+ EMIT_SUCCESS_PARSE_EXCEPTION,
+ EMIT_EXCEPTION,
+ INTERRUPTED_EXCEPTION,
+ NO_FETCHER_FOUND,
INTERMEDIATE_RESULT;
}
@@ -44,9 +49,9 @@
public static final PipesResult OOM = new PipesResult(STATUS.OOM);
public static final PipesResult UNSPECIFIED_CRASH = new PipesResult(STATUS.UNSPECIFIED_CRASH);
public static final PipesResult EMIT_SUCCESS = new PipesResult(STATUS.EMIT_SUCCESS);
- public static final PipesResult INTERRUPTED_EXCEPTION = new PipesResult(STATUS.INTERRUPTED_EXCEPTION);
- public static final PipesResult EMPTY_OUTPUT =
- new PipesResult(STATUS.EMPTY_OUTPUT);
+ public static final PipesResult INTERRUPTED_EXCEPTION =
+ new PipesResult(STATUS.INTERRUPTED_EXCEPTION);
+ public static final PipesResult EMPTY_OUTPUT = new PipesResult(STATUS.EMPTY_OUTPUT);
private final STATUS status;
private final EmitData emitData;
private final String message;
@@ -80,8 +85,7 @@
}
/**
- * This assumes that the message is a stack trace (container
- * parse exception).
+ * This assumes that the message is a stack trace (container parse exception).
*
* @param emitData
* @param message
@@ -108,7 +112,16 @@
@Override
public String toString() {
- return "PipesResult{" + "intermediate=" + intermediate + ", status=" + status +
- ", emitData=" + emitData + ", message='" + message + '\'' + '}';
+ return "PipesResult{"
+ + "intermediate="
+ + intermediate
+ + ", status="
+ + status
+ + ", emitData="
+ + emitData
+ + ", message='"
+ + message
+ + '\''
+ + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
index 991694f..066cdfb 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/PipesServer.java
@@ -31,15 +31,9 @@
import java.util.Collections;
import java.util.List;
import java.util.Optional;
-
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
@@ -79,30 +73,46 @@
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.utils.ExceptionUtils;
import org.apache.tika.utils.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * This server is forked from the PipesClient. This class isolates
- * parsing from the client to protect the primary JVM.
- * <p>
- * When configuring logging for this class, make absolutely certain
- * not to write to STDOUT. This class uses STDOUT to communicate with
- * the PipesClient.
+ * This server is forked from the PipesClient. This class isolates parsing from the client to
+ * protect the primary JVM.
+ *
+ * <p>When configuring logging for this class, make absolutely certain not to write to STDOUT. This
+ * class uses STDOUT to communicate with the PipesClient.
*/
public class PipesServer implements Runnable {
private static final Logger LOG = LoggerFactory.getLogger(PipesServer.class);
- //this has to be some number not close to 0-3
- //it looks like the server crashes with exit value 3 on OOM, for example
+ // this has to be some number not close to 0-3
+ // it looks like the server crashes with exit value 3 on OOM, for example
public static final int TIMEOUT_EXIT_CODE = 17;
private DigestingParser.Digester digester;
private Detector detector;
public enum STATUS {
- READY, CALL, PING, FAILED_TO_START, FETCHER_NOT_FOUND, EMITTER_NOT_FOUND,
- FETCHER_INITIALIZATION_EXCEPTION, FETCH_EXCEPTION, PARSE_SUCCESS, PARSE_EXCEPTION_NO_EMIT,
- EMIT_SUCCESS, EMIT_SUCCESS_PARSE_EXCEPTION, EMIT_EXCEPTION, OOM, TIMEOUT, EMPTY_OUTPUT,
+ READY,
+ CALL,
+ PING,
+ FAILED_TO_START,
+ FETCHER_NOT_FOUND,
+ EMITTER_NOT_FOUND,
+ FETCHER_INITIALIZATION_EXCEPTION,
+ FETCH_EXCEPTION,
+ PARSE_SUCCESS,
+ PARSE_EXCEPTION_NO_EMIT,
+ EMIT_SUCCESS,
+ EMIT_SUCCESS_PARSE_EXCEPTION,
+ EMIT_EXCEPTION,
+ OOM,
+ TIMEOUT,
+ EMPTY_OUTPUT,
INTERMEDIATE_RESULT;
byte getByte() {
@@ -129,9 +139,9 @@
private final Path tikaConfigPath;
private final DataInputStream input;
private final DataOutputStream output;
- //if an extract is larger than this value, emit it directly;
- //if it is smaller than this value, write it back to the
- //PipesClient so that it can cache the extracts and then batch emit.
+ // if an extract is larger than this value, emit it directly;
+ // if it is smaller than this value, write it back to the
+ // PipesClient so that it can cache the extracts and then batch emit.
private final long maxForEmitBatchBytes;
private final long serverParseTimeoutMillis;
private final long serverWaitTimeoutMillis;
@@ -143,10 +153,13 @@
private volatile boolean parsing;
private volatile long since;
-
- public PipesServer(Path tikaConfigPath, InputStream in, PrintStream out,
- long maxForEmitBatchBytes, long serverParseTimeoutMillis,
- long serverWaitTimeoutMillis)
+ public PipesServer(
+ Path tikaConfigPath,
+ InputStream in,
+ PrintStream out,
+ long maxForEmitBatchBytes,
+ long serverParseTimeoutMillis,
+ long serverWaitTimeoutMillis)
throws IOException, TikaException, SAXException {
this.tikaConfigPath = tikaConfigPath;
this.input = new DataInputStream(in);
@@ -158,7 +171,6 @@
this.since = System.currentTimeMillis();
}
-
public static void main(String[] args) throws Exception {
try {
Path tikaConfig = Paths.get(args[0]);
@@ -167,8 +179,13 @@
long serverWaitTimeoutMillis = Long.parseLong(args[3]);
PipesServer server =
- new PipesServer(tikaConfig, System.in, System.out, maxForEmitBatchBytes,
- serverParseTimeoutMillis, serverWaitTimeoutMillis);
+ new PipesServer(
+ tikaConfig,
+ System.in,
+ System.out,
+ maxForEmitBatchBytes,
+ serverParseTimeoutMillis,
+ serverWaitTimeoutMillis);
System.setIn(new UnsynchronizedByteArrayInputStream(new byte[0]));
System.setOut(System.err);
Thread watchdog = new Thread(server, "Tika Watchdog");
@@ -188,11 +205,14 @@
synchronized (lock) {
long elapsed = System.currentTimeMillis() - since;
if (parsing && elapsed > serverParseTimeoutMillis) {
- LOG.warn("timeout server; elapsed {} with {}", elapsed,
+ LOG.warn(
+ "timeout server; elapsed {} with {}",
+ elapsed,
serverParseTimeoutMillis);
exit(TIMEOUT_EXIT_CODE);
- } else if (!parsing && serverWaitTimeoutMillis > 0 &&
- elapsed > serverWaitTimeoutMillis) {
+ } else if (!parsing
+ && serverWaitTimeoutMillis > 0
+ && elapsed > serverWaitTimeoutMillis) {
LOG.info("closing down from inactivity");
exit(0);
}
@@ -206,12 +226,13 @@
public void processRequests() {
LOG.debug("processing requests {}");
- //initialize
+ // initialize
try {
long start = System.currentTimeMillis();
initializeResources();
if (LOG.isTraceEnabled()) {
- LOG.trace("timer -- initialize parser and other resources: {} ms",
+ LOG.trace(
+ "timer -- initialize parser and other resources: {} ms",
System.currentTimeMillis() - start);
}
LOG.debug("pipes server initialized");
@@ -225,7 +246,7 @@
}
return;
}
- //main loop
+ // main loop
try {
write(STATUS.READY);
long start = System.currentTimeMillis();
@@ -263,8 +284,8 @@
}
/**
- * returns stack trace if there was a container exception or empty string
- * if there was no stacktrace
+ * returns stack trace if there was a container exception or empty string if there was no
+ * stacktrace
*
* @param t
* @param metadataList
@@ -278,10 +299,12 @@
return (stack != null) ? stack : StringUtils.EMPTY;
}
-
- private void emit(String taskId, EmitKey emitKey,
- boolean isExtractEmbeddedBytes, MetadataListAndEmbeddedBytes parseData,
- String parseExceptionStack) {
+ private void emit(
+ String taskId,
+ EmitKey emitKey,
+ boolean isExtractEmbeddedBytes,
+ MetadataListAndEmbeddedBytes parseData,
+ String parseExceptionStack) {
Emitter emitter = null;
try {
@@ -293,8 +316,7 @@
return;
}
try {
- if (isExtractEmbeddedBytes &&
- parseData.toBePackagedForStreamEmitter()) {
+ if (isExtractEmbeddedBytes && parseData.toBePackagedForStreamEmitter()) {
emitContentsAndBytes(emitter, emitKey, parseData);
} else {
emitter.emit(emitKey.getEmitKey(), parseData.getMetadataList());
@@ -303,25 +325,28 @@
LOG.warn("emit exception", e);
String msg = ExceptionUtils.getStackTrace(e);
byte[] bytes = msg.getBytes(StandardCharsets.UTF_8);
- //for now, we're hiding the parse exception if there was also an emit exception
+ // for now, we're hiding the parse exception if there was also an emit exception
write(STATUS.EMIT_EXCEPTION, bytes);
return;
}
if (StringUtils.isBlank(parseExceptionStack)) {
write(STATUS.EMIT_SUCCESS);
} else {
- write(STATUS.EMIT_SUCCESS_PARSE_EXCEPTION,
+ write(
+ STATUS.EMIT_SUCCESS_PARSE_EXCEPTION,
parseExceptionStack.getBytes(StandardCharsets.UTF_8));
}
}
- private void emitContentsAndBytes(Emitter emitter, EmitKey emitKey,
- MetadataListAndEmbeddedBytes parseData) {
+ private void emitContentsAndBytes(
+ Emitter emitter, EmitKey emitKey, MetadataListAndEmbeddedBytes parseData) {
if (!(emitter instanceof StreamEmitter)) {
- throw new IllegalArgumentException("The emitter for embedded document byte store must" +
- " be a StreamEmitter. I see: " + emitter.getClass());
+ throw new IllegalArgumentException(
+ "The emitter for embedded document byte store must"
+ + " be a StreamEmitter. I see: "
+ + emitter.getClass());
}
- //TODO: implement this
+ // TODO: implement this
throw new UnsupportedOperationException("this is not yet implemented");
}
@@ -335,8 +360,8 @@
long start = System.currentTimeMillis();
t = readFetchEmitTuple();
if (LOG.isTraceEnabled()) {
- LOG.trace("timer -- read fetchEmitTuple: {} ms",
- System.currentTimeMillis() - start);
+ LOG.trace(
+ "timer -- read fetchEmitTuple: {} ms", System.currentTimeMillis() - start);
}
start = System.currentTimeMillis();
actuallyParse(t);
@@ -358,7 +383,7 @@
long start = System.currentTimeMillis();
Fetcher fetcher = getFetcher(t);
if (fetcher == null) {
- //rely on proper logging/exception handling in getFetcher
+ // rely on proper logging/exception handling in getFetcher
return;
}
@@ -371,7 +396,7 @@
MetadataListAndEmbeddedBytes parseData = null;
try {
- //this can be null if there is a fetch exception
+ // this can be null if there is a fetch exception
parseData = parseFromTuple(t, fetcher);
if (LOG.isTraceEnabled()) {
@@ -385,8 +410,9 @@
emitParseData(t, parseData);
} finally {
- if (parseData != null && parseData.hasEmbeddedDocumentByteStore() &&
- parseData.getEmbeddedDocumentBytesHandler() instanceof Closeable) {
+ if (parseData != null
+ && parseData.hasEmbeddedDocumentByteStore()
+ && parseData.getEmbeddedDocumentBytesHandler() instanceof Closeable) {
try {
((Closeable) parseData.getEmbeddedDocumentBytesHandler()).close();
} catch (IOException e) {
@@ -399,10 +425,10 @@
private void emitParseData(FetchEmitTuple t, MetadataListAndEmbeddedBytes parseData) {
long start = System.currentTimeMillis();
String stack = getContainerStacktrace(t, parseData.getMetadataList());
- //we need to apply this after we pull out the stacktrace
+ // we need to apply this after we pull out the stacktrace
filterMetadata(parseData.getMetadataList());
- if (StringUtils.isBlank(stack) ||
- t.getOnParseException() == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) {
+ if (StringUtils.isBlank(stack)
+ || t.getOnParseException() == FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT) {
injectUserMetadata(t.getMetadata(), parseData.getMetadataList());
EmitKey emitKey = t.getEmitKey();
if (StringUtils.isBlank(emitKey.getEmitKey())) {
@@ -410,16 +436,24 @@
t.setEmitKey(emitKey);
}
EmitData emitData = new EmitData(t.getEmitKey(), parseData.getMetadataList(), stack);
- if (t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes() &&
- parseData.toBePackagedForStreamEmitter()) {
- emit(t.getId(), emitKey, t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(),
- parseData, stack);
- } else if (maxForEmitBatchBytes >= 0 &&
- emitData.getEstimatedSizeBytes() >= maxForEmitBatchBytes) {
- emit(t.getId(), emitKey, t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(),
- parseData, stack);
+ if (t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes()
+ && parseData.toBePackagedForStreamEmitter()) {
+ emit(
+ t.getId(),
+ emitKey,
+ t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(),
+ parseData,
+ stack);
+ } else if (maxForEmitBatchBytes >= 0
+ && emitData.getEstimatedSizeBytes() >= maxForEmitBatchBytes) {
+ emit(
+ t.getId(),
+ emitKey,
+ t.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes(),
+ parseData,
+ stack);
} else {
- //send back to the client
+ // send back to the client
write(emitData);
}
if (LOG.isTraceEnabled()) {
@@ -463,8 +497,13 @@
"fetch key has a range, but the fetcher is not a range fetcher");
}
Metadata metadata = new Metadata();
- try (InputStream stream = ((RangeFetcher) fetcher).fetch(fetchKey.getFetchKey(),
- fetchKey.getRangeStart(), fetchKey.getRangeEnd(), metadata)) {
+ try (InputStream stream =
+ ((RangeFetcher) fetcher)
+ .fetch(
+ fetchKey.getFetchKey(),
+ fetchKey.getRangeStart(),
+ fetchKey.getRangeEnd(),
+ metadata)) {
return parseWithStream(t, stream, metadata);
} catch (SecurityException e) {
LOG.error("security exception " + t.getId(), e);
@@ -518,87 +557,105 @@
return sb.toString();
}
-
private void handleOOM(String taskId, OutOfMemoryError oom) {
write(STATUS.OOM);
LOG.error("oom: " + taskId, oom);
exit(1);
}
- private MetadataListAndEmbeddedBytes parseWithStream(FetchEmitTuple fetchEmitTuple,
- InputStream stream, Metadata metadata)
+ private MetadataListAndEmbeddedBytes parseWithStream(
+ FetchEmitTuple fetchEmitTuple, InputStream stream, Metadata metadata)
throws TikaConfigException {
HandlerConfig handlerConfig = fetchEmitTuple.getHandlerConfig();
List<Metadata> metadataList;
- //this adds the EmbeddedDocumentByteStore to the parsecontext
+ // this adds the EmbeddedDocumentByteStore to the parsecontext
ParseContext parseContext = createParseContext(fetchEmitTuple);
if (handlerConfig.getParseMode() == HandlerConfig.PARSE_MODE.RMETA) {
metadataList =
parseRecursive(fetchEmitTuple, handlerConfig, stream, metadata, parseContext);
} else {
- metadataList = parseConcatenated(fetchEmitTuple, handlerConfig, stream, metadata,
- parseContext);
+ metadataList =
+ parseConcatenated(
+ fetchEmitTuple, handlerConfig, stream, metadata, parseContext);
}
- return new MetadataListAndEmbeddedBytes(metadataList,
- parseContext.get(EmbeddedDocumentBytesHandler.class));
+ return new MetadataListAndEmbeddedBytes(
+ metadataList, parseContext.get(EmbeddedDocumentBytesHandler.class));
}
private ParseContext createParseContext(FetchEmitTuple fetchEmitTuple)
throws TikaConfigException {
ParseContext parseContext = new ParseContext();
- if (! fetchEmitTuple.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes()) {
+ if (!fetchEmitTuple.getEmbeddedDocumentBytesConfig().isExtractEmbeddedDocumentBytes()) {
return parseContext;
}
- EmbeddedDocumentExtractorFactory factory = ((AutoDetectParser)autoDetectParser)
- .getAutoDetectParserConfig().getEmbeddedDocumentExtractorFactory();
+ EmbeddedDocumentExtractorFactory factory =
+ ((AutoDetectParser) autoDetectParser)
+ .getAutoDetectParserConfig()
+ .getEmbeddedDocumentExtractorFactory();
if (factory == null) {
- parseContext.set(EmbeddedDocumentExtractor.class, new RUnpackExtractor(parseContext,
- RUnpackExtractorFactory.DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION));
+ parseContext.set(
+ EmbeddedDocumentExtractor.class,
+ new RUnpackExtractor(
+ parseContext,
+ RUnpackExtractorFactory.DEFAULT_MAX_EMBEDDED_BYTES_FOR_EXTRACTION));
} else {
- if (! (factory instanceof EmbeddedDocumentByteStoreExtractorFactory)) {
- throw new TikaConfigException("EmbeddedDocumentExtractorFactory must be an " +
- "instance of EmbeddedDocumentByteStoreExtractorFactory if you want" +
- "to extract embedded bytes! I see this embedded doc factory: " +
- factory.getClass() + "and a request: " +
- fetchEmitTuple.getEmbeddedDocumentBytesConfig());
+ if (!(factory instanceof EmbeddedDocumentByteStoreExtractorFactory)) {
+ throw new TikaConfigException(
+ "EmbeddedDocumentExtractorFactory must be an "
+ + "instance of EmbeddedDocumentByteStoreExtractorFactory if you want "
+ + "to extract embedded bytes! I see this embedded doc factory: "
+ + factory.getClass()
+ + " and a request: "
+ + fetchEmitTuple.getEmbeddedDocumentBytesConfig());
}
}
- //TODO: especially clean this up.
+ // TODO: especially clean this up.
if (!StringUtils.isBlank(fetchEmitTuple.getEmbeddedDocumentBytesConfig().getEmitter())) {
- parseContext.set(EmbeddedDocumentBytesHandler.class,
- new EmittingEmbeddedDocumentBytesHandler(fetchEmitTuple.getEmitKey(),
- fetchEmitTuple.getEmbeddedDocumentBytesConfig(), emitterManager));
+ parseContext.set(
+ EmbeddedDocumentBytesHandler.class,
+ new EmittingEmbeddedDocumentBytesHandler(
+ fetchEmitTuple.getEmitKey(),
+ fetchEmitTuple.getEmbeddedDocumentBytesConfig(),
+ emitterManager));
} else {
- parseContext.set(EmbeddedDocumentBytesHandler.class,
+ parseContext.set(
+ EmbeddedDocumentBytesHandler.class,
new BasicEmbeddedDocumentBytesHandler(
- fetchEmitTuple.getEmbeddedDocumentBytesConfig()));
+ fetchEmitTuple.getEmbeddedDocumentBytesConfig()));
}
return parseContext;
}
- private List<Metadata> parseConcatenated(FetchEmitTuple fetchEmitTuple,
- HandlerConfig handlerConfig, InputStream stream,
- Metadata metadata, ParseContext parseContext) {
+ private List<Metadata> parseConcatenated(
+ FetchEmitTuple fetchEmitTuple,
+ HandlerConfig handlerConfig,
+ InputStream stream,
+ Metadata metadata,
+ ParseContext parseContext) {
ContentHandlerFactory contentHandlerFactory =
- new BasicContentHandlerFactory(handlerConfig.getType(),
- handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(),
+ new BasicContentHandlerFactory(
+ handlerConfig.getType(),
+ handlerConfig.getWriteLimit(),
+ handlerConfig.isThrowOnWriteLimitReached(),
parseContext);
ContentHandler handler = contentHandlerFactory.getNewContentHandler();
- parseContext.set(DocumentSelector.class, new DocumentSelector() {
- final int maxEmbedded = handlerConfig.maxEmbeddedResources;
- int embedded = 0;
+ parseContext.set(
+ DocumentSelector.class,
+ new DocumentSelector() {
+ final int maxEmbedded = handlerConfig.maxEmbeddedResources;
+ int embedded = 0;
- @Override
- public boolean select(Metadata metadata) {
- if (maxEmbedded < 0) {
- return true;
- }
- return embedded++ < maxEmbedded;
- }
- });
+ @Override
+ public boolean select(Metadata metadata) {
+ if (maxEmbedded < 0) {
+ return true;
+ }
+ return embedded++ < maxEmbedded;
+ }
+ });
String containerException = null;
long start = System.currentTimeMillis();
@@ -629,15 +686,22 @@
return Collections.singletonList(metadata);
}
- private List<Metadata> parseRecursive(FetchEmitTuple fetchEmitTuple,
- HandlerConfig handlerConfig, InputStream stream,
- Metadata metadata, ParseContext parseContext) {
- //Intentionally do not add the metadata filter here!
- //We need to let stacktraces percolate
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(handlerConfig.getType(),
- handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(),
- parseContext), handlerConfig.getMaxEmbeddedResources());
+ private List<Metadata> parseRecursive(
+ FetchEmitTuple fetchEmitTuple,
+ HandlerConfig handlerConfig,
+ InputStream stream,
+ Metadata metadata,
+ ParseContext parseContext) {
+ // Intentionally do not add the metadata filter here!
+ // We need to let stacktraces percolate
+ RecursiveParserWrapperHandler handler =
+ new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(
+ handlerConfig.getType(),
+ handlerConfig.getWriteLimit(),
+ handlerConfig.isThrowOnWriteLimitReached(),
+ parseContext),
+ handlerConfig.getMaxEmbeddedResources());
long start = System.currentTimeMillis();
@@ -661,8 +725,8 @@
return handler.getMetadataList();
}
- private void preParse(FetchEmitTuple t, InputStream stream, Metadata metadata,
- ParseContext parseContext) {
+ private void preParse(
+ FetchEmitTuple t, InputStream stream, Metadata metadata, ParseContext parseContext) {
TemporaryResources tmp = null;
try {
TikaInputStream tis = TikaInputStream.cast(stream);
@@ -673,12 +737,12 @@
} finally {
IOUtils.closeQuietly(tmp);
}
- //do we want to filter the metadata to digest, length, content-type?
+ // do we want to filter the metadata to digest, length, content-type?
writeIntermediate(t.getEmitKey(), metadata);
}
- private void _preParse(FetchEmitTuple t, TikaInputStream tis, Metadata metadata,
- ParseContext parseContext) {
+ private void _preParse(
+ FetchEmitTuple t, TikaInputStream tis, Metadata metadata, ParseContext parseContext) {
if (digester != null) {
try {
digester.digest(tis, metadata, parseContext);
@@ -694,8 +758,8 @@
LOG.warn("problem detecting: " + t.getId(), e);
}
- if (t.getEmbeddedDocumentBytesConfig() != null &&
- t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) {
+ if (t.getEmbeddedDocumentBytesConfig() != null
+ && t.getEmbeddedDocumentBytesConfig().isIncludeOriginal()) {
EmbeddedDocumentBytesHandler embeddedDocumentByteStore =
parseContext.get(EmbeddedDocumentBytesHandler.class);
try (InputStream is = Files.newInputStream(tis.getPath())) {
@@ -708,7 +772,7 @@
private void injectUserMetadata(Metadata userMetadata, List<Metadata> metadataList) {
for (String n : userMetadata.names()) {
- //overwrite whatever was there
+ // overwrite whatever was there
metadataList.get(0).set(n, null);
for (String val : userMetadata.getValues(n)) {
metadataList.get(0).add(n, val);
@@ -725,14 +789,13 @@
System.exit(exitCode);
}
-
private FetchEmitTuple readFetchEmitTuple() {
try {
int length = input.readInt();
byte[] bytes = new byte[length];
input.readFully(bytes);
- try (ObjectInputStream objectInputStream = new ObjectInputStream(
- new UnsynchronizedByteArrayInputStream(bytes))) {
+ try (ObjectInputStream objectInputStream =
+ new ObjectInputStream(new UnsynchronizedByteArrayInputStream(bytes))) {
return (FetchEmitTuple) objectInputStream.readObject();
}
} catch (IOException e) {
@@ -742,16 +805,16 @@
LOG.error("can't find class?!", e);
exit(1);
}
- //unreachable, no?!
+ // unreachable, no?!
return null;
}
protected void initializeResources() throws TikaException, IOException, SAXException {
- //TODO allowed named configurations in tika config
+ // TODO allowed named configurations in tika config
this.tikaConfig = new TikaConfig(tikaConfigPath);
this.fetcherManager = FetcherManager.load(tikaConfigPath);
- //skip initialization of the emitters if emitting
- //from the pipesserver is turned off.
+ // skip initialization of the emitters if emitting
+ // from the pipesserver is turned off.
if (maxForEmitBatchBytes > -1) {
this.emitterManager = EmitterManager.load(tikaConfigPath);
} else {
@@ -759,30 +822,37 @@
this.emitterManager = null;
}
this.autoDetectParser = new AutoDetectParser(this.tikaConfig);
- if (((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig()
- .getDigesterFactory() != null) {
- this.digester = ((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig()
- .getDigesterFactory().build();
- //override this value because we'll be digesting before parse
- ((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig().getDigesterFactory()
+ if (((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig().getDigesterFactory()
+ != null) {
+ this.digester =
+ ((AutoDetectParser) autoDetectParser)
+ .getAutoDetectParserConfig()
+ .getDigesterFactory()
+ .build();
+ // override this value because we'll be digesting before parse
+ ((AutoDetectParser) autoDetectParser)
+ .getAutoDetectParserConfig()
+ .getDigesterFactory()
.setSkipContainerDocument(true);
- //if the user hasn't configured an embedded document extractor, set up the
+ // if the user hasn't configured an embedded document extractor, set up the
// RUnpackExtractorFactory
- if (((AutoDetectParser) autoDetectParser).getAutoDetectParserConfig()
- .getEmbeddedDocumentExtractorFactory() == null) {
+ if (((AutoDetectParser) autoDetectParser)
+ .getAutoDetectParserConfig()
+ .getEmbeddedDocumentExtractorFactory()
+ == null) {
((AutoDetectParser) autoDetectParser)
- .getAutoDetectParserConfig().setEmbeddedDocumentExtractorFactory(
- new RUnpackExtractorFactory());
+ .getAutoDetectParserConfig()
+ .setEmbeddedDocumentExtractorFactory(new RUnpackExtractorFactory());
}
}
this.detector = ((AutoDetectParser) this.autoDetectParser).getDetector();
this.rMetaParser = new RecursiveParserWrapper(autoDetectParser);
}
-
private void writeIntermediate(EmitKey emitKey, Metadata metadata) {
try {
- UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get();
+ UnsynchronizedByteArrayOutputStream bos =
+ UnsynchronizedByteArrayOutputStream.builder().get();
try (ObjectOutputStream objectOutputStream = new ObjectOutputStream(bos)) {
objectOutputStream.writeObject(metadata);
}
@@ -795,7 +865,8 @@
private void write(EmitData emitData) {
try {
- UnsynchronizedByteArrayOutputStream bos = UnsynchronizedByteArrayOutputStream.builder().get();
+ UnsynchronizedByteArrayOutputStream bos =
+ UnsynchronizedByteArrayOutputStream.builder().get();
try (ObjectOutputStream objectOutputStream = new ObjectOutputStream(bos)) {
objectOutputStream.writeObject(emitData);
}
@@ -838,8 +909,9 @@
final List<Metadata> metadataList;
final Optional<EmbeddedDocumentBytesHandler> embeddedDocumentBytesHandler;
- public MetadataListAndEmbeddedBytes(List<Metadata> metadataList,
- EmbeddedDocumentBytesHandler embeddedDocumentBytesHandler) {
+ public MetadataListAndEmbeddedBytes(
+ List<Metadata> metadataList,
+ EmbeddedDocumentBytesHandler embeddedDocumentBytesHandler) {
this.metadataList = metadataList;
this.embeddedDocumentBytesHandler = Optional.ofNullable(embeddedDocumentBytesHandler);
}
@@ -853,8 +925,8 @@
}
/**
- * This tests whether there's any type of embedded document store
- * ...that, for example, may require closing at the end of the parse.
+ * This tests whether there's any type of embedded document store ...that, for example, may
+ * require closing at the end of the parse.
*
* @return
*/
@@ -863,15 +935,16 @@
}
/**
- * If the intent is that the metadata and byte store be packaged in a zip
- * or similar and emitted via a single stream emitter.
- * <p>
- * This is basically a test that this is not an EmbeddedDocumentEmitterStore.
+ * If the intent is that the metadata and byte store be packaged in a zip or similar and
+ * emitted via a single stream emitter.
+ *
+ * <p>This is basically a test that this is not an EmbeddedDocumentEmitterStore.
*
* @return
*/
public boolean toBePackagedForStreamEmitter() {
- return !(embeddedDocumentBytesHandler.get() instanceof EmittingEmbeddedDocumentBytesHandler);
+ return !(embeddedDocumentBytesHandler.get()
+ instanceof EmittingEmbeddedDocumentBytesHandler);
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java
index bc55cca..29d8c2e 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncConfig.java
@@ -20,7 +20,6 @@
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.pipes.PipesConfigBase;
import org.apache.tika.pipes.PipesReporter;
@@ -53,9 +52,8 @@
}
/**
- * If nothing has been emitted in this amount of time
- * and the {@link #getEmitMaxEstimatedBytes()} has not been reached yet,
- * emit what's in the emit queue.
+ * If nothing has been emitted in this amount of time and the {@link
+ * #getEmitMaxEstimatedBytes()} has not been reached yet, emit what's in the emit queue.
*
* @param emitWithinMillis
*/
@@ -64,8 +62,9 @@
}
/**
- * When the emit queue hits this estimated size (sum of
- * estimated extract sizes), emit the batch.
+ * When the emit queue hits this estimated size (sum of estimated extract sizes), emit the
+ * batch.
+ *
* @return
*/
public long getEmitMaxEstimatedBytes() {
@@ -76,13 +75,13 @@
this.emitMaxEstimatedBytes = emitMaxEstimatedBytes;
}
-
public void setNumEmitters(int numEmitters) {
this.numEmitters = numEmitters;
}
/**
* FetchEmitTuple queue size
+ *
* @return
*/
public int getQueueSize() {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java
index fce65c5..25d2c67 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncEmitter.java
@@ -26,20 +26,15 @@
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.pipes.emitter.EmitData;
import org.apache.tika.pipes.emitter.Emitter;
import org.apache.tika.pipes.emitter.EmitterManager;
import org.apache.tika.pipes.emitter.TikaEmitterException;
import org.apache.tika.utils.ExceptionUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
-/**
- * Worker thread that takes EmitData off the queue, batches it
- * and tries to emit it as a batch
- */
+/** Worker thread that takes EmitData off the queue, batches it and tries to emit it as a batch */
public class AsyncEmitter implements Callable<Integer> {
static final EmitData EMIT_DATA_STOP_SEMAPHORE = new EmitData(null, null);
@@ -53,8 +48,10 @@
Instant lastEmitted = Instant.now();
- public AsyncEmitter(AsyncConfig asyncConfig, ArrayBlockingQueue<EmitData> emitData,
- EmitterManager emitterManager) {
+ public AsyncEmitter(
+ AsyncConfig asyncConfig,
+ ArrayBlockingQueue<EmitData> emitData,
+ EmitterManager emitterManager) {
this.asyncConfig = asyncConfig;
this.emitDataQueue = emitData;
this.emitterManager = emitterManager;
@@ -71,17 +68,22 @@
return EMITTER_FUTURE_CODE;
}
if (emitData != null) {
- //this can block on emitAll
+ // this can block on emitAll
cache.add(emitData);
} else {
LOG.trace("Nothing on the async queue");
}
- LOG.debug("cache size: ({}) bytes and extract count: {}", cache.estimatedSize,
+ LOG.debug(
+ "cache size: ({}) bytes and extract count: {}",
+ cache.estimatedSize,
cache.size);
long elapsed = ChronoUnit.MILLIS.between(lastEmitted, Instant.now());
if (elapsed > asyncConfig.getEmitWithinMillis()) {
- LOG.debug("{} elapsed > {}, going to emitAll", elapsed, asyncConfig.getEmitWithinMillis());
- //this can block
+ LOG.debug(
+ "{} elapsed > {}, going to emitAll",
+ elapsed,
+ asyncConfig.getEmitWithinMillis());
+ // this can block
cache.emitAll();
}
}
@@ -106,11 +108,14 @@
size++;
long sz = data.getEstimatedSizeBytes();
if (estimatedSize + sz > maxBytes) {
- LOG.debug("estimated size ({}) > maxBytes({}), going to emitAll",
- (estimatedSize + sz), maxBytes);
+ LOG.debug(
+ "estimated size ({}) > maxBytes({}), going to emitAll",
+ (estimatedSize + sz),
+ maxBytes);
emitAll();
}
- List<EmitData> cached = map.computeIfAbsent(data.getEmitKey().getEmitterName(), k -> new ArrayList<>());
+ List<EmitData> cached =
+ map.computeIfAbsent(data.getEmitKey().getEmitterName(), k -> new ArrayList<>());
updateEstimatedSize(sz);
cached.add(data);
}
@@ -136,7 +141,9 @@
try {
emitter.emit(cachedEmitData);
} catch (IOException | TikaEmitterException e) {
- LOG.warn("emitter class ({}): {}", emitter.getClass(),
+ LOG.warn(
+ "emitter class ({}): {}",
+ emitter.getClass(),
ExceptionUtils.getStackTrace(e));
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java
index 3a6751f..850e773 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncProcessor.java
@@ -29,10 +29,6 @@
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.PipesClient;
@@ -44,11 +40,11 @@
import org.apache.tika.pipes.pipesiterator.PipesIterator;
import org.apache.tika.pipes.pipesiterator.TotalCountResult;
import org.apache.tika.pipes.pipesiterator.TotalCounter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * This is the main class for handling async requests. This manages
- * AsyncClients and AsyncEmitters.
- *
+ * This is the main class for handling async requests. This manages AsyncClients and AsyncEmitters.
*/
public class AsyncProcessor implements Closeable {
@@ -73,33 +69,39 @@
this(tikaConfigPath, null);
}
- public AsyncProcessor(Path tikaConfigPath, PipesIterator pipesIterator) throws TikaException, IOException {
+ public AsyncProcessor(Path tikaConfigPath, PipesIterator pipesIterator)
+ throws TikaException, IOException {
this.asyncConfig = AsyncConfig.load(tikaConfigPath);
this.fetchEmitTuples = new ArrayBlockingQueue<>(asyncConfig.getQueueSize());
this.emitData = new ArrayBlockingQueue<>(100);
- //+1 is the watcher thread
- this.executorService = Executors.newFixedThreadPool(
- asyncConfig.getNumClients() + asyncConfig.getNumEmitters() + 1);
- this.executorCompletionService =
- new ExecutorCompletionService<>(executorService);
+ // +1 is the watcher thread
+ this.executorService =
+ Executors.newFixedThreadPool(
+ asyncConfig.getNumClients() + asyncConfig.getNumEmitters() + 1);
+ this.executorCompletionService = new ExecutorCompletionService<>(executorService);
try {
- if (!tikaConfigPath.toAbsolutePath().equals(asyncConfig.getTikaConfig().toAbsolutePath())) {
- LOG.warn("TikaConfig for AsyncProcessor ({}) is different " +
- "from TikaConfig for workers ({}). If this is intended," +
- " please ignore this warning.", tikaConfigPath.toAbsolutePath(),
+ if (!tikaConfigPath
+ .toAbsolutePath()
+ .equals(asyncConfig.getTikaConfig().toAbsolutePath())) {
+ LOG.warn(
+ "TikaConfig for AsyncProcessor ({}) is different "
+ + "from TikaConfig for workers ({}). If this is intended,"
+ + " please ignore this warning.",
+ tikaConfigPath.toAbsolutePath(),
asyncConfig.getTikaConfig().toAbsolutePath());
}
- this.executorCompletionService.submit(() -> {
- while (true) {
- try {
- Thread.sleep(500);
- checkActive();
- } catch (InterruptedException e) {
- return WATCHER_FUTURE_CODE;
- }
- }
- });
- //this is run in a daemon thread
+ this.executorCompletionService.submit(
+ () -> {
+ while (true) {
+ try {
+ Thread.sleep(500);
+ checkActive();
+ } catch (InterruptedException e) {
+ return WATCHER_FUTURE_CODE;
+ }
+ }
+ });
+ // this is run in a daemon thread
if (pipesIterator != null && (pipesIterator instanceof TotalCounter)) {
LOG.debug("going to total counts");
startCounter((TotalCounter) pipesIterator);
@@ -124,23 +126,28 @@
}
private void startCounter(TotalCounter totalCounter) {
- Thread counterThread = new Thread(() -> {
- totalCounter.startTotalCount();
- PipesReporter pipesReporter = asyncConfig.getPipesReporter();
- TotalCountResult.STATUS status = totalCounter.getTotalCount().getStatus();
- while (status == TotalCountResult.STATUS.NOT_COMPLETED) {
- try {
- Thread.sleep(500);
- TotalCountResult result = totalCounter.getTotalCount();
- LOG.trace("counter total {} {} ", result.getStatus(), result.getTotalCount());
- pipesReporter.report(result);
- status = result.getStatus();
- } catch (InterruptedException e) {
- return;
- }
- }
-
- });
+ Thread counterThread =
+ new Thread(
+ () -> {
+ totalCounter.startTotalCount();
+ PipesReporter pipesReporter = asyncConfig.getPipesReporter();
+ TotalCountResult.STATUS status =
+ totalCounter.getTotalCount().getStatus();
+ while (status == TotalCountResult.STATUS.NOT_COMPLETED) {
+ try {
+ Thread.sleep(500);
+ TotalCountResult result = totalCounter.getTotalCount();
+ LOG.trace(
+ "counter total {} {} ",
+ result.getStatus(),
+ result.getTotalCount());
+ pipesReporter.report(result);
+ status = result.getStatus();
+ } catch (InterruptedException e) {
+ return;
+ }
+ }
+ });
counterThread.setDaemon(true);
counterThread.start();
}
@@ -152,8 +159,8 @@
"Can't call offer after calling close() or " + "shutdownNow()");
}
if (newFetchEmitTuples.size() > asyncConfig.getQueueSize()) {
- throw new OfferLargerThanQueueSize(newFetchEmitTuples.size(),
- asyncConfig.getQueueSize());
+ throw new OfferLargerThanQueueSize(
+ newFetchEmitTuples.size(), asyncConfig.getQueueSize());
}
long start = System.currentTimeMillis();
long elapsed = System.currentTimeMillis() - start;
@@ -163,8 +170,8 @@
fetchEmitTuples.addAll(newFetchEmitTuples);
return true;
} catch (IllegalStateException e) {
- //this means that the add all failed because the queue couldn't
- //take the full list
+ // this means that the add all failed because the queue couldn't
+ // take the full list
LOG.debug("couldn't add full list", e);
}
}
@@ -192,11 +199,14 @@
public void finished() throws InterruptedException {
for (int i = 0; i < asyncConfig.getNumClients(); i++) {
- boolean offered = fetchEmitTuples.offer(PipesIterator.COMPLETED_SEMAPHORE,
- MAX_OFFER_WAIT_MS, TimeUnit.MILLISECONDS);
- if (! offered) {
- throw new RuntimeException("Couldn't offer completed semaphore within " +
- MAX_OFFER_WAIT_MS + " ms");
+ boolean offered =
+ fetchEmitTuples.offer(
+ PipesIterator.COMPLETED_SEMAPHORE,
+ MAX_OFFER_WAIT_MS,
+ TimeUnit.MILLISECONDS);
+ if (!offered) {
+ throw new RuntimeException(
+ "Couldn't offer completed semaphore within " + MAX_OFFER_WAIT_MS + " ms");
}
}
}
@@ -208,19 +218,20 @@
try {
Integer i = future.get();
switch (i) {
- case PARSER_FUTURE_CODE :
+ case PARSER_FUTURE_CODE:
numParserThreadsFinished++;
LOG.debug("fetchEmitWorker finished, total {}", numParserThreadsFinished);
break;
- case AsyncEmitter.EMITTER_FUTURE_CODE :
+ case AsyncEmitter.EMITTER_FUTURE_CODE:
numEmitterThreadsFinished++;
LOG.debug("emitter thread finished, total {}", numEmitterThreadsFinished);
break;
- case WATCHER_FUTURE_CODE :
+ case WATCHER_FUTURE_CODE:
LOG.debug("watcher thread finished");
break;
- default :
- throw new IllegalArgumentException("Don't recognize this future code: " + i);
+ default:
+ throw new IllegalArgumentException(
+ "Don't recognize this future code: " + i);
}
} catch (ExecutionException e) {
LOG.error("execution exception", e);
@@ -228,15 +239,20 @@
throw new RuntimeException(e);
}
}
- if (numParserThreadsFinished == asyncConfig.getNumClients() && ! addedEmitterSemaphores) {
+ if (numParserThreadsFinished == asyncConfig.getNumClients() && !addedEmitterSemaphores) {
for (int i = 0; i < asyncConfig.getNumEmitters(); i++) {
try {
- boolean offered = emitData.offer(AsyncEmitter.EMIT_DATA_STOP_SEMAPHORE,
- MAX_OFFER_WAIT_MS,
- TimeUnit.MILLISECONDS);
- if (! offered) {
- throw new RuntimeException("Couldn't offer emit data stop semaphore " +
- "within " + MAX_OFFER_WAIT_MS + " ms");
+ boolean offered =
+ emitData.offer(
+ AsyncEmitter.EMIT_DATA_STOP_SEMAPHORE,
+ MAX_OFFER_WAIT_MS,
+ TimeUnit.MILLISECONDS);
+ if (!offered) {
+ throw new RuntimeException(
+ "Couldn't offer emit data stop semaphore "
+ + "within "
+ + MAX_OFFER_WAIT_MS
+ + " ms");
}
} catch (InterruptedException e) {
throw new RuntimeException(e);
@@ -244,8 +260,8 @@
}
addedEmitterSemaphores = true;
}
- return !(numParserThreadsFinished == asyncConfig.getNumClients() &&
- numEmitterThreadsFinished == asyncConfig.getNumEmitters());
+ return !(numParserThreadsFinished == asyncConfig.getNumClients()
+ && numEmitterThreadsFinished == asyncConfig.getNumEmitters());
}
@Override
@@ -264,9 +280,10 @@
private final ArrayBlockingQueue<FetchEmitTuple> fetchEmitTuples;
private final ArrayBlockingQueue<EmitData> emitDataQueue;
- private FetchEmitWorker(AsyncConfig asyncConfig,
- ArrayBlockingQueue<FetchEmitTuple> fetchEmitTuples,
- ArrayBlockingQueue<EmitData> emitDataQueue) {
+ private FetchEmitWorker(
+ AsyncConfig asyncConfig,
+ ArrayBlockingQueue<FetchEmitTuple> fetchEmitTuples,
+ ArrayBlockingQueue<EmitData> emitDataQueue) {
this.asyncConfig = asyncConfig;
this.fetchEmitTuples = fetchEmitTuples;
this.emitDataQueue = emitDataQueue;
@@ -279,7 +296,7 @@
while (true) {
FetchEmitTuple t = fetchEmitTuples.poll(1, TimeUnit.SECONDS);
if (t == null) {
- //skip
+ // skip
if (LOG.isTraceEnabled()) {
LOG.trace("null fetch emit tuple");
}
@@ -298,23 +315,30 @@
result = PipesResult.UNSPECIFIED_CRASH;
}
if (LOG.isTraceEnabled()) {
- LOG.trace("timer -- pipes client process: {} ms",
+ LOG.trace(
+ "timer -- pipes client process: {} ms",
System.currentTimeMillis() - start);
}
long offerStart = System.currentTimeMillis();
if (shouldEmit(result)) {
LOG.trace("adding result to emitter queue: " + result.getEmitData());
- boolean offered = emitDataQueue.offer(result.getEmitData(),
- MAX_OFFER_WAIT_MS,
- TimeUnit.MILLISECONDS);
- if (! offered) {
- throw new RuntimeException("Couldn't offer emit data to queue " +
- "within " + MAX_OFFER_WAIT_MS + " ms");
+ boolean offered =
+ emitDataQueue.offer(
+ result.getEmitData(),
+ MAX_OFFER_WAIT_MS,
+ TimeUnit.MILLISECONDS);
+ if (!offered) {
+ throw new RuntimeException(
+ "Couldn't offer emit data to queue "
+ + "within "
+ + MAX_OFFER_WAIT_MS
+ + " ms");
}
}
if (LOG.isTraceEnabled()) {
- LOG.trace("timer -- offered: {} ms",
+ LOG.trace(
+ "timer -- offered: {} ms",
System.currentTimeMillis() - offerStart);
}
long elapsed = System.currentTimeMillis() - start;
@@ -327,8 +351,8 @@
private boolean shouldEmit(PipesResult result) {
- if (result.getStatus() == PipesResult.STATUS.PARSE_SUCCESS ||
- result.getStatus() == PipesResult.STATUS.PARSE_SUCCESS_WITH_EXCEPTION) {
+ if (result.getStatus() == PipesResult.STATUS.PARSE_SUCCESS
+ || result.getStatus() == PipesResult.STATUS.PARSE_SUCCESS_WITH_EXCEPTION) {
return true;
}
return result.isIntermediate() && asyncConfig.isEmitIntermediateResults();
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java
index 46a58ff..ecd779e 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/async/AsyncStatus.java
@@ -19,7 +19,6 @@
import java.time.Instant;
import java.util.HashMap;
import java.util.Map;
-
import org.apache.tika.pipes.PipesResult;
import org.apache.tika.pipes.pipesiterator.TotalCountResult;
import org.apache.tika.utils.StringUtils;
@@ -31,10 +30,12 @@
COMPLETED,
CRASHED
}
+
private final Instant started;
private Instant lastUpdate;
- private TotalCountResult totalCountResult = new TotalCountResult(0, TotalCountResult.STATUS.NOT_COMPLETED);
+ private TotalCountResult totalCountResult =
+ new TotalCountResult(0, TotalCountResult.STATUS.NOT_COMPLETED);
private Map<PipesResult.STATUS, Long> statusCounts = new HashMap<>();
private ASYNC_STATUS asyncStatus = ASYNC_STATUS.STARTED;
@@ -45,8 +46,10 @@
lastUpdate = started;
}
- public synchronized void update(Map<PipesResult.STATUS, Long> statusCounts,
- TotalCountResult totalCountResult, ASYNC_STATUS status) {
+ public synchronized void update(
+ Map<PipesResult.STATUS, Long> statusCounts,
+ TotalCountResult totalCountResult,
+ ASYNC_STATUS status) {
this.lastUpdate = Instant.now();
this.statusCounts = statusCounts;
this.totalCountResult = totalCountResult;
@@ -83,8 +86,20 @@
@Override
public String toString() {
- return "AsyncStatus{" + "started=" + started + ", lastUpdate=" + lastUpdate +
- ", totalCountResult=" + totalCountResult + ", statusCounts=" + statusCounts +
- ", asyncStatus=" + asyncStatus + ", crashMessage='" + crashMessage + '\'' + '}';
+ return "AsyncStatus{"
+ + "started="
+ + started
+ + ", lastUpdate="
+ + lastUpdate
+ + ", totalCountResult="
+ + totalCountResult
+ + ", statusCounts="
+ + statusCounts
+ + ", asyncStatus="
+ + asyncStatus
+ + ", crashMessage='"
+ + crashMessage
+ + '\''
+ + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java b/tika-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java
index da96c80..0896af6 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/async/OfferLargerThanQueueSize.java
@@ -27,8 +27,7 @@
@Override
public String getMessage() {
- return "sizeOffered (" + sizeOffered + ") is greater than queue size (" +
- queueSize + ")";
+ return "sizeOffered (" + sizeOffered + ") is greater than queue size (" + queueSize + ")";
}
public int getQueueSize() {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java
index 648e094..13312e0 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/AbstractEmitter.java
@@ -33,9 +33,9 @@
}
/**
- * The default behavior is to call {@link #emit(String, List)} on each item.
- * Some implementations, e.g. Solr/ES/vespa, can benefit from subclassing this and
- * emitting a bunch of docs at once.
+ * The default behavior is to call {@link #emit(String, List)} on each item. Some
+ * implementations, e.g. Solr/ES/vespa, can benefit from subclassing this and emitting a bunch
+ * of docs at once.
*
* @param emitData
* @throws IOException
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java
index 95376a9..b56f4e9 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitData.java
@@ -18,14 +18,11 @@
import java.io.Serializable;
import java.util.List;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.utils.StringUtils;
public class EmitData implements Serializable {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -3861669115439125268L;
private final EmitKey emitKey;
@@ -40,8 +37,8 @@
public EmitData(EmitKey emitKey, List<Metadata> metadataList, String containerStackTrace) {
this.emitKey = emitKey;
this.metadataList = metadataList;
- this.containerStackTrace = (containerStackTrace == null) ? StringUtils.EMPTY :
- containerStackTrace;
+ this.containerStackTrace =
+ (containerStackTrace == null) ? StringUtils.EMPTY : containerStackTrace;
}
public EmitKey getEmitKey() {
@@ -57,11 +54,12 @@
}
public long getEstimatedSizeBytes() {
- return estimateSizeInBytes(getEmitKey().getEmitKey(), getMetadataList(), containerStackTrace);
+ return estimateSizeInBytes(
+ getEmitKey().getEmitKey(), getMetadataList(), containerStackTrace);
}
- private static long estimateSizeInBytes(String id, List<Metadata> metadataList,
- String containerStackTrace) {
+ private static long estimateSizeInBytes(
+ String id, List<Metadata> metadataList, String containerStackTrace) {
long sz = 36 + id.length() * 2;
sz += 36 + containerStackTrace.length() * 2;
for (Metadata m : metadataList) {
@@ -77,7 +75,14 @@
@Override
public String toString() {
- return "EmitData{" + "emitKey=" + emitKey + ", metadataList=" + metadataList +
- ", containerStackTrace='" + containerStackTrace + '\'' + '}';
+ return "EmitData{"
+ + "emitKey="
+ + emitKey
+ + ", metadataList="
+ + metadataList
+ + ", containerStackTrace='"
+ + containerStackTrace
+ + '\''
+ + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java
index e570064..8ab3b95 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitKey.java
@@ -21,18 +21,15 @@
public class EmitKey implements Serializable {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -3861669115439125268L;
private String emitterName;
private String emitKey;
- //for serialization only...yuck.
- public EmitKey() {
+ // for serialization only...yuck.
+ public EmitKey() {}
- }
public EmitKey(String emitterName, String emitKey) {
this.emitterName = emitterName;
this.emitKey = emitKey;
@@ -48,8 +45,14 @@
@Override
public String toString() {
- return "EmitterKey{" + "emitterName='" + emitterName + '\'' + ", emitterKey='" + emitKey +
- '\'' + '}';
+ return "EmitterKey{"
+ + "emitterName='"
+ + emitterName
+ + '\''
+ + ", emitterKey='"
+ + emitKey
+ + '\''
+ + '}';
}
@Override
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java
index f60ef3b..0c15ec0 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/Emitter.java
@@ -18,7 +18,6 @@
import java.io.IOException;
import java.util.List;
-
import org.apache.tika.metadata.Metadata;
public interface Emitter {
@@ -28,7 +27,7 @@
void emit(String emitKey, List<Metadata> metadataList) throws IOException, TikaEmitterException;
void emit(List<? extends EmitData> emitData) throws IOException, TikaEmitterException;
- //TODO -- add this later for xhtml?
- //void emit(String txt, Metadata metadata) throws IOException, TikaException;
+ // TODO -- add this later for xhtml?
+ // void emit(String txt, Metadata metadata) throws IOException, TikaException;
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java
index 7d1aba1..c1245b8 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmitterManager.java
@@ -24,32 +24,26 @@
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
-
import org.apache.tika.config.ConfigBase;
import org.apache.tika.exception.TikaConfigException;
/**
- * Utility class that will apply the appropriate fetcher
- * to the fetcherString based on the prefix.
- * <p>
- * This does not allow multiple fetchers supporting the same prefix.
+ * Utility class that will apply the appropriate fetcher to the fetcherString based on the prefix.
+ *
+ * <p>This does not allow multiple fetchers supporting the same prefix.
*/
public class EmitterManager extends ConfigBase {
private final Map<String, Emitter> emitterMap = new ConcurrentHashMap<>();
public static EmitterManager load(Path tikaConfigPath) throws IOException, TikaConfigException {
- try (InputStream is = Files.newInputStream(tikaConfigPath) ) {
+ try (InputStream is = Files.newInputStream(tikaConfigPath)) {
return EmitterManager.buildComposite(
- "emitters", EmitterManager.class,
- "emitter",
- Emitter.class, is);
+ "emitters", EmitterManager.class, "emitter", Emitter.class, is);
}
}
- private EmitterManager() {
-
- }
+ private EmitterManager() {}
public EmitterManager(List<Emitter> emitters) {
for (Emitter emitter : emitters) {
@@ -58,7 +52,6 @@
"Multiple emitters cannot support the same name: " + emitter.getName());
}
emitterMap.put(emitter.getName(), emitter);
-
}
}
@@ -66,7 +59,6 @@
return emitterMap.keySet();
}
-
public Emitter getEmitter(String emitterName) {
Emitter emitter = emitterMap.get(emitterName);
if (emitter == null) {
@@ -76,9 +68,10 @@
}
/**
- * Convenience method that returns an emitter if only one emitter
- * is specified in the tika-config file. If 0 or > 1 emitters
- * are specified, this throws an IllegalArgumentException.
+ * Convenience method that returns an emitter if only one emitter is specified in the
+ * tika-config file. If 0 or > 1 emitters are specified, this throws an
+ * IllegalArgumentException.
+ *
* @return
*/
public Emitter getEmitter() {
@@ -86,13 +79,13 @@
throw new IllegalArgumentException("emitters size must == 1 for the no arg call");
}
if (emitterMap.size() > 1) {
- throw new IllegalArgumentException("need to specify 'emitterName' if > 1 emitters are" +
- " available");
+ throw new IllegalArgumentException(
+ "need to specify 'emitterName' if > 1 emitters are" + " available");
}
for (Emitter emitter : emitterMap.values()) {
return emitter;
}
- //this should be unreachable?!
+ // this should be unreachable?!
throw new IllegalArgumentException("emitters size must == 0");
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java
index b77107b..ee98511 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/EmptyEmitter.java
@@ -18,7 +18,6 @@
import java.io.IOException;
import java.util.List;
-
import org.apache.tika.metadata.Metadata;
public class EmptyEmitter implements Emitter {
@@ -30,12 +29,8 @@
@Override
public void emit(String emitKey, List<Metadata> metadataList)
- throws IOException, TikaEmitterException {
-
- }
+ throws IOException, TikaEmitterException {}
@Override
- public void emit(List<? extends EmitData> emitData) throws IOException, TikaEmitterException {
-
- }
+ public void emit(List<? extends EmitData> emitData) throws IOException, TikaEmitterException {}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java b/tika-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java
index 10526eb..4876c80 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/emitter/StreamEmitter.java
@@ -18,7 +18,6 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.metadata.Metadata;
public interface StreamEmitter extends Emitter {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
index 071de05..7d6bc87 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmbeddedDocumentBytesConfig.java
@@ -21,16 +21,15 @@
public class EmbeddedDocumentBytesConfig implements Serializable {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -3861669115439125268L;
-
public static EmbeddedDocumentBytesConfig SKIP = new EmbeddedDocumentBytesConfig(false);
public enum SUFFIX_STRATEGY {
- NONE, EXISTING, DETECTED;
+ NONE,
+ EXISTING,
+ DETECTED;
public static SUFFIX_STRATEGY parse(String s) {
if (s.equalsIgnoreCase("none")) {
@@ -43,6 +42,7 @@
throw new IllegalArgumentException("can't parse " + s);
}
}
+
private final boolean extractEmbeddedDocumentBytes;
private int zeroPadName = 0;
@@ -56,9 +56,8 @@
private boolean includeOriginal = false;
/**
- * Create an EmbeddedDocumentBytesConfig with
- * {@link EmbeddedDocumentBytesConfig#extractEmbeddedDocumentBytes}
- * set to <code>true</code>
+ * Create an EmbeddedDocumentBytesConfig with {@link
+ * EmbeddedDocumentBytesConfig#extractEmbeddedDocumentBytes} set to <code>true</code>
*/
public EmbeddedDocumentBytesConfig() {
this.extractEmbeddedDocumentBytes = true;
@@ -118,11 +117,22 @@
@Override
public String toString() {
- return "EmbeddedDocumentBytesConfig{" + "extractEmbeddedDocumentBytes=" +
- extractEmbeddedDocumentBytes + ", zeroPadName=" + zeroPadName +
- ", suffixStrategy=" + suffixStrategy + ", embeddedIdPrefix='" + embeddedIdPrefix +
- '\'' + ", emitter='" + emitter + '\'' + ", includeOriginal=" + includeOriginal +
- '}';
+ return "EmbeddedDocumentBytesConfig{"
+ + "extractEmbeddedDocumentBytes="
+ + extractEmbeddedDocumentBytes
+ + ", zeroPadName="
+ + zeroPadName
+ + ", suffixStrategy="
+ + suffixStrategy
+ + ", embeddedIdPrefix='"
+ + embeddedIdPrefix
+ + '\''
+ + ", emitter='"
+ + emitter
+ + '\''
+ + ", includeOriginal="
+ + includeOriginal
+ + '}';
}
@Override
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
index 1132a4b..92a51b5 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/extractor/EmittingEmbeddedDocumentBytesHandler.java
@@ -19,9 +19,7 @@
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.commons.io.IOExceptionWithCause;
-
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.extractor.AbstractEmbeddedDocumentBytesHandler;
import org.apache.tika.metadata.Metadata;
@@ -37,26 +35,30 @@
private final StreamEmitter emitter;
private static final Metadata METADATA = new Metadata();
- public EmittingEmbeddedDocumentBytesHandler(EmitKey containerEmitKey,
- EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig,
- EmitterManager emitterManager) throws TikaConfigException {
+
+ public EmittingEmbeddedDocumentBytesHandler(
+ EmitKey containerEmitKey,
+ EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig,
+ EmitterManager emitterManager)
+ throws TikaConfigException {
this.containerEmitKey = containerEmitKey;
this.embeddedDocumentBytesConfig = embeddedDocumentBytesConfig;
- Emitter tmpEmitter =
- emitterManager.getEmitter(embeddedDocumentBytesConfig.getEmitter());
- if (! (tmpEmitter instanceof StreamEmitter)) {
- throw new TikaConfigException("Emitter " +
- embeddedDocumentBytesConfig.getEmitter()
- + " must implement a StreamEmitter");
+ Emitter tmpEmitter = emitterManager.getEmitter(embeddedDocumentBytesConfig.getEmitter());
+ if (!(tmpEmitter instanceof StreamEmitter)) {
+ throw new TikaConfigException(
+ "Emitter "
+ + embeddedDocumentBytesConfig.getEmitter()
+ + " must implement a StreamEmitter");
}
this.emitter = (StreamEmitter) tmpEmitter;
}
@Override
public void add(int id, Metadata metadata, InputStream inputStream) throws IOException {
- //intentionally do not call super.add, because we want the ids list to be empty
- String emitKey = getEmitKey(containerEmitKey.getEmitKey(),
- id, embeddedDocumentBytesConfig, metadata);
+ // intentionally do not call super.add, because we want the ids list to be empty
+ String emitKey =
+ getEmitKey(
+ containerEmitKey.getEmitKey(), id, embeddedDocumentBytesConfig, metadata);
try {
emitter.emit(emitKey, inputStream, METADATA);
} catch (TikaEmitterException e) {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java
index 0b417e3..76fde46 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/AbstractFetcher.java
@@ -18,14 +18,11 @@
import org.apache.tika.config.Field;
-
public abstract class AbstractFetcher implements Fetcher {
private String name;
- public AbstractFetcher() {
-
- }
+ public AbstractFetcher() {}
public AbstractFetcher(String name) {
this.name = name;
@@ -40,5 +37,4 @@
public void setName(String name) {
this.name = name;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java
index 022d00a..bd72416 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/EmptyFetcher.java
@@ -18,7 +18,6 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java
index 148e353..3f1d204 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetchKey.java
@@ -20,13 +20,11 @@
import java.util.Objects;
/**
- * Pair of fetcherName (which fetcher to call) and the key
- * to send to that fetcher to retrieve a specific file.
+ * Pair of fetcherName (which fetcher to call) and the key to send to that fetcher to retrieve a
+ * specific file.
*/
public class FetchKey implements Serializable {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -3861669115439125268L;
private String fetcherName;
@@ -34,10 +32,8 @@
private long rangeStart = -1;
private long rangeEnd = -1;
- //this is for serialization...yuck
- public FetchKey() {
-
- }
+ // this is for serialization...yuck
+ public FetchKey() {}
public FetchKey(String fetcherName, String fetchKey) {
this(fetcherName, fetchKey, -1, -1);
@@ -79,9 +75,10 @@
return false;
}
FetchKey fetchKey1 = (FetchKey) o;
- return rangeStart == fetchKey1.rangeStart && rangeEnd == fetchKey1.rangeEnd &&
- Objects.equals(fetcherName, fetchKey1.fetcherName) &&
- Objects.equals(fetchKey, fetchKey1.fetchKey);
+ return rangeStart == fetchKey1.rangeStart
+ && rangeEnd == fetchKey1.rangeEnd
+ && Objects.equals(fetcherName, fetchKey1.fetcherName)
+ && Objects.equals(fetchKey, fetchKey1.fetchKey);
}
@Override
@@ -91,7 +88,17 @@
@Override
public String toString() {
- return "FetchKey{" + "fetcherName='" + fetcherName + '\'' + ", fetchKey='" + fetchKey +
- '\'' + ", rangeStart=" + rangeStart + ", rangeEnd=" + rangeEnd + '}';
+ return "FetchKey{"
+ + "fetcherName='"
+ + fetcherName
+ + '\''
+ + ", fetchKey='"
+ + fetchKey
+ + '\''
+ + ", rangeStart="
+ + rangeStart
+ + ", rangeEnd="
+ + rangeEnd
+ + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java
index 1b3fa2a..dea2467 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/Fetcher.java
@@ -18,16 +18,14 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
/**
- * Interface for an object that will fetch an InputStream given
- * a fetch string. This will also update the metadata object
- * based on the fetch.
- * <p>
- * Implementations of Fetcher must be thread safe.
+ * Interface for an object that will fetch an InputStream given a fetch string. This will also
+ * update the metadata object based on the fetch.
+ *
+ * <p>Implementations of Fetcher must be thread safe.
*/
public interface Fetcher {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java
index 40121f9..21fcc41 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherManager.java
@@ -24,25 +24,24 @@
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
-
import org.apache.tika.config.ConfigBase;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
/**
* Utility class to hold multiple fetchers.
- * <p>
- * This forbids multiple fetchers supporting the same name.
+ *
+ * <p>This forbids multiple fetchers supporting the same name.
*/
public class FetcherManager extends ConfigBase {
public static FetcherManager load(Path p) throws IOException, TikaConfigException {
- try (InputStream is =
- Files.newInputStream(p)) {
- return FetcherManager.buildComposite("fetchers", FetcherManager.class,
- "fetcher", Fetcher.class, is);
+ try (InputStream is = Files.newInputStream(p)) {
+ return FetcherManager.buildComposite(
+ "fetchers", FetcherManager.class, "fetcher", Fetcher.class, is);
}
}
+
private final Map<String, Fetcher> fetcherMap = new ConcurrentHashMap<>();
public FetcherManager(List<Fetcher> fetchers) throws TikaConfigException {
@@ -63,8 +62,10 @@
Fetcher fetcher = fetcherMap.get(fetcherName);
if (fetcher == null) {
throw new IllegalArgumentException(
- "Can't find fetcher for fetcherName: " + fetcherName + ". I've loaded: " +
- fetcherMap.keySet());
+ "Can't find fetcher for fetcherName: "
+ + fetcherName
+ + ". I've loaded: "
+ + fetcherMap.keySet());
}
return fetcher;
}
@@ -74,9 +75,9 @@
}
/**
- * Convenience method that returns a fetcher if only one fetcher
- * is specified in the tika-config file. If 0 or > 1 fetchers
- * are specified, this throws an IllegalArgumentException.
+ * Convenience method that returns a fetcher if only one fetcher is specified in the tika-config
+ * file. If 0 or > 1 fetchers are specified, this throws an IllegalArgumentException.
+ *
* @return
*/
public Fetcher getFetcher() {
@@ -84,13 +85,13 @@
throw new IllegalArgumentException("fetchers size must == 1 for the no arg call");
}
if (fetcherMap.size() > 1) {
- throw new IllegalArgumentException("need to specify 'fetcherName' if > 1 fetchers are" +
- " available");
+ throw new IllegalArgumentException(
+ "need to specify 'fetcherName' if > 1 fetchers are" + " available");
}
for (Fetcher fetcher : fetcherMap.values()) {
return fetcher;
}
- //this should be unreachable?!
+ // this should be unreachable?!
throw new IllegalArgumentException("fetchers size must == 0");
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java
index a07439a..97d3972 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/FetcherStringException.java
@@ -18,9 +18,7 @@
import org.apache.tika.exception.TikaException;
-/**
- * If something goes wrong in parsing the fetcher string
- */
+/** If something goes wrong in parsing the fetcher string */
public class FetcherStringException extends TikaException {
public FetcherStringException(String msg) {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java
index 0a3ceae..1679897 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/RangeFetcher.java
@@ -18,17 +18,13 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-/**
- * This class extracts a range of bytes from a given fetch key.
- */
+/** This class extracts a range of bytes from a given fetch key. */
public interface RangeFetcher extends Fetcher {
- //At some point, Tika 3.x?, we may want to add optional ranges to the fetchKey?
+ // At some point, Tika 3.x?, we may want to add optional ranges to the fetchKey?
InputStream fetch(String fetchKey, long startOffset, long endOffset, Metadata metadata)
throws TikaException, IOException;
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java
index d926e3c..8bf2f0b 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcher.java
@@ -26,10 +26,6 @@
import java.nio.file.attribute.FileTime;
import java.util.Date;
import java.util.Map;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -42,18 +38,22 @@
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.pipes.fetcher.AbstractFetcher;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class FileSystemFetcher extends AbstractFetcher implements Initializable {
private static final Logger LOG = LoggerFactory.getLogger(FileSystemFetcher.class);
- //Warning! basePath can be null!
+ // Warning! basePath can be null!
private Path basePath = null;
private boolean extractFileSystemMetadata = false;
static boolean isDescendant(Path root, Path descendant) {
- return descendant.toAbsolutePath().normalize()
+ return descendant
+ .toAbsolutePath()
+ .normalize()
.startsWith(root.toAbsolutePath().normalize());
}
@@ -61,9 +61,10 @@
public InputStream fetch(String fetchKey, Metadata metadata) throws IOException, TikaException {
if (fetchKey.contains("\u0000")) {
- throw new IllegalArgumentException("Path must not contain \u0000. " +
- "Please review the life decisions that led you to requesting " +
- "a file name with this character in it.");
+ throw new IllegalArgumentException(
+ "Path must not contain \u0000. "
+ + "Please review the life decisions that led you to requesting "
+ + "a file name with this character in it.");
}
Path p = null;
if (basePath != null) {
@@ -91,14 +92,14 @@
}
private void updateFileSystemMetadata(Path p, Metadata metadata) throws IOException {
- if (! extractFileSystemMetadata) {
+ if (!extractFileSystemMetadata) {
return;
}
BasicFileAttributes attrs = Files.readAttributes(p, BasicFileAttributes.class);
updateFileTime(FileSystem.CREATED, attrs.creationTime(), metadata);
updateFileTime(FileSystem.MODIFIED, attrs.lastModifiedTime(), metadata);
updateFileTime(FileSystem.ACCESSED, attrs.lastAccessTime(), metadata);
- //TODO extract owner or group?
+ // TODO extract owner or group?
}
private void updateFileTime(Property property, FileTime fileTime, Metadata metadata) {
@@ -109,7 +110,6 @@
}
/**
- *
* @return the basePath or <code>null</code> if no base path was set
*/
public Path getBasePath() {
@@ -117,9 +117,8 @@
}
/**
- * Default behavior si that clients will send in relative paths, this
- * must be set to allow this fetcher to fetch the
- * full path.
+ * Default behavior is that clients will send in relative paths; this must be set to allow this
+ * fetcher to fetch the full path.
*
* @param basePath
*/
@@ -129,8 +128,8 @@
}
/**
- * Extract file system metadata (created, modified, accessed) when fetching file.
- * The default is <code>false</code>.
+ * Extract file system metadata (created, modified, accessed) when fetching file. The default is
+ * <code>false</code>.
*
* @param extractFileSystemMetadata
*/
@@ -141,29 +140,33 @@
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
- //no-op
+ // no-op
}
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
throws TikaConfigException {
if (basePath == null || basePath.toString().trim().length() == 0) {
- LOG.warn("'basePath' has not been set. " +
- "This means that client code or clients can read from any file that this " +
- "process has permissions to read. If you are running tika-server, make " +
- "absolutely certain that you've locked down " +
- "access to tika-server and file-permissions for the tika-server process.");
+ LOG.warn(
+ "'basePath' has not been set. "
+ + "This means that client code or clients can read from any file that this "
+ + "process has permissions to read. If you are running tika-server, make "
+ + "absolutely certain that you've locked down "
+ + "access to tika-server and file-permissions for the tika-server process.");
return;
}
if (basePath.toString().startsWith("http://")) {
- throw new TikaConfigException("FileSystemFetcher only works with local file systems. " +
- " Please use the tika-fetcher-http module for http calls");
+ throw new TikaConfigException(
+ "FileSystemFetcher only works with local file systems. "
+ + " Please use the tika-fetcher-http module for http calls");
} else if (basePath.toString().startsWith("ftp://")) {
- throw new TikaConfigException("FileSystemFetcher only works with local file systems. " +
- " Please consider contributing an ftp fetcher module");
+ throw new TikaConfigException(
+ "FileSystemFetcher only works with local file systems. "
+ + " Please consider contributing an ftp fetcher module");
} else if (basePath.toString().startsWith("s3://")) {
- throw new TikaConfigException("FileSystemFetcher only works with local file systems. " +
- " Please use the tika-fetcher-s3 module");
+ throw new TikaConfigException(
+ "FileSystemFetcher only works with local file systems. "
+ + " Please use the tika-fetcher-s3 module");
}
if (basePath.toAbsolutePath().toString().contains("\u0000")) {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java
index f415a35..d791952 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/fetcher/url/UrlFetcher.java
@@ -20,33 +20,31 @@
import java.io.InputStream;
import java.net.URL;
import java.util.Locale;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.fetcher.AbstractFetcher;
/**
- * Simple fetcher for URLs. This simply calls {@link TikaInputStream#get(URL)}.
- * This intentionally does not support fetching for files.
- * Please use the FileSystemFetcher for that. If you need more advanced control (passwords,
- * timeouts, proxies, etc), please use the tika-fetcher-http module.
+ * Simple fetcher for URLs. This simply calls {@link TikaInputStream#get(URL)}. This intentionally
+ * does not support fetching for files. Please use the FileSystemFetcher for that. If you need more
+ * advanced control (passwords, timeouts, proxies, etc), please use the tika-fetcher-http module.
*/
public class UrlFetcher extends AbstractFetcher {
@Override
public InputStream fetch(String fetchKey, Metadata metadata) throws IOException, TikaException {
if (fetchKey.contains("\u0000")) {
- throw new IllegalArgumentException("URL must not contain \u0000. " +
- "Please review the life decisions that led you to requesting " +
- "a URL with this character in it.");
+ throw new IllegalArgumentException(
+ "URL must not contain \u0000. "
+ + "Please review the life decisions that led you to requesting "
+ + "a URL with this character in it.");
}
if (fetchKey.toLowerCase(Locale.US).trim().startsWith("file:")) {
throw new IllegalArgumentException(
- "The UrlFetcher does not fetch from file shares; " +
- "please use the FileSystemFetcher");
+ "The UrlFetcher does not fetch from file shares; "
+ + "please use the FileSystemFetcher");
}
return TikaInputStream.get(new URL(fetchKey), metadata);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java
index a60784f..b415e6f 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/CallablePipesIterator.java
@@ -20,12 +20,11 @@
import java.util.concurrent.Callable;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
-
import org.apache.tika.pipes.FetchEmitTuple;
/**
- * This is a simple wrapper around {@link PipesIterator}
- * that allows it to be called in its own thread.
+ * This is a simple wrapper around {@link PipesIterator} that allows it to be called in its own
+ * thread.
*/
public class CallablePipesIterator implements Callable<Long> {
@@ -37,48 +36,50 @@
private final int numConsumers;
/**
- * This sets timeoutMillis to -1, meaning that
- * this will block forever trying to add fetchemittuples to the queue.
- * This sets the number of {@link PipesIterator#COMPLETED_SEMAPHORE} to 1.
- * This means that your consumers must put the semaphore back in the queue
- * after they finish.
+ * This sets timeoutMillis to -1, meaning that this will block forever trying to add
+ * FetchEmitTuples to the queue. This sets the number of {@link
+ * PipesIterator#COMPLETED_SEMAPHORE} to 1. This means that your consumers must put the
+ * semaphore back in the queue after they finish.
*
* @param pipesIterator
* @param queue
*/
- public CallablePipesIterator(PipesIterator pipesIterator,
- ArrayBlockingQueue<FetchEmitTuple> queue) {
+ public CallablePipesIterator(
+ PipesIterator pipesIterator, ArrayBlockingQueue<FetchEmitTuple> queue) {
this(pipesIterator, queue, -1);
}
/**
- * This sets the number of {@link PipesIterator#COMPLETED_SEMAPHORE} to 1.
- * This means that your consumers must put the semaphore back in the queue
- * after they finish.
+ * This sets the number of {@link PipesIterator#COMPLETED_SEMAPHORE} to 1. This means that your
+ * consumers must put the semaphore back in the queue after they finish.
+ *
* @param pipesIterator underlying pipes iterator to use
* @param queue queue to add the fetch emit tuples to
- * @param timeoutMillis how long to try to offer the fetch emit tuples to the queue. If -1,
- * this will block with {@link ArrayBlockingQueue#put(Object)} forever.
- */
- public CallablePipesIterator(PipesIterator pipesIterator,
- ArrayBlockingQueue<FetchEmitTuple> queue, long timeoutMillis) {
+ * @param timeoutMillis how long to try to offer the fetch emit tuples to the queue. If -1, this
+ * will block with {@link ArrayBlockingQueue#put(Object)} forever.
+ */
+ public CallablePipesIterator(
+ PipesIterator pipesIterator,
+ ArrayBlockingQueue<FetchEmitTuple> queue,
+ long timeoutMillis) {
this(pipesIterator, queue, timeoutMillis, 1);
}
/**
- *
* @param pipesIterator underlying pipes iterator to use
* @param queue queue to add the fetch emit tuples to
- * @param timeoutMillis how long to try to offer the fetch emit tuples to the queue. If -1,
- * this will block with {@link ArrayBlockingQueue#put(Object)} forever.
- * @param numConsumers how many {@link PipesIterator#COMPLETED_SEMAPHORE} to add to the
- * queue. If the consumers are adding this back to the queue when they
- * find it, then this should be set to 1, otherwise, for a single semaphore
- * for each consumer, set this to the number of consumers
+ * @param timeoutMillis how long to try to offer the fetch emit tuples to the queue. If -1, this
+ * will block with {@link ArrayBlockingQueue#put(Object)} forever.
+ * @param numConsumers how many {@link PipesIterator#COMPLETED_SEMAPHORE} to add to the queue.
+ * If the consumers are adding this back to the queue when they find it, then this should be
+ * set to 1, otherwise, for a single semaphore for each consumer, set this to the number of
+ * consumers
*/
- public CallablePipesIterator(PipesIterator pipesIterator,
- ArrayBlockingQueue<FetchEmitTuple> queue, long timeoutMillis,
- int numConsumers) {
+ public CallablePipesIterator(
+ PipesIterator pipesIterator,
+ ArrayBlockingQueue<FetchEmitTuple> queue,
+ long timeoutMillis,
+ int numConsumers) {
this.pipesIterator = pipesIterator;
this.queue = queue;
this.timeoutMillis = timeoutMillis;
@@ -91,21 +92,24 @@
if (timeoutMillis > 0) {
for (FetchEmitTuple t : pipesIterator) {
boolean offered = queue.offer(t, timeoutMillis, TimeUnit.MILLISECONDS);
- if (! offered) {
+ if (!offered) {
throw new TimeoutException("timed out trying to offer tuple");
}
added++;
}
for (int i = 0; i < numConsumers; i++) {
- boolean offered = queue.offer(PipesIterator.COMPLETED_SEMAPHORE, timeoutMillis,
- TimeUnit.MILLISECONDS);
+ boolean offered =
+ queue.offer(
+ PipesIterator.COMPLETED_SEMAPHORE,
+ timeoutMillis,
+ TimeUnit.MILLISECONDS);
if (!offered) {
- throw new TimeoutException("timed out trying to offer the completed " +
- "semaphore");
+ throw new TimeoutException(
+ "timed out trying to offer the completed " + "semaphore");
}
}
} else {
- //blocking!
+ // blocking!
for (FetchEmitTuple t : pipesIterator) {
queue.put(t);
added++;
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
index 34706f7..a615a45 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/PipesIterator.java
@@ -28,10 +28,6 @@
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.config.ConfigBase;
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
@@ -42,22 +38,23 @@
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.HandlerConfig;
import org.apache.tika.sax.BasicContentHandlerFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
- * Abstract class that handles the testing for timeouts/thread safety
- * issues. Concrete classes implement the blocking {@link #enqueue()}.
- * If there's an exception in the enqueuing thread, this will throw
- * a RuntimeException. It will throw an IllegalStateException if
- * next() is called after hasNext() has returned false.
+ * Abstract class that handles the testing for timeouts/thread safety issues. Concrete classes
+ * implement the blocking {@link #enqueue()}. If there's an exception in the enqueuing thread, this
+ * will throw a RuntimeException. It will throw an IllegalStateException if next() is called after
+ * hasNext() has returned false.
*/
public abstract class PipesIterator extends ConfigBase
- implements Callable<Integer>, Iterable<FetchEmitTuple>, Initializable {
+ implements Callable<Integer>, Iterable<FetchEmitTuple>, Initializable {
public static final long DEFAULT_MAX_WAIT_MS = 300_000;
public static final int DEFAULT_QUEUE_SIZE = 1000;
public static final FetchEmitTuple COMPLETED_SEMAPHORE =
- new FetchEmitTuple(null,null, null, null, null, null);
+ new FetchEmitTuple(null, null, null, null, null, null);
private static final Logger LOGGER = LoggerFactory.getLogger(PipesIterator.class);
@@ -80,12 +77,9 @@
private int added = 0;
private FutureTask<Integer> futureTask;
- public static PipesIterator build(Path tikaConfigFile) throws IOException,
- TikaConfigException {
+ public static PipesIterator build(Path tikaConfigFile) throws IOException, TikaConfigException {
try (InputStream is = Files.newInputStream(tikaConfigFile)) {
- return buildSingle(
- "pipesIterator",
- PipesIterator.class, is);
+ return buildSingle("pipesIterator", PipesIterator.class, is);
}
}
@@ -138,8 +132,9 @@
@Field
public void setHandlerType(String handlerType) {
- this.handlerType = BasicContentHandlerFactory
- .parseHandlerType(handlerType, BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
+ this.handlerType =
+ BasicContentHandlerFactory.parseHandlerType(
+ handlerType, BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
}
@Field
@@ -173,9 +168,9 @@
}
protected HandlerConfig getHandlerConfig() {
- //TODO: make throwOnWriteLimitReached configurable
- return new HandlerConfig(handlerType, parseMode, writeLimit, maxEmbeddedResources,
- throwOnWriteLimitReached);
+ // TODO: make throwOnWriteLimitReached configurable
+ return new HandlerConfig(
+ handlerType, parseMode, writeLimit, maxEmbeddedResources, throwOnWriteLimitReached);
}
protected abstract void enqueue() throws IOException, TimeoutException, InterruptedException;
@@ -190,13 +185,13 @@
@Override
public void initialize(Map<String, Param> params) throws TikaConfigException {
- //no-op
+ // no-op
}
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
throws TikaConfigException {
- //no-op
+ // no-op
}
@Override
@@ -255,11 +250,10 @@
}
/**
- * this checks to make sure that the thread hasn't terminated early.
- * Will return true if the thread has successfully completed or if
- * it has not completed. Will return false if there has been a thread
- * interrupt. Will throw a RuntimeException if there's been
- * an execution exception in the thread.
+ * this checks to make sure that the thread hasn't terminated early. Will return true if the
+ * thread has successfully completed or if it has not completed. Will return false if there
+ * has been a thread interrupt. Will throw a RuntimeException if there's been an execution
+ * exception in the thread.
*/
private void checkThreadOk() throws InterruptedException {
if (futureTask.isDone()) {
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java
index 8ab7086..e230d76 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/TotalCounter.java
@@ -17,27 +17,26 @@
package org.apache.tika.pipes.pipesiterator;
/**
- * Interface for pipesiterators that allow counting of total
- * documents. This is useful for user-facing frontends where
- * the user does not have easy access to the total number of files
- * for processing.
+ * Interface for pipesiterators that allow counting of total documents. This is useful for
+ * user-facing frontends where the user does not have easy access to the total number of files for
+ * processing.
*
- * This is run in a daemon thread and is not guaranteed to complete before
- * the actual file processing has completed.
+ * <p>This is run in a daemon thread and is not guaranteed to complete before the actual file
+ * processing has completed.
*
- * This is an ancillary task, and should not throw runtime exceptions.
+ * <p>This is an ancillary task, and should not throw runtime exceptions.
*
- * Implementers should be careful to check for thread interrupts.
- *
+ * <p>Implementers should be careful to check for thread interrupts.
*/
public interface TotalCounter {
void startTotalCount();
/**
- * Returns the total count so far. Check the {@link TotalCountResult#getStatus()}
- * to figure out if the count has completed yet, if it is unsupported or if
- * there was an exception during the counting.
+ * Returns the total count so far. Check the {@link TotalCountResult#getStatus()} to figure out
+ * if the count has completed yet, if it is unsupported or if there was an exception during the
+ * counting.
+ *
 * @return the current {@link TotalCountResult}; inspect its status to see whether counting has
 *     completed, is unsupported, or hit an exception
*/
TotalCountResult getTotalCount();
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java
index 90cabe8..27ed359 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIterator.java
@@ -23,7 +23,6 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.concurrent.TimeoutException;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -37,45 +36,46 @@
import org.apache.tika.utils.StringUtils;
/**
- * Reads a list of file names/relative paths from a UTF-8 file.
- * One file name/relative path per line. This path is used for the fetch key,
- * the id and the emit key. If you need more customized control of the keys/ids,
- * consider using the jdbc pipes iterator or the csv pipes iterator.
+ * Reads a list of file names/relative paths from a UTF-8 file. One file name/relative path per
+ * line. This path is used for the fetch key, the id and the emit key. If you need more customized
+ * control of the keys/ids, consider using the jdbc pipes iterator or the csv pipes iterator.
*
- * Skips empty lines and lines starting with '#'
- *
- *
+ * <p>Skips empty lines and lines starting with '#'
*/
public class FileListPipesIterator extends PipesIterator implements Initializable {
- @Field
- private String fileList;
+ @Field private String fileList;
- @Field
- private boolean hasHeader = false;
+ @Field private boolean hasHeader = false;
private Path fileListPath;
@Override
protected void enqueue() throws IOException, TimeoutException, InterruptedException {
- try (BufferedReader reader = Files.newBufferedReader(fileListPath, StandardCharsets.UTF_8)) {
+ try (BufferedReader reader =
+ Files.newBufferedReader(fileListPath, StandardCharsets.UTF_8)) {
if (hasHeader) {
reader.readLine();
}
String line = reader.readLine();
while (line != null) {
- if (! line.startsWith("#") && !StringUtils.isBlank(line)) {
+ if (!line.startsWith("#") && !StringUtils.isBlank(line)) {
FetchKey fetchKey = new FetchKey(getFetcherName(), line);
EmitKey emitKey = new EmitKey(getEmitterName(), line);
- tryToAdd(new FetchEmitTuple(line, fetchKey, emitKey,
- new Metadata(), getHandlerConfig(), getOnParseException()));
+ tryToAdd(
+ new FetchEmitTuple(
+ line,
+ fetchKey,
+ emitKey,
+ new Metadata(),
+ getHandlerConfig(),
+ getOnParseException()));
}
line = reader.readLine();
}
}
}
-
@Field
public void setFileList(String path) {
this.fileList = path;
@@ -89,15 +89,18 @@
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
throws TikaConfigException {
- //these should all be fatal
+ // these should all be fatal
TikaConfig.mustNotBeEmpty("fileList", fileList);
TikaConfig.mustNotBeEmpty("fetcherName", getFetcherName());
 TikaConfig.mustNotBeEmpty("emitterName", getEmitterName());
fileListPath = Paths.get(fileList);
if (!Files.isRegularFile(fileListPath)) {
- throw new TikaConfigException("file list " + fileList + " does not exist. " +
- "Must specify an existing file");
+ throw new TikaConfigException(
+ "file list "
+ + fileList
+ + " does not exist. "
+ + "Must specify an existing file");
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java
index 9e903fd..509e1b2 100644
--- a/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java
+++ b/tika-core/src/main/java/org/apache/tika/pipes/pipesiterator/fs/FileSystemPipesIterator.java
@@ -27,10 +27,6 @@
import java.util.Map;
import java.util.concurrent.TimeoutException;
import java.util.concurrent.atomic.AtomicLong;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -45,20 +41,20 @@
import org.apache.tika.pipes.pipesiterator.PipesIterator;
import org.apache.tika.pipes.pipesiterator.TotalCountResult;
import org.apache.tika.pipes.pipesiterator.TotalCounter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class FileSystemPipesIterator extends PipesIterator
implements TotalCounter, Initializable, Closeable {
 private static final Logger LOG = LoggerFactory.getLogger(FileSystemPipesIterator.class);
-
private Path basePath;
private boolean countTotal = false;
private FileCountWorker fileCountWorker;
- public FileSystemPipesIterator() {
- }
+ public FileSystemPipesIterator() {}
public FileSystemPipesIterator(Path basePath) {
this.basePath = basePath;
@@ -87,11 +83,10 @@
}
}
-
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
throws TikaConfigException {
- //these should all be fatal
+ // these should all be fatal
TikaConfig.mustNotBeEmpty("basePath", basePath);
TikaConfig.mustNotBeEmpty("fetcherName", getFetcherName());
 TikaConfig.mustNotBeEmpty("emitterName", getEmitterName());
@@ -108,9 +103,10 @@
public void setCountTotal(boolean countTotal) {
this.countTotal = countTotal;
}
+
@Override
public void startTotalCount() {
- if (! countTotal) {
+ if (!countTotal) {
return;
}
fileCountWorker.startTotalCount();
@@ -118,7 +114,7 @@
@Override
public TotalCountResult getTotalCount() {
- if (! countTotal) {
+ if (!countTotal) {
return TotalCountResult.UNSUPPORTED;
}
return fileCountWorker.getTotalCount();
@@ -152,9 +148,14 @@
String relPath = basePath.relativize(file).toString();
try {
- tryToAdd(new FetchEmitTuple(relPath, new FetchKey(fetcherName, relPath),
- new EmitKey(emitterName, relPath), new Metadata(), getHandlerConfig(),
- getOnParseException()));
+ tryToAdd(
+ new FetchEmitTuple(
+ relPath,
+ new FetchKey(fetcherName, relPath),
+ new EmitKey(emitterName, relPath),
+ new Metadata(),
+ getHandlerConfig(),
+ getOnParseException()));
} catch (TimeoutException e) {
throw new IOException(e);
} catch (InterruptedException e) {
@@ -174,7 +175,6 @@
}
}
-
private static class FileCountWorker implements TotalCounter, Closeable {
private Thread totalCounterThread;
@@ -191,17 +191,19 @@
@Override
public void startTotalCount() {
- totalCounterThread = new Thread(() -> {
- try {
- Files.walkFileTree(basePath, new FSFileCounter(totalCount));
- status = TotalCountResult.STATUS.COMPLETED;
- finalResult = new TotalCountResult(totalCount.get(), status);
- } catch (IOException e) {
- LOG.warn("problem counting files", e);
- status = TotalCountResult.STATUS.EXCEPTION;
- finalResult = new TotalCountResult(totalCount.get(), status);
- }
- });
+ totalCounterThread =
+ new Thread(
+ () -> {
+ try {
+ Files.walkFileTree(basePath, new FSFileCounter(totalCount));
+ status = TotalCountResult.STATUS.COMPLETED;
+ finalResult = new TotalCountResult(totalCount.get(), status);
+ } catch (IOException e) {
+ LOG.warn("problem counting files", e);
+ status = TotalCountResult.STATUS.EXCEPTION;
+ finalResult = new TotalCountResult(totalCount.get(), status);
+ }
+ });
totalCounterThread.setDaemon(true);
totalCounterThread.start();
}
@@ -222,6 +224,7 @@
private class FSFileCounter implements FileVisitor<Path> {
private final AtomicLong count;
+
private FSFileCounter(AtomicLong count) {
this.count = count;
}
@@ -233,7 +236,8 @@
}
@Override
- public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
+ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
+ throws IOException {
count.incrementAndGet();
return FileVisitResult.CONTINUE;
}
@@ -244,7 +248,8 @@
}
@Override
- public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
+ public FileVisitResult postVisitDirectory(Path dir, IOException exc)
+ throws IOException {
return FileVisitResult.CONTINUE;
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
index a98d39c..f06d355 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/CompositeRenderer.java
@@ -13,7 +13,7 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
- */package org.apache.tika.renderer;
+ */
+package org.apache.tika.renderer;
import java.io.IOException;
import java.io.InputStream;
@@ -23,7 +23,6 @@
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
-
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.config.Param;
@@ -54,14 +53,16 @@
}
rendererMap = Collections.unmodifiableMap(tmp);
}
+
@Override
public Set<MediaType> getSupportedTypes(ParseContext context) {
return rendererMap.keySet();
}
@Override
- public RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
- RenderRequest... requests) throws IOException, TikaException {
+ public RenderResults render(
+ InputStream is, Metadata metadata, ParseContext parseContext, RenderRequest... requests)
+ throws IOException, TikaException {
String mediaTypeString = metadata.get(TikaCoreProperties.TYPE);
if (mediaTypeString == null) {
@@ -81,20 +82,16 @@
public Renderer getLeafRenderer(MediaType mt) {
return rendererMap.get(mt);
}
- @Override
- public void initialize(Map<String, Param> params) throws TikaConfigException {
- }
+ @Override
+ public void initialize(Map<String, Param> params) throws TikaConfigException {}
@Override
public void checkInitialization(InitializableProblemHandler problemHandler)
- throws TikaConfigException {
-
- }
+ throws TikaConfigException {}
private static List<Renderer> getDefaultRenderers(ServiceLoader loader) {
- List<Renderer> staticRenderers =
- loader.loadStaticServiceProviders(Renderer.class);
+ List<Renderer> staticRenderers = loader.loadStaticServiceProviders(Renderer.class);
ServiceLoaderUtils.sortLoadedClasses(staticRenderers);
return staticRenderers;
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
index d80ff7c..a803f0b 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/PageBasedRenderResults.java
@@ -20,7 +20,6 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.metadata.TikaPagedText;
@@ -31,6 +30,7 @@
public PageBasedRenderResults(TemporaryResources tmp) {
super(tmp);
}
+
public void add(RenderResult result) {
Integer page = result.getMetadata().getInt(TikaPagedText.PAGE_NUMBER);
if (page != null) {
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java b/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java
index 2534d70..4cfbf97 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/PageRangeRequest.java
@@ -18,9 +18,7 @@
import java.util.Objects;
-/**
- * The range of pages to render. These are 1-based, and "to" is inclusive.
- */
+/** The range of pages to render. These are 1-based, and "to" is inclusive. */
public class PageRangeRequest implements RenderRequest {
 public static final PageRangeRequest RENDER_ALL = new PageRangeRequest(1, -1);
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
index 3277d86..caaeee9 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderRequest.java
@@ -17,11 +17,9 @@
package org.apache.tika.renderer;
/**
- * Empty interface for requests to a renderer. Different
- * file formats and different use cases will have different types of requests.
- * For page based, it could be a page range (render the full pages from 2 to 5);
- * or it could be a single page with an x-y bounding box. For video files,
- * it could be a temporal offset or a temporal offset with an x-y bounding box.
+ * Empty interface for requests to a renderer. Different file formats and different use cases will
+ * have different types of requests. For page based, it could be a page range (render the full pages
+ * from 2 to 5); or it could be a single page with an x-y bounding box. For video files, it could be
+ * a temporal offset or a temporal offset with an x-y bounding box.
*/
-public interface RenderRequest {
-}
+public interface RenderRequest {}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
index 3fd8d7d..cc2aab6 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResult.java
@@ -21,7 +21,6 @@
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -33,14 +32,15 @@
EXCEPTION,
TIMEOUT
}
+
private final STATUS status;
private final int id;
private final Object result;
- //TODO: we're relying on metadata to bring in a bunch of info.
- //Might be cleaner to add specific parameters for page number, embedded path, etc.?
+ // TODO: we're relying on metadata to bring in a bunch of info.
+ // Might be cleaner to add specific parameters for page number, embedded path, etc.?
private final Metadata metadata;
TemporaryResources tmp = new TemporaryResources();
@@ -51,12 +51,13 @@
this.result = result;
this.metadata = metadata;
if (result instanceof Path) {
- tmp.addResource(new Closeable() {
- @Override
- public void close() throws IOException {
- Files.delete((Path)result);
- }
- });
+ tmp.addResource(
+ new Closeable() {
+ @Override
+ public void close() throws IOException {
+ Files.delete((Path) result);
+ }
+ });
} else if (result instanceof Closeable) {
tmp.addResource((Closeable) result);
}
@@ -64,7 +65,7 @@
public InputStream getInputStream() throws IOException {
if (result instanceof Path) {
- return TikaInputStream.get((Path)result, metadata);
+ return TikaInputStream.get((Path) result, metadata);
} else {
TikaInputStream tis = TikaInputStream.get(new byte[0]);
tis.setOpenContainer(result);
@@ -88,5 +89,4 @@
public void close() throws IOException {
tmp.close();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
index 108c062..7e1643e 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderResults.java
@@ -20,7 +20,6 @@
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
-
import org.apache.tika.io.TemporaryResources;
public class RenderResults implements Closeable {
@@ -28,9 +27,11 @@
private List<RenderResult> results = new ArrayList<>();
private final TemporaryResources tmp;
+
public RenderResults(TemporaryResources tmp) {
this.tmp = tmp;
}
+
public void add(RenderResult result) {
tmp.addResource(result);
results.add(result);
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
index bc4261f..2272ca2 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/Renderer.java
@@ -20,24 +20,20 @@
import java.io.InputStream;
import java.io.Serializable;
import java.util.Set;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
/**
- * Interface for a renderer. This should be flexible enough to run on the initial design: PDF pages
+ * Interface for a renderer. This should be flexible enough to run on the initial design: PDF pages
* but also on portions of PDF pages as well as on other document types.
- *
*/
public interface Renderer extends Serializable {
-
-
/**
- * Returns the set of media types supported by this renderer when used
- * with the given parse context.
+ * Returns the set of media types supported by this renderer when used with the given parse
+ * context.
*
* @param context parse context
* @return immutable set of media types
@@ -45,9 +41,9 @@
*/
Set<MediaType> getSupportedTypes(ParseContext context);
- RenderResults render(InputStream is, Metadata metadata, ParseContext parseContext,
- RenderRequest ... requests) throws IOException,
- TikaException;
+ RenderResults render(
+ InputStream is, Metadata metadata, ParseContext parseContext, RenderRequest... requests)
+ throws IOException, TikaException;
/*
At some point, we might need/want to add something like this, where for a given
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
index ed82500..38591a3 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingState.java
@@ -17,10 +17,7 @@
package org.apache.tika.renderer;
/**
- * This should be to track state for each file (embedded or otherwise).
- * This should be reset in the parseContext at the beginning of a parse
- * and then replaced at the end of the parse.
+ * This should be to track state for each file (embedded or otherwise). This should be reset in the
+ * parseContext at the beginning of a parse and then replaced at the end of the parse.
*/
-public class RenderingState {
-
-}
+public class RenderingState {}
diff --git a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
index 2e31432..e20c00d 100644
--- a/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
+++ b/tika-core/src/main/java/org/apache/tika/renderer/RenderingTracker.java
@@ -17,12 +17,11 @@
package org.apache.tika.renderer;
/**
- * Use this in the ParseContext to keep track of unique ids for rendered
- * images in embedded docs. This should be used for the full parse of
- * a main document and its embedded document.
+ * Use this in the ParseContext to keep track of unique ids for rendered images in embedded docs.
+ * This should be used for the full parse of a main document and its embedded document.
*
- * This is different from RenderingState, which is used to track
- * rendering per file/per embedded doc.
+ * <p>This is different from RenderingState, which is used to track rendering per file/per embedded
+ * doc.
*/
public class RenderingTracker {
diff --git a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
index d423009..8753977 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/AbstractRecursiveParserWrapperHandler.java
@@ -19,26 +19,26 @@
import java.io.OutputStream;
import java.io.Serializable;
import java.nio.charset.Charset;
-
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-
/**
- * This is a special handler to be used only with the
- * {@link org.apache.tika.parser.RecursiveParserWrapper}.
- * It allows for finer-grained processing of embedded documents than in the legacy handlers.
- * Subclasses can choose how to process individual embedded documents.
+ * This is a special handler to be used only with the {@link
+ * org.apache.tika.parser.RecursiveParserWrapper}. It allows for finer-grained processing of
+ * embedded documents than in the legacy handlers. Subclasses can choose how to process individual
+ * embedded documents.
*/
public abstract class AbstractRecursiveParserWrapperHandler extends DefaultHandler
implements Serializable {
- public final static Property EMBEDDED_RESOURCE_LIMIT_REACHED = Property.internalBoolean(
- TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_resource_limit_reached");
+ public static final Property EMBEDDED_RESOURCE_LIMIT_REACHED =
+ Property.internalBoolean(
+ TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX
+ + "embedded_resource_limit_reached");
private static final int MAX_DEPTH = 100;
private final ContentHandlerFactory contentHandlerFactory;
private final int maxEmbeddedResources;
@@ -49,8 +49,8 @@
this(contentHandlerFactory, -1);
}
- public AbstractRecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory,
- int maxEmbeddedResources) {
+ public AbstractRecursiveParserWrapperHandler(
+ ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) {
this.contentHandlerFactory = contentHandlerFactory;
this.maxEmbeddedResources = maxEmbeddedResources;
}
@@ -64,12 +64,12 @@
}
/**
- * This is called before parsing each embedded document. Override this
- * for custom behavior. Make sure to call this in your custom classes
- * because this tracks the number of embedded documents.
+ * This is called before parsing each embedded document. Override this for custom behavior. Make
+ * sure to call this in your custom classes because this tracks the number of embedded
+ * documents.
*
* @param contentHandler local handler to be used on this embedded document
- * @param metadata embedded document's metadata
+ * @param metadata embedded document's metadata
*/
public void startEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
throws SAXException {
@@ -82,11 +82,11 @@
}
/**
- * This is called after parsing each embedded document. Override this
- * for custom behavior. This is currently a no-op.
+ * This is called after parsing each embedded document. Override this for custom behavior. This
+ * is currently a no-op.
*
* @param contentHandler content handler that was used on this embedded document
- * @param metadata metadata for this embedded document
+ * @param metadata metadata for this embedded document
* @throws SAXException
*/
public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
@@ -95,13 +95,12 @@
}
/**
- * This is called after the full parse has completed. Override this
- * for custom behavior. Make sure to call this as <code>super.endDocument(...)</code>
- * in subclasses because this adds whether or not the embedded resource
- * maximum has been hit to the metadata.
+ * This is called after the full parse has completed. Override this for custom behavior. Make
+ * sure to call this as <code>super.endDocument(...)</code> in subclasses because this adds
+ * whether or not the embedded resource maximum has been hit to the metadata.
*
* @param contentHandler content handler that was used on the main document
- * @param metadata metadata that was gathered for the main document
+ * @param metadata metadata that was gathered for the main document
* @throws SAXException
*/
public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException {
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
index 361b781..110e115 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BasicContentHandlerFactory.java
@@ -16,21 +16,16 @@
*/
package org.apache.tika.sax;
-
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.util.Locale;
-
+import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.parser.ParseContext;
-
-/**
- * Basic factory for creating common types of ContentHandlers
- */
+/** Basic factory for creating common types of ContentHandlers */
public class BasicContentHandlerFactory implements ContentHandlerFactory, WriteLimiter {
private final HANDLER_TYPE type;
@@ -42,45 +37,46 @@
/**
* Create a BasicContentHandlerFactory with {@link #throwOnWriteLimitReached} is true
- * @param type basic type of handler
- * @param writeLimit max number of characters to store; if < 0,
- * the handler will store all characters
+ *
+ * @param type basic type of handler
+ * @param writeLimit max number of characters to store; if < 0, the handler will store all
+ * characters
*/
public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit) {
this(type, writeLimit, true, null);
}
/**
- *
* @param type basic type of handler
* @param writeLimit maximum number of characters to store
- * @param throwOnWriteLimitReached whether or not to throw a
- * {@link org.apache.tika.exception.WriteLimitReachedException}
- * when the write limit has been reached
- * @param parseContext to store the writelimitreached warning if
- * throwOnWriteLimitReached is set to <code>false</code>
+ * @param throwOnWriteLimitReached whether or not to throw a {@link
+ * org.apache.tika.exception.WriteLimitReachedException} when the write limit has been
+ * reached
+ * @param parseContext to store the writelimitreached warning if throwOnWriteLimitReached is set
+ * to <code>false</code>
*/
- public BasicContentHandlerFactory(HANDLER_TYPE type, int writeLimit,
- boolean throwOnWriteLimitReached, ParseContext parseContext) {
+ public BasicContentHandlerFactory(
+ HANDLER_TYPE type,
+ int writeLimit,
+ boolean throwOnWriteLimitReached,
+ ParseContext parseContext) {
this.type = type;
this.writeLimit = writeLimit;
this.throwOnWriteLimitReached = throwOnWriteLimitReached;
this.parseContext = parseContext;
if (throwOnWriteLimitReached == false && parseContext == null) {
- throw new IllegalArgumentException("parse context must not be null if " +
- "throwOnWriteLimitReached is false");
+ throw new IllegalArgumentException(
+ "parse context must not be null if " + "throwOnWriteLimitReached is false");
}
-
}
/**
- * Tries to parse string into handler type. Returns default if string is null or
- * parse fails.
- * <p/>
- * Options: xml, html, text, body, ignore (no content)
+ * Tries to parse string into handler type. Returns default if string is null or parse fails.
+ *
+ * <p>Options: xml, html, text, body, ignore (no content)
*
* @param handlerTypeName string to parse
- * @param defaultType type to return if parse fails
+ * @param defaultType type to return if parse fails
* @return handler type
*/
public static HANDLER_TYPE parseHandlerType(String handlerTypeName, HANDLER_TYPE defaultType) {
@@ -112,8 +108,11 @@
if (type == HANDLER_TYPE.BODY) {
return new BodyContentHandler(
- new WriteOutContentHandler(new ToTextContentHandler(), writeLimit,
- throwOnWriteLimitReached, parseContext));
+ new WriteOutContentHandler(
+ new ToTextContentHandler(),
+ writeLimit,
+ throwOnWriteLimitReached,
+ parseContext));
} else if (type == HANDLER_TYPE.IGNORE) {
return new DefaultHandler();
}
@@ -121,8 +120,8 @@
if (writeLimit < 0) {
return formatHandler;
}
- return new WriteOutContentHandler(formatHandler, writeLimit, throwOnWriteLimitReached,
- parseContext);
+ return new WriteOutContentHandler(
+ formatHandler, writeLimit, throwOnWriteLimitReached, parseContext);
}
private ContentHandler getFormatHandler() {
@@ -176,7 +175,6 @@
return new ToXMLContentHandler(os, charset.name());
default:
return new ToTextContentHandler(os, charset.name());
-
}
}
} catch (UnsupportedEncodingException e) {
@@ -191,12 +189,13 @@
return type;
}
- /**
- * Common handler types for content.
- */
+ /** Common handler types for content. */
public enum HANDLER_TYPE {
- BODY, IGNORE, //don't store content
- TEXT, HTML, XML
+ BODY,
+ IGNORE, // don't store content
+ TEXT,
+ HTML,
+ XML
}
public int getWriteLimit() {
diff --git a/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java
index dfdecb8..de614b8 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/BodyContentHandler.java
@@ -17,35 +17,28 @@
package org.apache.tika.sax;
import java.io.Writer;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.sax.xpath.Matcher;
import org.apache.tika.sax.xpath.MatchingContentHandler;
import org.apache.tika.sax.xpath.XPathParser;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * Content handler decorator that only passes everything inside
- * the XHTML <body/> tag to the underlying handler. Note that
- * the <body/> tag itself is <em>not</em> passed on.
+ * Content handler decorator that only passes everything inside the XHTML <body/> tag to the
+ * underlying handler. Note that the <body/> tag itself is <em>not</em> passed on.
*/
public class BodyContentHandler extends ContentHandlerDecorator {
- /**
- * XHTML XPath parser.
- */
+ /** XHTML XPath parser. */
private static final XPathParser PARSER = new XPathParser("xhtml", XHTMLContentHandler.XHTML);
- /**
- * The XPath matcher used to select the XHTML body contents.
- */
+ /** The XPath matcher used to select the XHTML body contents. */
private static final Matcher MATCHER =
PARSER.parse("/xhtml:html/xhtml:body/descendant::node()");
/**
- * Creates a content handler that passes all XHTML body events to the
- * given underlying content handler.
+ * Creates a content handler that passes all XHTML body events to the given underlying content
+ * handler.
*
* @param handler content handler
*/
@@ -54,8 +47,7 @@
}
/**
- * Creates a content handler that writes XHTML body character events to
- * the given writer.
+ * Creates a content handler that writes XHTML body character events to the given writer.
*
* @param writer writer
*/
@@ -64,15 +56,14 @@
}
/**
- * Creates a content handler that writes XHTML body character events to
- * an internal string buffer. The contents of the buffer can be retrieved
- * using the {@link #toString()} method.
- * <p>
- * The internal string buffer is bounded at the given number of characters.
- * If this write limit is reached, then a {@link SAXException} is thrown.
+ * Creates a content handler that writes XHTML body character events to an internal string
+ * buffer. The contents of the buffer can be retrieved using the {@link #toString()} method.
*
- * @param writeLimit maximum number of characters to include in the string,
- * or -1 to disable the write limit
+ * <p>The internal string buffer is bounded at the given number of characters. If this write
+ * limit is reached, then a {@link SAXException} is thrown.
+ *
+ * @param writeLimit maximum number of characters to include in the string, or -1 to disable the
+ * write limit
* @since Apache Tika 0.7
*/
public BodyContentHandler(int writeLimit) {
@@ -80,15 +71,13 @@
}
/**
- * Creates a content handler that writes XHTML body character events to
- * an internal string buffer. The contents of the buffer can be retrieved
- * using the {@link #toString()} method.
- * <p>
- * The internal string buffer is bounded at 100k characters. If this write
- * limit is reached, then a {@link SAXException} is thrown.
+ * Creates a content handler that writes XHTML body character events to an internal string
+ * buffer. The contents of the buffer can be retrieved using the {@link #toString()} method.
+ *
+ * <p>The internal string buffer is bounded at 100k characters. If this write limit is reached,
+ * then a {@link SAXException} is thrown.
*/
public BodyContentHandler() {
this(new WriteOutContentHandler());
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java b/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java
index 6e6ddcd..6a33073 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/CleanPhoneText.java
@@ -22,136 +22,232 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-/**
- * Class to help de-obfuscate phone numbers in text.
- */
+/** Class to help de-obfuscate phone numbers in text. */
public class CleanPhoneText {
public static final String[][][] cleanSubstitutions =
- new String[][][]{{{"&#\\d{1,3};", ""}}, // first simply remove numeric entities
- {{"th0usand", "thousand"}, // handle common misspellings
- {"th1rteen", "thirteen"}, {"f0urteen", "fourteen"},
- {"e1ghteen", "eighteen"}, {"n1neteen", "nineteen"},
- {"f1fteen", "fifteen"}, {"s1xteen", "sixteen"}, {"th1rty", "thirty"},
- {"e1ghty", "eighty"}, {"n1nety", "ninety"}, {"fourty", "forty"},
- {"f0urty", "forty"}, {"e1ght", "eight"}, {"f0rty", "forty"},
- {"f1fty", "fifty"}, {"s1xty", "sixty"}, {"zer0", "zero"},
- {"f0ur", "four"}, {"f1ve", "five"}, {"n1ne", "nine"}, {"0ne", "one"},
- {"tw0", "two"}, {"s1x", "six"}},
- // mixed compound numeral words
- // consider 7teen, etc.
- {{"twenty[\\W_]{0,3}1", "twenty-one"}, {"twenty[\\W_]{0,3}2", "twenty-two"},
- {"twenty[\\W_]{0,3}3", "twenty-three"},
- {"twenty[\\W_]{0,3}4", "twenty-four"},
- {"twenty[\\W_]{0,3}5", "twenty-five"},
- {"twenty[\\W_]{0,3}6", "twenty-six"},
- {"twenty[\\W_]{0,3}7", "twenty-seven"},
- {"twenty[\\W_]{0,3}8", "twenty-eight"},
- {"twenty[\\W_]{0,3}9", "twenty-nine"},
- {"thirty[\\W_]{0,3}1", "thirty-one"},
- {"thirty[\\W_]{0,3}2", "thirty-two"},
- {"thirty[\\W_]{0,3}3", "thirty-three"},
- {"thirty[\\W_]{0,3}4", "thirty-four"},
- {"thirty[\\W_]{0,3}5", "thirty-five"},
- {"thirty[\\W_]{0,3}6", "thirty-six"},
- {"thirty[\\W_]{0,3}7", "thirty-seven"},
- {"thirty[\\W_]{0,3}8", "thirty-eight"},
- {"thirty[\\W_]{0,3}9", "thirty-nine"},
- {"forty[\\W_]{0,3}1", "forty-one"}, {"forty[\\W_]{0,3}2", "forty-two"},
- {"forty[\\W_]{0,3}3", "forty-three"},
- {"forty[\\W_]{0,3}4", "forty-four"},
- {"forty[\\W_]{0,3}5", "forty-five"}, {"forty[\\W_]{0,3}6", "forty-six"},
- {"forty[\\W_]{0,3}7", "forty-seven"},
- {"forty[\\W_]{0,3}8", "forty-eight"},
- {"forty[\\W_]{0,3}9", "forty-nine"}, {"fifty[\\W_]{0,3}1", "fifty-one"},
- {"fifty[\\W_]{0,3}2", "fifty-two"},
- {"fifty[\\W_]{0,3}3", "fifty-three"},
- {"fifty[\\W_]{0,3}4", "fifty-four"},
- {"fifty[\\W_]{0,3}5", "fifty-five"}, {"fifty[\\W_]{0,3}6", "fifty-six"},
- {"fifty[\\W_]{0,3}7", "fifty-seven"},
- {"fifty[\\W_]{0,3}8", "fifty-eight"},
- {"fifty[\\W_]{0,3}9", "fifty-nine"}, {"sixty[\\W_]{0,3}1", "sixty-one"},
- {"sixty[\\W_]{0,3}2", "sixty-two"},
- {"sixty[\\W_]{0,3}3", "sixty-three"},
- {"sixty[\\W_]{0,3}4", "sixty-four"},
- {"sixty[\\W_]{0,3}5", "sixty-five"}, {"sixty[\\W_]{0,3}6", "sixty-six"},
- {"sixty[\\W_]{0,3}7", "sixty-seven"},
- {"sixty[\\W_]{0,3}8", "sixty-eight"},
- {"sixty[\\W_]{0,3}9", "sixty-nine"},
- {"seventy[\\W_]{0,3}1", "seventy-one"},
- {"seventy[\\W_]{0,3}2", "seventy-two"},
- {"seventy[\\W_]{0,3}3", "seventy-three"},
- {"seventy[\\W_]{0,3}4", "seventy-four"},
- {"seventy[\\W_]{0,3}5", "seventy-five"},
- {"seventy[\\W_]{0,3}6", "seventy-six"},
- {"seventy[\\W_]{0,3}7", "seventy-seven"},
- {"seventy[\\W_]{0,3}8", "seventy-eight"},
- {"seventy[\\W_]{0,3}9", "seventy-nine"},
- {"eighty[\\W_]{0,3}1", "eighty-one"},
- {"eighty[\\W_]{0,3}2", "eighty-two"},
- {"eighty[\\W_]{0,3}3", "eighty-three"},
- {"eighty[\\W_]{0,3}4", "eighty-four"},
- {"eighty[\\W_]{0,3}5", "eighty-five"},
- {"eighty[\\W_]{0,3}6", "eighty-six"},
- {"eighty[\\W_]{0,3}7", "eighty-seven"},
- {"eighty[\\W_]{0,3}8", "eighty-eight"},
- {"eighty[\\W_]{0,3}9", "eighty-nine"},
- {"ninety[\\W_]{0,3}1", "ninety-one"},
- {"ninety[\\W_]{0,3}2", "ninety-two"},
- {"ninety[\\W_]{0,3}3", "ninety-three"},
- {"ninety[\\W_]{0,3}4", "ninety-four"},
- {"ninety[\\W_]{0,3}5", "ninety-five"},
- {"ninety[\\W_]{0,3}6", "ninety-six"},
- {"ninety[\\W_]{0,3}7", "ninety-seven"},
- {"ninety[\\W_]{0,3}8", "ninety-eight"},
- {"ninety[\\W_]{0,3}9", "ninety-nine"}},
- // now resolve compound numeral words
- {{"twenty-one", "21"}, {"twenty-two", "22"}, {"twenty-three", "23"},
- {"twenty-four", "24"}, {"twenty-five", "25"}, {"twenty-six", "26"},
- {"twenty-seven", "27"}, {"twenty-eight", "28"}, {"twenty-nine", "29"},
- {"thirty-one", "31"}, {"thirty-two", "32"}, {"thirty-three", "33"},
- {"thirty-four", "34"}, {"thirty-five", "35"}, {"thirty-six", "36"},
- {"thirty-seven", "37"}, {"thirty-eight", "38"}, {"thirty-nine", "39"},
- {"forty-one", "41"}, {"forty-two", "42"}, {"forty-three", "43"},
- {"forty-four", "44"}, {"forty-five", "45"}, {"forty-six", "46"},
- {"forty-seven", "47"}, {"forty-eight", "48"}, {"forty-nine", "49"},
- {"fifty-one", "51"}, {"fifty-two", "52"}, {"fifty-three", "53"},
- {"fifty-four", "54"}, {"fifty-five", "55"}, {"fifty-six", "56"},
- {"fifty-seven", "57"}, {"fifty-eight", "58"}, {"fifty-nine", "59"},
- {"sixty-one", "61"}, {"sixty-two", "62"}, {"sixty-three", "63"},
- {"sixty-four", "64"}, {"sixty-five", "65"}, {"sixty-six", "66"},
- {"sixty-seven", "67"}, {"sixty-eight", "68"}, {"sixty-nine", "69"},
- {"seventy-one", "71"}, {"seventy-two", "72"}, {"seventy-three", "73"},
- {"seventy-four", "74"}, {"seventy-five", "75"}, {"seventy-six", "76"},
- {"seventy-seven", "77"}, {"seventy-eight", "78"},
- {"seventy-nine", "79"}, {"eighty-one", "81"}, {"eighty-two", "82"},
- {"eighty-three", "83"}, {"eighty-four", "84"}, {"eighty-five", "85"},
- {"eighty-six", "86"}, {"eighty-seven", "87"}, {"eighty-eight", "88"},
- {"eighty-nine", "89"}, {"ninety-one", "91"}, {"ninety-two", "92"},
- {"ninety-three", "93"}, {"ninety-four", "94"}, {"ninety-five", "95"},
- {"ninety-six", "96"}, {"ninety-seven", "97"}, {"ninety-eight", "98"},
- {"ninety-nine", "99"}},
- // larger units function as suffixes now
- // assume never have three hundred four, three hundred and four
- {{"hundred", "00"}, {"thousand", "000"}},
- // single numeral words now
- // some would have been ambiguous
- {{"seventeen", "17"}, {"thirteen", "13"}, {"fourteen", "14"},
- {"eighteen", "18"}, {"nineteen", "19"}, {"fifteen", "15"},
- {"sixteen", "16"}, {"seventy", "70"}, {"eleven", "11"},
- {"twelve", "12"}, {"twenty", "20"}, {"thirty", "30"}, {"eighty", "80"},
- {"ninety", "90"}, {"three", "3"}, {"seven", "7"}, {"eight", "8"},
- {"forty", "40"}, {"fifty", "50"}, {"sixty", "60"}, {"zero", "0"},
- {"four", "4"}, {"five", "5"}, {"nine", "9"}, {"one", "1"}, {"two", "2"},
- {"six", "6"}, {"ten", "10"}},
- // now do letter for digit substitutions
- {{"oh", "0"}, {"o", "0"}, {"i", "1"}, {"l", "1"}}};
+ new String[][][] {
+ {{"&#\\d{1,3};", ""}}, // first simply remove numeric entities
+ {
+ {"th0usand", "thousand"}, // handle common misspellings
+ {"th1rteen", "thirteen"},
+ {"f0urteen", "fourteen"},
+ {"e1ghteen", "eighteen"},
+ {"n1neteen", "nineteen"},
+ {"f1fteen", "fifteen"},
+ {"s1xteen", "sixteen"},
+ {"th1rty", "thirty"},
+ {"e1ghty", "eighty"},
+ {"n1nety", "ninety"},
+ {"fourty", "forty"},
+ {"f0urty", "forty"},
+ {"e1ght", "eight"},
+ {"f0rty", "forty"},
+ {"f1fty", "fifty"},
+ {"s1xty", "sixty"},
+ {"zer0", "zero"},
+ {"f0ur", "four"},
+ {"f1ve", "five"},
+ {"n1ne", "nine"},
+ {"0ne", "one"},
+ {"tw0", "two"},
+ {"s1x", "six"}
+ },
+ // mixed compound numeral words
+ // consider 7teen, etc.
+ {
+ {"twenty[\\W_]{0,3}1", "twenty-one"},
+ {"twenty[\\W_]{0,3}2", "twenty-two"},
+ {"twenty[\\W_]{0,3}3", "twenty-three"},
+ {"twenty[\\W_]{0,3}4", "twenty-four"},
+ {"twenty[\\W_]{0,3}5", "twenty-five"},
+ {"twenty[\\W_]{0,3}6", "twenty-six"},
+ {"twenty[\\W_]{0,3}7", "twenty-seven"},
+ {"twenty[\\W_]{0,3}8", "twenty-eight"},
+ {"twenty[\\W_]{0,3}9", "twenty-nine"},
+ {"thirty[\\W_]{0,3}1", "thirty-one"},
+ {"thirty[\\W_]{0,3}2", "thirty-two"},
+ {"thirty[\\W_]{0,3}3", "thirty-three"},
+ {"thirty[\\W_]{0,3}4", "thirty-four"},
+ {"thirty[\\W_]{0,3}5", "thirty-five"},
+ {"thirty[\\W_]{0,3}6", "thirty-six"},
+ {"thirty[\\W_]{0,3}7", "thirty-seven"},
+ {"thirty[\\W_]{0,3}8", "thirty-eight"},
+ {"thirty[\\W_]{0,3}9", "thirty-nine"},
+ {"forty[\\W_]{0,3}1", "forty-one"},
+ {"forty[\\W_]{0,3}2", "forty-two"},
+ {"forty[\\W_]{0,3}3", "forty-three"},
+ {"forty[\\W_]{0,3}4", "forty-four"},
+ {"forty[\\W_]{0,3}5", "forty-five"},
+ {"forty[\\W_]{0,3}6", "forty-six"},
+ {"forty[\\W_]{0,3}7", "forty-seven"},
+ {"forty[\\W_]{0,3}8", "forty-eight"},
+ {"forty[\\W_]{0,3}9", "forty-nine"},
+ {"fifty[\\W_]{0,3}1", "fifty-one"},
+ {"fifty[\\W_]{0,3}2", "fifty-two"},
+ {"fifty[\\W_]{0,3}3", "fifty-three"},
+ {"fifty[\\W_]{0,3}4", "fifty-four"},
+ {"fifty[\\W_]{0,3}5", "fifty-five"},
+ {"fifty[\\W_]{0,3}6", "fifty-six"},
+ {"fifty[\\W_]{0,3}7", "fifty-seven"},
+ {"fifty[\\W_]{0,3}8", "fifty-eight"},
+ {"fifty[\\W_]{0,3}9", "fifty-nine"},
+ {"sixty[\\W_]{0,3}1", "sixty-one"},
+ {"sixty[\\W_]{0,3}2", "sixty-two"},
+ {"sixty[\\W_]{0,3}3", "sixty-three"},
+ {"sixty[\\W_]{0,3}4", "sixty-four"},
+ {"sixty[\\W_]{0,3}5", "sixty-five"},
+ {"sixty[\\W_]{0,3}6", "sixty-six"},
+ {"sixty[\\W_]{0,3}7", "sixty-seven"},
+ {"sixty[\\W_]{0,3}8", "sixty-eight"},
+ {"sixty[\\W_]{0,3}9", "sixty-nine"},
+ {"seventy[\\W_]{0,3}1", "seventy-one"},
+ {"seventy[\\W_]{0,3}2", "seventy-two"},
+ {"seventy[\\W_]{0,3}3", "seventy-three"},
+ {"seventy[\\W_]{0,3}4", "seventy-four"},
+ {"seventy[\\W_]{0,3}5", "seventy-five"},
+ {"seventy[\\W_]{0,3}6", "seventy-six"},
+ {"seventy[\\W_]{0,3}7", "seventy-seven"},
+ {"seventy[\\W_]{0,3}8", "seventy-eight"},
+ {"seventy[\\W_]{0,3}9", "seventy-nine"},
+ {"eighty[\\W_]{0,3}1", "eighty-one"},
+ {"eighty[\\W_]{0,3}2", "eighty-two"},
+ {"eighty[\\W_]{0,3}3", "eighty-three"},
+ {"eighty[\\W_]{0,3}4", "eighty-four"},
+ {"eighty[\\W_]{0,3}5", "eighty-five"},
+ {"eighty[\\W_]{0,3}6", "eighty-six"},
+ {"eighty[\\W_]{0,3}7", "eighty-seven"},
+ {"eighty[\\W_]{0,3}8", "eighty-eight"},
+ {"eighty[\\W_]{0,3}9", "eighty-nine"},
+ {"ninety[\\W_]{0,3}1", "ninety-one"},
+ {"ninety[\\W_]{0,3}2", "ninety-two"},
+ {"ninety[\\W_]{0,3}3", "ninety-three"},
+ {"ninety[\\W_]{0,3}4", "ninety-four"},
+ {"ninety[\\W_]{0,3}5", "ninety-five"},
+ {"ninety[\\W_]{0,3}6", "ninety-six"},
+ {"ninety[\\W_]{0,3}7", "ninety-seven"},
+ {"ninety[\\W_]{0,3}8", "ninety-eight"},
+ {"ninety[\\W_]{0,3}9", "ninety-nine"}
+ },
+ // now resolve compound numeral words
+ {
+ {"twenty-one", "21"},
+ {"twenty-two", "22"},
+ {"twenty-three", "23"},
+ {"twenty-four", "24"},
+ {"twenty-five", "25"},
+ {"twenty-six", "26"},
+ {"twenty-seven", "27"},
+ {"twenty-eight", "28"},
+ {"twenty-nine", "29"},
+ {"thirty-one", "31"},
+ {"thirty-two", "32"},
+ {"thirty-three", "33"},
+ {"thirty-four", "34"},
+ {"thirty-five", "35"},
+ {"thirty-six", "36"},
+ {"thirty-seven", "37"},
+ {"thirty-eight", "38"},
+ {"thirty-nine", "39"},
+ {"forty-one", "41"},
+ {"forty-two", "42"},
+ {"forty-three", "43"},
+ {"forty-four", "44"},
+ {"forty-five", "45"},
+ {"forty-six", "46"},
+ {"forty-seven", "47"},
+ {"forty-eight", "48"},
+ {"forty-nine", "49"},
+ {"fifty-one", "51"},
+ {"fifty-two", "52"},
+ {"fifty-three", "53"},
+ {"fifty-four", "54"},
+ {"fifty-five", "55"},
+ {"fifty-six", "56"},
+ {"fifty-seven", "57"},
+ {"fifty-eight", "58"},
+ {"fifty-nine", "59"},
+ {"sixty-one", "61"},
+ {"sixty-two", "62"},
+ {"sixty-three", "63"},
+ {"sixty-four", "64"},
+ {"sixty-five", "65"},
+ {"sixty-six", "66"},
+ {"sixty-seven", "67"},
+ {"sixty-eight", "68"},
+ {"sixty-nine", "69"},
+ {"seventy-one", "71"},
+ {"seventy-two", "72"},
+ {"seventy-three", "73"},
+ {"seventy-four", "74"},
+ {"seventy-five", "75"},
+ {"seventy-six", "76"},
+ {"seventy-seven", "77"},
+ {"seventy-eight", "78"},
+ {"seventy-nine", "79"},
+ {"eighty-one", "81"},
+ {"eighty-two", "82"},
+ {"eighty-three", "83"},
+ {"eighty-four", "84"},
+ {"eighty-five", "85"},
+ {"eighty-six", "86"},
+ {"eighty-seven", "87"},
+ {"eighty-eight", "88"},
+ {"eighty-nine", "89"},
+ {"ninety-one", "91"},
+ {"ninety-two", "92"},
+ {"ninety-three", "93"},
+ {"ninety-four", "94"},
+ {"ninety-five", "95"},
+ {"ninety-six", "96"},
+ {"ninety-seven", "97"},
+ {"ninety-eight", "98"},
+ {"ninety-nine", "99"}
+ },
+ // larger units function as suffixes now
+ // assume never have three hundred four, three hundred and four
+ {{"hundred", "00"}, {"thousand", "000"}},
+ // single numeral words now
+ // some would have been ambiguous
+ {
+ {"seventeen", "17"},
+ {"thirteen", "13"},
+ {"fourteen", "14"},
+ {"eighteen", "18"},
+ {"nineteen", "19"},
+ {"fifteen", "15"},
+ {"sixteen", "16"},
+ {"seventy", "70"},
+ {"eleven", "11"},
+ {"twelve", "12"},
+ {"twenty", "20"},
+ {"thirty", "30"},
+ {"eighty", "80"},
+ {"ninety", "90"},
+ {"three", "3"},
+ {"seven", "7"},
+ {"eight", "8"},
+ {"forty", "40"},
+ {"fifty", "50"},
+ {"sixty", "60"},
+ {"zero", "0"},
+ {"four", "4"},
+ {"five", "5"},
+ {"nine", "9"},
+ {"one", "1"},
+ {"two", "2"},
+ {"six", "6"},
+ {"ten", "10"}
+ },
+ // now do letter for digit substitutions
+ {{"oh", "0"}, {"o", "0"}, {"i", "1"}, {"l", "1"}}
+ };
// Regex to identify a phone number
static final String cleanPhoneRegex = "([2-9]\\d{2}[2-9]\\d{6})";
// Regex which attempts to ignore punctuation and other distractions.
static final String phoneRegex =
- "([{(<]{0,3}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}" +
- "[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}\\d" +
- "[\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d)";
+ "([{(<]{0,3}[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}"
+ + "[2-9][\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,6}\\d"
+ + "[\\W_]{0,3}\\d[\\W_]{0,3}\\d[\\W_]{0,3}\\d)";
public static ArrayList<String> extractPhoneNumbers(String text) {
text = clean(text);
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java
index b7ce5c7..49c5110 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecorator.java
@@ -25,16 +25,13 @@
import org.xml.sax.helpers.DefaultHandler;
/**
- * Decorator base class for the {@link ContentHandler} interface. This class
- * simply delegates all SAX events calls to an underlying decorated handler
- * instance. Subclasses can provide extra decoration by overriding one or more
- * of the SAX event methods.
+ * Decorator base class for the {@link ContentHandler} interface. This class simply delegates all
+ * SAX event calls to an underlying decorated handler instance. Subclasses can provide extra
+ * decoration by overriding one or more of the SAX event methods.
*/
public class ContentHandlerDecorator extends DefaultHandler {
- /**
- * Decorated SAX event handler.
- */
+ /** Decorated SAX event handler. */
private ContentHandler handler;
/**
@@ -48,18 +45,18 @@
}
/**
- * Creates a decorator that by default forwards incoming SAX events to
- * a dummy content handler that simply ignores all the events. Subclasses
- * should use the {@link #setContentHandler(ContentHandler)} method to
- * switch to a more usable underlying content handler.
+ * Creates a decorator that by default forwards incoming SAX events to a dummy content handler
+ * that simply ignores all the events. Subclasses should use the {@link
+ * #setContentHandler(ContentHandler)} method to switch to a more usable underlying content
+ * handler.
*/
protected ContentHandlerDecorator() {
this(new DefaultHandler());
}
/**
- * Sets the underlying content handler. All future SAX events will be
- * directed to this handler instead of the one that was previously used.
+ * Sets the underlying content handler. All future SAX events will be directed to this handler
+ * instead of the one that was previously used.
*
* @param handler content handler
*/
@@ -170,48 +167,46 @@
}
/**
- * Handle any exceptions thrown by methods in this class. This method
- * provides a single place to implement custom exception handling. The
- * default behaviour is simply to re-throw the given exception, but
- * subclasses can also provide alternative ways of handling the situation.
- *
- * If the wrapped handler is itself a ContentHandlerDecorator, the call
- * is delegated to the wrapped handler's {@link ContentHandlerDecorator#handleException(SAXException)}
+ * Handle any exceptions thrown by methods in this class. This method provides a single place to
+ * implement custom exception handling. The default behaviour is simply to re-throw the given
+ * exception, but subclasses can also provide alternative ways of handling the situation.
+ *
+ * <p>If the wrapped handler is itself a ContentHandlerDecorator, the call is delegated to the
+ * wrapped handler's {@link ContentHandlerDecorator#handleException(SAXException)}
*
* @param exception the exception that was thrown
* @throws SAXException the exception (if any) thrown to the client
*/
protected void handleException(SAXException exception) throws SAXException {
if (handler instanceof ContentHandlerDecorator) {
- ((ContentHandlerDecorator)handler).handleException(exception);
+ ((ContentHandlerDecorator) handler).handleException(exception);
} else {
throw exception;
}
}
@Override
- public void warning (SAXParseException exception) throws SAXException {
+ public void warning(SAXParseException exception) throws SAXException {
if (handler instanceof ErrorHandler) {
- ((ErrorHandler)handler).warning(exception);
+ ((ErrorHandler) handler).warning(exception);
} else {
super.warning(exception);
}
}
@Override
- public void error (SAXParseException exception) throws SAXException {
+ public void error(SAXParseException exception) throws SAXException {
if (handler instanceof ErrorHandler) {
- ((ErrorHandler)handler).error(exception);
+ ((ErrorHandler) handler).error(exception);
} else {
super.error(exception);
}
}
@Override
- public void fatalError (SAXParseException exception)
- throws SAXException {
+ public void fatalError(SAXParseException exception) throws SAXException {
if (handler instanceof ErrorHandler) {
- ((ErrorHandler)handler).fatalError(exception);
+ ((ErrorHandler) handler).fatalError(exception);
} else {
super.fatalError(exception);
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
index 967e186..df92df0 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerDecoratorFactory.java
@@ -17,14 +17,12 @@
package org.apache.tika.sax;
import java.io.Serializable;
-
-import org.xml.sax.ContentHandler;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
public interface ContentHandlerDecoratorFactory extends Serializable {
- ContentHandler decorate(ContentHandler contentHandler, Metadata metadata,
- ParseContext parseContext);
+ ContentHandler decorate(
+ ContentHandler contentHandler, Metadata metadata, ParseContext parseContext);
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
index dc2f338..81962b8 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ContentHandlerFactory.java
@@ -16,19 +16,14 @@
*/
package org.apache.tika.sax;
-
import java.io.OutputStream;
import java.io.Serializable;
import java.nio.charset.Charset;
-
import org.xml.sax.ContentHandler;
-/**
- * Interface to allow easier injection of code for getting a new ContentHandler
- */
+/** Interface to allow easier injection of code for getting a new ContentHandler */
public interface ContentHandlerFactory extends Serializable {
ContentHandler getNewContentHandler();
ContentHandler getNewContentHandler(OutputStream os, Charset charset);
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java
index b76c4d7..fb5c645 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/DIFContentHandler.java
@@ -17,19 +17,17 @@
package org.apache.tika.sax;
import java.util.Stack;
-
+import org.apache.tika.metadata.Metadata;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.metadata.Metadata;
-
public class DIFContentHandler extends DefaultHandler {
- private static final char[] NEWLINE = new char[]{'\n'};
- private static final char[] TABSPACE = new char[]{'\t'};
+ private static final char[] NEWLINE = new char[] {'\n'};
+ private static final char[] TABSPACE = new char[] {'\t'};
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
private final Stack<String> treeStack;
@@ -65,10 +63,10 @@
this.delegate.characters(title.toCharArray(), 0, title.length());
this.delegate.endElement("", "h3", "h3");
}
- if (this.treeStack.peek().equals("Southernmost_Latitude") ||
- this.treeStack.peek().equals("Northernmost_Latitude") ||
- this.treeStack.peek().equals("Westernmost_Longitude") ||
- this.treeStack.peek().equals("Easternmost_Longitude")) {
+ if (this.treeStack.peek().equals("Southernmost_Latitude")
+ || this.treeStack.peek().equals("Northernmost_Latitude")
+ || this.treeStack.peek().equals("Westernmost_Longitude")
+ || this.treeStack.peek().equals("Easternmost_Longitude")) {
this.delegate.characters(NEWLINE, 0, NEWLINE.length);
this.delegate.characters(TABSPACE, 0, TABSPACE.length);
this.delegate.characters(TABSPACE, 0, TABSPACE.length);
@@ -146,5 +144,4 @@
public String toString() {
return delegate.toString();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
index 4f9d30c..ec28e1b 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ElementMappingContentHandler.java
@@ -19,26 +19,23 @@
import java.util.Collections;
import java.util.Map;
import javax.xml.namespace.QName;
-
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
/**
- * Content handler decorator that maps element <code>QName</code>s using
- * a <code>Map</code>. Not mappable elements are not forwarded.
- * Attributes may also be mapped (for each element different using
- * a <code>Map</code> for attributes), not mappable attributes are not
- * forwarded. The default is to not map any attributes and therefore do
- * not forward any of them.
+ * Content handler decorator that maps element <code>QName</code>s using a <code>Map</code>.
+ * Non-mappable elements are not forwarded. Attributes may also be mapped (using a different
+ * <code>Map</code> of attributes for each element); non-mappable attributes are not forwarded. The
+ * default is not to map any attributes and therefore not to forward any of them.
*/
public class ElementMappingContentHandler extends ContentHandlerDecorator {
private final Map<QName, TargetElement> mappings;
- public ElementMappingContentHandler(ContentHandler handler,
- Map<QName, TargetElement> mappings) {
+ public ElementMappingContentHandler(
+ ContentHandler handler, Map<QName, TargetElement> mappings) {
super(handler);
this.mappings = mappings;
}
@@ -58,7 +55,10 @@
TargetElement mapping = mappings.get(new QName(namespaceURI, localName));
if (mapping != null) {
QName tag = mapping.getMappedTagName();
- super.startElement(tag.getNamespaceURI(), tag.getLocalPart(), getQNameAsString(tag),
+ super.startElement(
+ tag.getNamespaceURI(),
+ tag.getLocalPart(),
+ getQNameAsString(tag),
mapping.mapAttributes(atts));
}
}
@@ -78,34 +78,29 @@
private final QName mappedTagName;
private final Map<QName, QName> attributesMapping;
- /**
- * Creates an TargetElement, attributes of this element will
- * be mapped as specified
- */
+ /** Creates a TargetElement; attributes of this element will be mapped as specified */
public TargetElement(QName mappedTagName, Map<QName, QName> attributesMapping) {
this.mappedTagName = mappedTagName;
this.attributesMapping = attributesMapping;
}
- /**
- * A shortcut that automatically creates the QName object
- */
- public TargetElement(String mappedTagURI, String mappedTagLocalName,
- Map<QName, QName> attributesMapping) {
+ /** A shortcut that automatically creates the QName object */
+ public TargetElement(
+ String mappedTagURI,
+ String mappedTagLocalName,
+ Map<QName, QName> attributesMapping) {
this(new QName(mappedTagURI, mappedTagLocalName), attributesMapping);
}
/**
- * Creates an TargetElement with no attributes, all attributes
- * will be deleted from SAX stream
+ * Creates a TargetElement with no attributes; all attributes will be deleted from the SAX
+ * stream
*/
public TargetElement(QName mappedTagName) {
this(mappedTagName, Collections.emptyMap());
}
- /**
- * A shortcut that automatically creates the QName object
- */
+ /** A shortcut that automatically creates the QName object */
public TargetElement(String mappedTagURI, String mappedTagLocalName) {
this(mappedTagURI, mappedTagLocalName, Collections.emptyMap());
}
@@ -123,13 +118,15 @@
for (int i = 0; i < atts.getLength(); i++) {
QName name = attributesMapping.get(new QName(atts.getURI(i), atts.getLocalName(i)));
if (name != null) {
- natts.addAttribute(name.getNamespaceURI(), name.getLocalPart(),
- getQNameAsString(name), atts.getType(i), atts.getValue(i));
+ natts.addAttribute(
+ name.getNamespaceURI(),
+ name.getLocalPart(),
+ getQNameAsString(name),
+ atts.getType(i),
+ atts.getValue(i));
}
}
return natts;
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java
index 38afb0c..6bfdb61 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/EmbeddedContentHandler.java
@@ -19,19 +19,16 @@
import org.xml.sax.ContentHandler;
/**
- * Content handler decorator that prevents the {@link #startDocument()}
- * and {@link #endDocument()} events from reaching the decorated handler.
- * This is useful when you want to direct the results of parsing multiple
- * different XML documents into a single target document without worrying
- * about the {@link #startDocument()} and {@link #endDocument()} methods
- * being called more than once.
+ * Content handler decorator that prevents the {@link #startDocument()} and {@link #endDocument()}
+ * events from reaching the decorated handler. This is useful when you want to direct the results of
+ * parsing multiple different XML documents into a single target document without worrying about the
+ * {@link #startDocument()} and {@link #endDocument()} methods being called more than once.
*/
public class EmbeddedContentHandler extends ContentHandlerDecorator {
/**
- * Created a decorator that prevents the given handler from
- * receiving {@link #startDocument()} and {@link #endDocument()}
- * events.
+ * Creates a decorator that prevents the given handler from receiving {@link #startDocument()}
+ * and {@link #endDocument()} events.
*
* @param handler the content handler to be decorated
*/
@@ -39,18 +36,11 @@
super(handler);
}
- /**
- * Ignored.
- */
+ /** Ignored. */
@Override
- public void startDocument() {
- }
+ public void startDocument() {}
- /**
- * Ignored.
- */
+ /** Ignored. */
@Override
- public void endDocument() {
- }
-
+ public void endDocument() {}
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java
index 544db0d..86d42e9 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/EndDocumentShieldingContentHandler.java
@@ -20,10 +20,9 @@
import org.xml.sax.SAXException;
/**
- * A wrapper around a {@link ContentHandler} which will ignore normal
- * SAX calls to {@link #endDocument()}, and only fire them later.
- * This is typically used to ensure that we can output the metadata
- * before ending the document
+ * A wrapper around a {@link ContentHandler} which will ignore normal SAX calls to {@link
+ * #endDocument()}, and only fire them later. This is typically used to ensure that we can output
+ * the metadata before ending the document
*/
public class EndDocumentShieldingContentHandler extends ContentHandlerDecorator {
private boolean endDocumentCalled;
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java
index e1fa733..41a714d 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ExpandedTitleContentHandler.java
@@ -17,22 +17,21 @@
package org.apache.tika.sax;
import javax.xml.transform.sax.TransformerHandler;
-
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
- * Content handler decorator which wraps a {@link TransformerHandler} in order to
- * allow the <code>TITLE</code> tag to render as <code><title></title></code>
- * rather than <code><title/></code> which is accomplished
- * by calling the {@link TransformerHandler#characters(char[], int, int)} method
- * with a <code>length</code> of 1 but a zero length char array.
- * <p>
- * This workaround is an unfortunate circumstance of the limitations imposed by the
- * implementation of the XML serialization code in the JDK brought over from
- * the xalan project which no longer allows for the specification of an
- * alternate <code>content-handler</code> via xslt templates or other means.
+ * Content handler decorator which wraps a {@link TransformerHandler} in order to allow the <code>
+ * TITLE</code> tag to render as <code><title></title></code> rather than <code>
+ * <title/></code> which is accomplished by calling the {@link
+ * TransformerHandler#characters(char[], int, int)} method with a <code>length</code> of 1 but a
+ * zero length char array.
+ *
+ * <p>This workaround is an unfortunate circumstance of the limitations imposed by the
+ * implementation of the XML serialization code in the JDK brought over from the xalan project which
+ * no longer allows for the specification of an alternate <code>content-handler</code> via xslt
+ * templates or other means.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-725">TIKA-725</a>
*/
@@ -85,5 +84,4 @@
super.characters(ch, start, length);
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/Link.java b/tika-core/src/main/java/org/apache/tika/sax/Link.java
index cf3c25d..150a791 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/Link.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/Link.java
@@ -119,5 +119,4 @@
}
return builder.toString();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java
index 310a183..89f4262 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/LinkContentHandler.java
@@ -21,37 +21,27 @@
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
-
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;
-/**
- * Content handler that collects links from an XHTML document.
- */
+/** Content handler that collects links from an XHTML document. */
public class LinkContentHandler extends DefaultHandler {
/**
- * Stack of link builders, one for each level of nested links currently
- * being processed. A usual case of a nested link would be a hyperlinked
- * image (<code>&a href="..."><img src="..."><></code>),
- * but it's possible (though unlikely) for also other kinds of nesting
- * to occur.
+ * Stack of link builders, one for each level of nested links currently being processed. A usual
+ * case of a nested link would be a hyperlinked image (<code>
+ * &a href="..."><img src="..."><></code>), but it's possible (though unlikely)
+ * for other kinds of nesting to occur as well.
*/
private final LinkedList<LinkBuilder> builderStack = new LinkedList<>();
- /**
- * Collected links
- */
+ /** Collected links */
private final List<Link> links = new ArrayList<>();
- /**
- * Whether to collapse whitespace in anchor text
- */
+ /** Whether to collapse whitespace in anchor text */
private final boolean collapseWhitespaceInAnchor;
- /**
- * Default constructor
- */
+ /** Default constructor */
public LinkContentHandler() {
this(false);
}
@@ -76,7 +66,7 @@
return links;
}
- //-------------------------------------------------------< ContentHandler>
+ // -------------------------------------------------------< ContentHandler>
@Override
public void startElement(String uri, String local, String name, Attributes attributes) {
@@ -133,8 +123,11 @@
@Override
public void endElement(String uri, String local, String name) {
if (!builderStack.isEmpty() && XHTML.equals(uri)) {
- if ("a".equals(local) || "img".equals(local) || "link".equals(local) ||
- "script".equals(local) || "iframe".equals(local)) {
+ if ("a".equals(local)
+ || "img".equals(local)
+ || "link".equals(local)
+ || "script".equals(local)
+ || "iframe".equals(local)) {
// ensure this is the correct builder. not all </script> tags correspond
// to a LinkBuilder, e.g. for embedded scripts
if (builderStack.getFirst().getType().equals(local)) {
@@ -144,5 +137,4 @@
}
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java
index 6461e09..729d1fd 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/OfflineContentHandler.java
@@ -21,9 +21,9 @@
import org.xml.sax.InputSource;
/**
- * Content handler decorator that always returns an empty stream from the
- * {@link #resolveEntity(String, String)} method to prevent potential
- * network or other external resources from being accessed by an XML parser.
+ * Content handler decorator that always returns an empty stream from the {@link
+ * #resolveEntity(String, String)} method to prevent potential network or other external resources
+ * from being accessed by an XML parser.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-185">TIKA-185</a>
*/
@@ -34,12 +34,10 @@
}
/**
- * Returns an empty stream. This will make an XML parser silently
- * ignore any external entities.
+ * Returns an empty stream. This will make an XML parser silently ignore any external entities.
*/
@Override
public InputSource resolveEntity(String publicId, String systemId) {
return new InputSource(new ClosedInputStream());
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java
index 9811740..46e87a4 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/PhoneExtractingContentHandler.java
@@ -19,34 +19,29 @@
import java.util.Arrays;
import java.util.List;
-
+import org.apache.tika.metadata.Metadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.metadata.Metadata;
-
/**
* Class used to extract phone numbers while parsing.
- * <p>
- * Every time a document is parsed in Tika, the content is split into SAX events.
- * Those SAX events are handled by a ContentHandler. You can think of these events
- * as marking a tag in an HTML file. Once you're finished parsing, you can call
- * handler.toString(), for example, to get the text contents of the file. On the other
- * hand, any of the metadata of the file will be added to the Metadata object passed
- * in during the parse() call. So, the Parser class sends metadata to the Metadata
- * object and content to the ContentHandler.
- * <p>
- * This class is an example of how to combine a ContentHandler and a Metadata.
- * As content is passed to the handler, we first check to see if it matches a
- * textual pattern for a phone number. If the extracted content is a phone number,
- * we add it to the metadata under the key "phonenumbers". So, if you used this
- * ContentHandler when you parsed a document, then called
- * metadata.getValues("phonenumbers"), you would get an array of Strings of phone
- * numbers found in the document.
- * <p>
- * Please see the PhoneExtractingContentHandlerTest for an example of how to use
- * this class.
+ *
+ * <p>Every time a document is parsed in Tika, the content is split into SAX events. Those SAX
+ * events are handled by a ContentHandler. You can think of these events as marking a tag in an HTML
+ * file. Once you're finished parsing, you can call handler.toString(), for example, to get the text
+ * contents of the file. On the other hand, any of the metadata of the file will be added to the
+ * Metadata object passed in during the parse() call. So, the Parser class sends metadata to the
+ * Metadata object and content to the ContentHandler.
+ *
+ * <p>This class is an example of how to combine a ContentHandler and a Metadata. As content is
+ * passed to the handler, we first check to see if it matches a textual pattern for a phone number.
+ * If the extracted content is a phone number, we add it to the metadata under the key
+ * "phonenumbers". So, if you used this ContentHandler when you parsed a document, then called
+ * metadata.getValues("phonenumbers"), you would get an array of Strings of phone numbers found in
+ * the document.
+ *
+ * <p>Please see the PhoneExtractingContentHandlerTest for an example of how to use this class.
*/
public class PhoneExtractingContentHandler extends ContentHandlerDecorator {
private static final String PHONE_NUMBERS = "phonenumbers";
@@ -65,22 +60,20 @@
}
/**
- * Creates a decorator that by default forwards incoming SAX events to
- * a dummy content handler that simply ignores all the events. Subclasses
- * should use the {@link #setContentHandler(ContentHandler)} method to
- * switch to a more usable underlying content handler.
- * Also creates a dummy Metadata object to store phone numbers in.
+ * Creates a decorator that by default forwards incoming SAX events to a dummy content handler
+ * that simply ignores all the events. Subclasses should use the {@link
+ * #setContentHandler(ContentHandler)} method to switch to a more usable underlying content
+ * handler. Also creates a dummy Metadata object to store phone numbers in.
*/
protected PhoneExtractingContentHandler() {
this(new DefaultHandler(), new Metadata());
}
/**
- * The characters method is called whenever a Parser wants to pass raw...
- * characters to the ContentHandler. But, sometimes, phone numbers are split
- * accross different calls to characters, depending on the specific Parser
- * used. So, we simply add all characters to a StringBuilder and analyze it
- * once the document is finished.
+ * The characters method is called whenever a Parser wants to pass raw characters to the
+ * ContentHandler. But, sometimes, phone numbers are split across different calls to
+ * characters, depending on the specific Parser used. So, we simply add all characters to a
+ * StringBuilder and analyze it once the document is finished.
*/
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
@@ -93,10 +86,9 @@
}
}
-
/**
- * This method is called whenever the Parser is done parsing the file. So,
- * we check the output for any phone numbers.
+ * This method is called whenever the Parser is done parsing the file. So, we check the output
+ * for any phone numbers.
*/
@Override
public void endDocument() throws SAXException {
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
index 7ad6f8b..e88dd53 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java
@@ -18,57 +18,52 @@
import java.util.LinkedList;
import java.util.List;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.metadata.filter.MetadataFilter;
import org.apache.tika.metadata.filter.NoOpFilter;
import org.apache.tika.utils.ParserUtils;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
/**
- * This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}.
- * See its documentation for more details.
- * <p>
- * This caches the a metadata object for each embedded file and for the container file.
- * It places the extracted content in the metadata object, with this key:
- * {@link TikaCoreProperties#TIKA_CONTENT}
- * If memory is a concern, subclass AbstractRecursiveParserWrapperHandler to handle each
- * embedded document.
- * <p>
- * <b>NOTE: This handler must only be used with the {@link
+ * This is the default implementation of {@link AbstractRecursiveParserWrapperHandler}. See its
+ * documentation for more details.
+ *
+ * <p>This caches a metadata object for each embedded file and for the container file. It places
+ * the extracted content in the metadata object, with this key: {@link
+ * TikaCoreProperties#TIKA_CONTENT} If memory is a concern, subclass
+ * AbstractRecursiveParserWrapperHandler to handle each embedded document.
+ *
+ * <p><b>NOTE: This handler must only be used with the {@link
* org.apache.tika.parser.RecursiveParserWrapper}</b>
- * </p>
*/
public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler {
protected final List<Metadata> metadataList = new LinkedList<>();
private final MetadataFilter metadataFilter;
- /**
- * Create a handler with no limit on the number of embedded resources
- */
+ /** Create a handler with no limit on the number of embedded resources */
public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) {
this(contentHandlerFactory, -1, NoOpFilter.NOOP_FILTER);
}
/**
- * Create a handler that limits the number of embedded resources that will be
- * parsed
+ * Create a handler that limits the number of embedded resources that will be parsed
*
* @param maxEmbeddedResources number of embedded resources that will be parsed
*/
- public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory,
- int maxEmbeddedResources) {
+ public RecursiveParserWrapperHandler(
+ ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) {
this(contentHandlerFactory, maxEmbeddedResources, NoOpFilter.NOOP_FILTER);
}
- public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory,
- int maxEmbeddedResources, MetadataFilter metadataFilter) {
+ public RecursiveParserWrapperHandler(
+ ContentHandlerFactory contentHandlerFactory,
+ int maxEmbeddedResources,
+ MetadataFilter metadataFilter) {
super(contentHandlerFactory, maxEmbeddedResources);
this.metadataFilter = metadataFilter;
}
@@ -77,7 +72,7 @@
* This is called before parsing an embedded document
*
* @param contentHandler - local content handler to use on the embedded document
- * @param metadata metadata to use for the embedded document
+ * @param metadata metadata to use for the embedded document
* @throws SAXException
*/
@Override
@@ -90,7 +85,7 @@
* This is called after parsing an embedded document.
*
* @param contentHandler local contenthandler used on the embedded document
- * @param metadata metadata from the embedded document
+ * @param metadata metadata from the embedded document
* @throws SAXException
*/
@Override
@@ -111,7 +106,7 @@
/**
* @param contentHandler content handler used on the main document
- * @param metadata metadata from the main document
+ * @param metadata metadata from the main document
* @throws SAXException
*/
@Override
@@ -131,7 +126,7 @@
/**
* @return a list of Metadata objects, one for the main document and one for each embedded
- * document
+ * document
*/
public List<Metadata> getMetadataList() {
return metadataList;
@@ -140,14 +135,15 @@
void addContent(ContentHandler handler, Metadata metadata) {
if (handler.getClass().equals(DefaultHandler.class)) {
- //no-op: we can't rely on just testing for
- //empty content because DefaultHandler's toString()
- //returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd"
+ // no-op: we can't rely on just testing for
+ // empty content because DefaultHandler's toString()
+ // returns e.g. "org.xml.sax.helpers.DefaultHandler@6c8b1edd"
} else {
String content = handler.toString();
if (content != null && content.trim().length() > 0) {
metadata.add(TikaCoreProperties.TIKA_CONTENT, content);
- metadata.add(TikaCoreProperties.TIKA_CONTENT_HANDLER,
+ metadata.add(
+ TikaCoreProperties.TIKA_CONTENT_HANDLER,
handler.getClass().getSimpleName());
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java
index c250fa2..dfcd6ac 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/RichTextContentHandler.java
@@ -18,20 +18,17 @@
package org.apache.tika.sax;
import java.io.Writer;
-
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
/**
- * Content handler for Rich Text, it will extract XHTML <img/>
- * tag <alt/> attribute and XHTML <a/> tag <name/>
- * attribute into the output.
+ * Content handler for Rich Text, it will extract XHTML <img/> tag <alt/> attribute and
+ * XHTML <a/> tag <name/> attribute into the output.
*/
public class RichTextContentHandler extends WriteOutContentHandler {
/**
- * Creates a content handler that writes XHTML body character events to
- * the given writer.
+ * Creates a content handler that writes XHTML body character events to the given writer.
*
* @param writer writer
*/
diff --git a/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
index b04c327..e03bce1 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/SafeContentHandler.java
@@ -27,38 +27,36 @@
import org.xml.sax.helpers.AttributesImpl;
/**
- * Content handler decorator that makes sure that the character events
- * ({@link #characters(char[], int, int)} or
- * {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated
- * content handler contain only valid XML characters. All invalid characters
- * are replaced with the Unicode replacement character U+FFFD (though a
- * subclass may change this by overriding the {@link #writeReplacement(Output)} method).
- * <p>
- * The XML standard defines the following Unicode character ranges as
- * valid XML characters:
+ * Content handler decorator that makes sure that the character events ({@link #characters(char[],
+ * int, int)} or {@link #ignorableWhitespace(char[], int, int)}) passed to the decorated content
+ * handler contain only valid XML characters. All invalid characters are replaced with the Unicode
+ * replacement character U+FFFD (though a subclass may change this by overriding the {@link
+ * #writeReplacement(Output)} method).
+ *
+ * <p>The XML standard defines the following Unicode character ranges as valid XML characters:
+ *
* <pre>
* #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
* </pre>
- * <p>
- * Note that currently this class only detects those invalid characters whose
- * UTF-16 representation fits a single char. Also, this class does not ensure
- * that the UTF-16 encoding of incoming characters is correct.
+ *
+ * <p>Note that currently this class only detects those invalid characters whose UTF-16
+ * representation fits a single char. Also, this class does not ensure that the UTF-16 encoding of
+ * incoming characters is correct.
*/
public class SafeContentHandler extends ContentHandlerDecorator {
+ /** Replacement for invalid characters. */
+ private static final char[] REPLACEMENT = new char[] {'\ufffd'};
+
/**
- * Replacement for invalid characters.
- */
- private static final char[] REPLACEMENT = new char[]{'\ufffd'};
- /**
- * Output through the {@link ContentHandler#characters(char[], int, int)}
- * method of the decorated content handler.
+ * Output through the {@link ContentHandler#characters(char[], int, int)} method of the
+ * decorated content handler.
*/
private final Output charactersOutput = SafeContentHandler.super::characters;
+
/**
- * Output through the
- * {@link ContentHandler#ignorableWhitespace(char[], int, int)}
- * method of the decorated content handler.
+ * Output through the {@link ContentHandler#ignorableWhitespace(char[], int, int)} method of the
+ * decorated content handler.
*/
private final Output ignorableWhitespaceOutput = SafeContentHandler.super::ignorableWhitespace;
@@ -67,13 +65,12 @@
}
/**
- * Filters and outputs the contents of the given input buffer. Any
- * invalid characters in the input buffer area handled by sending a
- * replacement (a space character) to the given output. Any sequences
- * of valid characters are passed as-is to the given output.
+ * Filters and outputs the contents of the given input buffer. Any invalid characters in the
+ * input buffer are handled by sending a replacement (a space character) to the given output.
+ * Any sequences of valid characters are passed as-is to the given output.
*
- * @param ch input buffer
- * @param start start offset within the buffer
+ * @param ch input buffer
+ * @param start start offset within the buffer
* @param length number of characters to read from the buffer
* @param output output channel
* @throws SAXException if the filtered characters could not be written out
@@ -110,8 +107,8 @@
* Checks if the given string contains any invalid XML characters.
*
* @param value string to be checked
- * @return <code>true</code> if the string contains invalid XML characters,
- * <code>false</code> otherwise
+ * @return <code>true</code> if the string contains invalid XML characters, <code>false</code>
+ * otherwise
*/
private boolean isInvalid(String value) {
char[] ch = value.toCharArray();
@@ -129,17 +126,17 @@
}
/**
- * Checks whether the given Unicode character is an invalid XML character
- * and should be replaced for output. Subclasses can override this method
- * to use an alternative definition of which characters should be replaced
- * in the XML output. The default definition from the XML specification is:
+ * Checks whether the given Unicode character is an invalid XML character and should be replaced
+ * for output. Subclasses can override this method to use an alternative definition of which
+ * characters should be replaced in the XML output. The default definition from the XML
+ * specification is:
+ *
* <pre>
* Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
* </pre>
*
* @param ch character
- * @return <code>true</code> if the character should be replaced,
- * <code>false</code> otherwise
+ * @return <code>true</code> if the character should be replaced, <code>false</code> otherwise
*/
protected boolean isInvalid(int ch) {
if (ch < 0x20) {
@@ -154,8 +151,8 @@
}
/**
- * Outputs the replacement for an invalid character. Subclasses can
- * override this method to use a custom replacement.
+ * Outputs the replacement for an invalid character. Subclasses can override this method to use
+ * a custom replacement.
*
* @param output where the replacement is written to
* @throws SAXException if the replacement could not be written
@@ -169,7 +166,7 @@
throws SAXException {
// TODO: enable this, but some parsers currently
// trip it
- //assert verifyStartElement(name);
+ // assert verifyStartElement(name);
// Look for any invalid characters in attribute values.
for (int i = 0; i < atts.getLength(); i++) {
if (isInvalid(atts.getValue(i))) {
@@ -183,8 +180,12 @@
filter(value.toCharArray(), 0, value.length(), buffer);
value = buffer.toString();
}
- filtered.addAttribute(atts.getURI(j), atts.getLocalName(j), atts.getQName(j),
- atts.getType(j), value);
+ filtered.addAttribute(
+ atts.getURI(j),
+ atts.getLocalName(j),
+ atts.getQName(j),
+ atts.getType(j),
+ value);
}
atts = filtered;
break;
@@ -197,11 +198,10 @@
public void endElement(String uri, String localName, String name) throws SAXException {
// TODO: enable this, but some parsers currently
// trip it
- //assert verifyEndElement(name);
+ // assert verifyEndElement(name);
super.endElement(uri, localName, name);
}
-
/*
private final List<String> elements = new ArrayList<String>();
@@ -235,13 +235,13 @@
}
*/
- //------------------------------------------------------< ContentHandler >
+ // ------------------------------------------------------< ContentHandler >
@Override
public void endDocument() throws SAXException {
// TODO: enable this, but some parsers currently
// trip it
- //assert verifyEndDocument();
+ // assert verifyEndDocument();
super.endDocument();
}
@@ -256,8 +256,8 @@
}
/**
- * Internal interface that allows both character and
- * ignorable whitespace content to be filtered the same way.
+ * Internal interface that allows both character and ignorable whitespace content to be filtered
+ * the same way.
*/
protected interface Output {
void write(char[] ch, int start, int length) throws SAXException;
@@ -274,7 +274,5 @@
public String toString() {
return builder.toString();
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
index 3f9f3c4..3be1632 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
@@ -18,72 +18,56 @@
import java.io.IOException;
import java.util.LinkedList;
-
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-
/**
- * Content handler decorator that attempts to prevent denial of service
- * attacks against Tika parsers.
- * <p>
- * Currently this class simply compares the number of output characters
- * to to the number of input bytes and keeps track of the XML nesting levels.
- * An exception gets thrown if the output seems excessive compared to the
- * input document. This is a strong indication of a zip bomb.
+ * Content handler decorator that attempts to prevent denial of service attacks against Tika
+ * parsers.
+ *
+ * <p>Currently this class simply compares the number of output characters to the number of input
+ * bytes and keeps track of the XML nesting levels. An exception gets thrown if the output seems
+ * excessive compared to the input document. This is a strong indication of a zip bomb.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-216">TIKA-216</a>
* @since Apache Tika 0.4
*/
public class SecureContentHandler extends ContentHandlerDecorator {
- /**
- * The input stream that Tika is parsing.
- */
+ /** The input stream that Tika is parsing. */
private final TikaInputStream stream;
- /**
- * Current number of nested <div class="package-entr"> elements.
- */
+
+ /** Current number of nested <div class="package-entr"> elements. */
private final LinkedList<Integer> packageEntryDepths = new LinkedList<>();
- /**
- * Number of output characters that Tika has produced so far.
- */
+
+ /** Number of output characters that Tika has produced so far. */
private long characterCount = 0;
- /**
- * The current XML element depth.
- */
+
+ /** The current XML element depth. */
private int currentDepth = 0;
- /**
- * Output threshold.
- */
+
+ /** Output threshold. */
private long threshold = 1000000;
- /**
- * Maximum compression ratio.
- */
+ /** Maximum compression ratio. */
private long ratio = 100;
- /**
- * Maximum XML element nesting level.
- */
+ /** Maximum XML element nesting level. */
private int maxDepth = 100;
- /**
- * Maximum package entry nesting level.
- */
+ /** Maximum package entry nesting level. */
private int maxPackageEntryDepth = 10;
/**
- * Decorates the given content handler with zip bomb prevention based
- * on the count of bytes read from the given counting input stream.
- * The resulting decorator can be passed to a Tika parser along with
- * the given counting input stream.
+ * Decorates the given content handler with zip bomb prevention based on the count of bytes read
+ * from the given counting input stream. The resulting decorator can be passed to a Tika parser
+ * along with the given counting input stream.
*
* @param handler the content handler to be decorated
- * @param stream the input stream to be parsed
+ * @param stream the input stream to be parsed
*/
public SecureContentHandler(ContentHandler handler, TikaInputStream stream) {
super(handler);
@@ -99,12 +83,10 @@
return threshold;
}
-
/**
- * Sets the threshold for output characters before the zip bomb prevention
- * is activated. This avoids false positives in cases where an otherwise
- * normal document for some reason starts with a highly compressible
- * sequence of bytes.
+ * Sets the threshold for output characters before the zip bomb prevention is activated. This
+ * avoids false positives in cases where an otherwise normal document for some reason starts
+ * with a highly compressible sequence of bytes.
*
* @param threshold new output threshold
*/
@@ -112,7 +94,6 @@
this.threshold = threshold;
}
-
/**
* Returns the maximum compression ratio.
*
@@ -122,11 +103,9 @@
return ratio;
}
-
/**
- * Sets the ratio between output characters and input bytes. If this
- * ratio is exceeded (after the output threshold has been reached) then
- * an exception gets thrown.
+ * Sets the ratio between output characters and input bytes. If this ratio is exceeded (after
+ * the output threshold has been reached) then an exception gets thrown.
*
* @param ratio new maximum compression ratio
*/
@@ -144,8 +123,8 @@
}
/**
- * Sets the maximum XML element nesting level. If this depth level is
- * exceeded then an exception gets thrown.
+ * Sets the maximum XML element nesting level. If this depth level is exceeded then an exception
+ * gets thrown.
*
* @param depth maximum XML element nesting level
*/
@@ -163,8 +142,8 @@
}
/**
- * Sets the maximum package entry nesting level. If this depth level is
- * exceeded then an exception gets thrown.
+ * Sets the maximum package entry nesting level. If this depth level is exceeded then an
+ * exception gets thrown.
*
* @param depth maximum package entry nesting level
*/
@@ -173,9 +152,8 @@
}
/**
- * Converts the given {@link SAXException} to a corresponding
- * {@link TikaException} if it's caused by this instance detecting
- * a zip bomb.
+ * Converts the given {@link SAXException} to a corresponding {@link TikaException} if it's
+ * caused by this instance detecting a zip bomb.
*
* @param e SAX exception
* @throws TikaException zip bomb exception
@@ -199,9 +177,9 @@
}
/**
- * Records the given number of output characters (or more accurately
- * UTF-16 code units). Throws an exception if the recorded number of
- * characters highly exceeds the number of input bytes read.
+ * Records the given number of output characters (or more accurately UTF-16 code units). Throws
+ * an exception if the recorded number of characters highly exceeds the number of input bytes
+ * read.
*
* @param length number of new output characters produced
* @throws SAXException if a zip bomb is detected
@@ -211,8 +189,11 @@
long byteCount = getByteCount();
if (characterCount > threshold && characterCount > byteCount * ratio) {
throw new SecureSAXException(
- "Suspected zip bomb: " + byteCount + " input bytes produced " + characterCount +
- " output characters");
+ "Suspected zip bomb: "
+ + byteCount
+ + " input bytes produced "
+ + characterCount
+ + " output characters");
}
}
@@ -228,8 +209,10 @@
if ("div".equals(name) && "package-entry".equals(atts.getValue("class"))) {
packageEntryDepths.addLast(currentDepth);
if (packageEntryDepths.size() >= maxPackageEntryDepth) {
- throw new SecureSAXException("Suspected zip bomb: " + packageEntryDepths.size() +
- " levels of package entry nesting");
+ throw new SecureSAXException(
+ "Suspected zip bomb: "
+ + packageEntryDepths.size()
+ + " levels of package entry nesting");
}
}
@@ -266,9 +249,7 @@
*/
private class SecureSAXException extends SAXException {
- /**
- * Serial version UID.
- */
+ /** Serial version UID. */
private static final long serialVersionUID = 2285245380321771445L;
public SecureSAXException(String message) throws SAXException {
@@ -278,7 +259,5 @@
public boolean isCausedBy(SecureContentHandler handler) {
return SecureContentHandler.this == handler;
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java b/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java
index c8e89a0..01f6fa4 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardOrganizations.java
@@ -21,12 +21,12 @@
import java.util.TreeMap;
/**
- * This class provides a collection of the most important technical standard organizations.
- * The collection of standard organizations has been obtained from
- * <a href="https://en.wikipedia.org/wiki/List_of_technical_standard_organisations">Wikipedia</a>.
- * Currently, the list is composed of the most important international standard organizations,
- * the regional standard organizations (i.e., Africa, Americas, Asia Pacific, Europe, and Middle
- * East), and British and American standard organizations among the national-based ones.
+ * This class provides a collection of the most important technical standard organizations. The
+ * collection of standard organizations has been obtained from <a
+ * href="https://en.wikipedia.org/wiki/List_of_technical_standard_organisations">Wikipedia</a>.
+ * Currently, the list is composed of the most important international standard organizations, the
+ * regional standard organizations (i.e., Africa, Americas, Asia Pacific, Europe, and Middle East),
+ * and British and American standard organizations among the national-based ones.
*/
public class StandardOrganizations {
@@ -34,28 +34,31 @@
static {
organizations = new TreeMap<>();
- //International standard organizations
+ // International standard organizations
organizations.put("3GPP", "3rd Generation Partnership Project");
organizations.put("3GPP2", "3rd Generation Partnership Project 2");
organizations.put("Accellera", "Accellera Organization");
- organizations.put("A4L",
- "Access for Learning Community (formerly known as the Schools Interoperability " +
- "Framework)");
+ organizations.put(
+ "A4L",
+ "Access for Learning Community (formerly known as the Schools Interoperability "
+ + "Framework)");
organizations.put("AES", "Audio Engineering Society");
organizations.put("AIIM", "Association for Information and Image Management");
- organizations.put("ASAM",
- "Association for Automation and Measuring Systems - Automotive technology");
- organizations.put("ASHRAE",
- "American Society of Heating, Refrigerating and Air-Conditioning Engineers " +
- "(ASHRAE is an international organization, despite its name)");
+ organizations.put(
+ "ASAM", "Association for Automation and Measuring Systems - Automotive technology");
+ organizations.put(
+ "ASHRAE",
+ "American Society of Heating, Refrigerating and Air-Conditioning Engineers "
+ + "(ASHRAE is an international organization, despite its name)");
organizations.put("ASME", "formerly The American Society of Mechanical Engineers");
- organizations
- .put("ASTM", "ASTM (American Society for Testing and Materials) International");
+ organizations.put(
+ "ASTM", "ASTM (American Society for Testing and Materials) International");
organizations.put("ATIS", "Alliance for Telecommunications Industry Solutions");
organizations.put("AUTOSAR", "Automotive technology");
- organizations.put("BIPM, CGPM, and CIPM",
- "Bureau International des Poids et Mesures and the related organizations " +
- "established under the Metre Convention of 1875.");
+ organizations.put(
+ "BIPM, CGPM, and CIPM",
+ "Bureau International des Poids et Mesures and the related organizations "
+ + "established under the Metre Convention of 1875.");
organizations.put("CableLabs", "Cable Television Laboratories");
organizations.put("CCSDS", "Consultative Committee for Space Data Sciences");
organizations.put("CISPR", "International Special Committee on Radio Interference");
@@ -66,9 +69,10 @@
organizations.put("EKOenergy", "EKOenergy Network managed by environmental NGOs");
organizations.put("FAI", "Fédération Aéronautique Internationale");
organizations.put("GlobalPlatform", "Secure element and TEE standards");
- organizations.put("GS1",
- "Global supply chain standards (identification numbers, barcodes, electronic " +
- "commerce transactions, RFID)");
+ organizations.put(
+ "GS1",
+ "Global supply chain standards (identification numbers, barcodes, electronic "
+ + "commerce transactions, RFID)");
organizations.put("HGI", "Home Gateway Initiative");
organizations.put("HFSB", "Hedge Fund Standards Board");
organizations.put("IATA", "International Air Transport Association");
@@ -86,28 +90,32 @@
organizations.put("IPTC", "International Press Telecommunications Council");
organizations.put("ITU", "The International Telecommunication Union");
organizations.put("ITU-R", "ITU Radiocommunications Sector (formerly known as CCIR)");
- organizations.put("CCIR",
+ organizations.put(
+ "CCIR",
"Comité Consultatif International pour la Radio, a forerunner of the ITU-R");
organizations.put("ITU-T", "ITU Telecommunications Sector (formerly known as CCITT)");
- organizations.put("CCITT",
- "Comité Consultatif International Téléphonique et Télégraphique, renamed ITU-T in" +
- " 1993");
+ organizations.put(
+ "CCITT",
+ "Comité Consultatif International Téléphonique et Télégraphique, renamed ITU-T in"
+ + " 1993");
organizations.put("ITU-D", "ITU Telecom Development (formerly known as BDT)");
organizations.put("BDT", "Bureau de développement des télécommunications, renamed ITU-D");
organizations.put("IUPAC", "International Union of Pure and Applied Chemistry");
organizations.put("Liberty Alliance", "Liberty Alliance");
organizations.put("Media Grid", "Media Grid Standards Organization");
- organizations.put("NACE International",
+ organizations.put(
+ "NACE International",
"Formerly known as National Association of Corrosion Engineers");
- organizations.put("OASIS",
- "Organization for the Advancement of Structured Information Standards");
+ organizations.put(
+ "OASIS", "Organization for the Advancement of Structured Information Standards");
organizations.put("OGC", "Open Geospatial Consortium");
organizations.put("OHICC", "Organization of Hotel Industry Classification & Certification");
organizations.put("OMA", "Open Mobile Alliance");
organizations.put("OMG", "Object Management Group");
- organizations.put("OGF",
- "Open Grid Forum (merger of Global Grid Forum (GGF) and Enterprise Grid Alliance " +
- "(EGA))");
+ organizations.put(
+ "OGF",
+ "Open Grid Forum (merger of Global Grid Forum (GGF) and Enterprise Grid Alliance "
+ + "(EGA))");
organizations.put("GGF", "Global Grid Forum");
organizations.put("EGA", "Enterprise Grid Alliance");
organizations.put("OpenTravel Alliance", "OpenTravel Alliance (previously known as OTA)");
@@ -131,37 +139,38 @@
organizations.put("WHO", "World Health Organization");
organizations.put("XSF", "The XMPP Standards Foundation");
organizations.put("FAO", "Food and Agriculture Organization");
- //Regional standards organizations
- //Africa
+ // Regional standards organizations
+ // Africa
organizations.put("ARSO", "African Regional Organization for Standarization");
- organizations.put("SADCSTAN",
+ organizations.put(
+ "SADCSTAN",
"Southern African Development Community (SADC) Cooperation in Standarization");
- //Americas
+ // Americas
organizations.put("COPANT", "Pan American Standards Commission");
organizations.put("AMN", "MERCOSUR Standardization Association");
organizations.put("CROSQ", "CARICOM Regional Organization for Standards and Quality");
organizations.put("AAQG", "America's Aerospace Quality Group");
- //Asia Pacific
+ // Asia Pacific
organizations.put("PASC", "Pacific Area Standards Congress");
organizations.put("ACCSQ", "ASEAN Consultative Committee for Standards and Quality");
- //Europe
+ // Europe
organizations.put("RoyalCert", "RoyalCert International Registrars");
organizations.put("CEN", "European Committee for Standardization");
organizations.put("CENELEC", "European Committee for Electrotechnical Standardization");
organizations.put("URS", "United Registrar of Systems, UK");
organizations.put("ETSI", "European Telecommunications Standards Institute");
- organizations
- .put("EASC", "Euro-Asian Council for Standardization, Metrology and Certification");
- organizations
- .put("IRMM", "Institute for Reference Materials and Measurements (European Union)");
- //Middle East
+ organizations.put(
+ "EASC", "Euro-Asian Council for Standardization, Metrology and Certification");
+ organizations.put(
+ "IRMM", "Institute for Reference Materials and Measurements (European Union)");
+ // Middle East
organizations.put("AIDMO", "Arab Industrial Development and Mining Organization");
organizations.put("IAU", "International Arabic Union");
- //Nationally-based standards organizations
- //United Kingdom
+ // Nationally-based standards organizations
+ // United Kingdom
organizations.put("BSI", "British Standards Institution aka BSI Group");
organizations.put("DStan", "UK Defence Standardization");
- //United States of America
+ // United States of America
organizations.put("ANSI", "American National Standards Institute");
organizations.put("ACI", "American Concrete Institute");
organizations.put("NIST", "National Institute of Standards and Technology");
@@ -172,7 +181,7 @@
* organizations.
*
* @return the map containing the collection of the most important technical standard
- * organizations.
+ * organizations.
*/
public static Map<String, String> getOrganizations() {
return organizations;
@@ -183,7 +192,7 @@
* organizations.
*
* @return the regular expression containing the most important technical standard
- * organizations.
+ * organizations.
*/
public static String getOrganzationsRegex() {
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java b/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java
index 243a031..50e7503 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardReference.java
@@ -17,9 +17,7 @@
package org.apache.tika.sax;
-/**
- * Class that represents a standard reference.
- */
+/** Class that represents a standard reference. */
public class StandardReference {
private String mainOrganization;
private String separator;
@@ -27,8 +25,12 @@
private String identifier;
private double score;
- private StandardReference(String mainOrganizationAcronym, String separator,
- String secondOrganizationAcronym, String identifier, double score) {
+ private StandardReference(
+ String mainOrganizationAcronym,
+ String separator,
+ String secondOrganizationAcronym,
+ String identifier,
+ double score) {
super();
this.mainOrganization = mainOrganizationAcronym;
this.separator = separator;
@@ -105,8 +107,8 @@
this.score = 0;
}
- public StandardReferenceBuilder setSecondOrganization(String separator,
- String secondOrganization) {
+ public StandardReferenceBuilder setSecondOrganization(
+ String separator, String secondOrganization) {
this.separator = separator;
this.secondOrganization = secondOrganization;
return this;
@@ -118,8 +120,8 @@
}
public StandardReference build() {
- return new StandardReference(mainOrganization, separator, secondOrganization,
- identifier, score);
+ return new StandardReference(
+ mainOrganization, separator, secondOrganization, identifier, score);
}
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
index 006034a..0a2418c 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
@@ -19,19 +19,17 @@
import java.util.Arrays;
import java.util.List;
-
+import org.apache.tika.metadata.Metadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.metadata.Metadata;
-
/**
- * StandardsExtractingContentHandler is a Content Handler used to extract
- * standard references while parsing.
- * <p>
- * This handler relies on complex regular expressions which can be slow on some types of
- * input data.
+ * StandardsExtractingContentHandler is a Content Handler used to extract standard references while
+ * parsing.
+ *
+ * <p>This handler relies on complex regular expressions which can be slow on some types of input
+ * data.
*/
public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
public static final String STANDARD_REFERENCES = "standard_references";
@@ -44,7 +42,7 @@
/**
* Creates a decorator for the given SAX event handler and Metadata object.
*
- * @param handler SAX event handler to be decorated.
+ * @param handler SAX event handler to be decorated.
* @param metadata {@link Metadata} object.
*/
public StandardsExtractingContentHandler(ContentHandler handler, Metadata metadata) {
@@ -54,22 +52,21 @@
}
/**
- * Creates a decorator that by default forwards incoming SAX events to a
- * dummy content handler that simply ignores all the events. Subclasses
- * should use the {@link #setContentHandler(ContentHandler)} method to
- * switch to a more usable underlying content handler. Also creates a dummy
- * Metadata object to store phone numbers in.
+ * Creates a decorator that by default forwards incoming SAX events to a dummy content handler
+ * that simply ignores all the events. Subclasses should use the {@link
+ * #setContentHandler(ContentHandler)} method to switch to a more usable underlying content
+ * handler. Also creates a dummy Metadata object to store phone numbers in.
*/
protected StandardsExtractingContentHandler() {
this(new DefaultHandler(), new Metadata());
}
/**
- * Gets the threshold to be used for selecting the standard references found
- * within the text based on their score.
+ * Gets the threshold to be used for selecting the standard references found within the text
+ * based on their score.
*
- * @return the threshold to be used for selecting the standard references
- * found within the text based on their score.
+ * @return the threshold to be used for selecting the standard references found within the text
+ * based on their score.
*/
public double getThreshold() {
return threshold;
@@ -85,11 +82,10 @@
}
/**
- * The characters method is called whenever a Parser wants to pass raw
- * characters to the ContentHandler. However, standard references are often
- * split across different calls to characters, depending on the specific
- * Parser used. Therefore, we simply add all characters to a StringBuilder
- * and analyze it once the document is finished.
+ * The characters method is called whenever a Parser wants to pass raw characters to the
+ * ContentHandler. However, standard references are often split across different calls to
+ * characters, depending on the specific Parser used. Therefore, we simply add all characters to
+ * a StringBuilder and analyze it once the document is finished.
*/
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
@@ -107,8 +103,8 @@
}
/**
- * This method is called whenever the Parser is done parsing the file. So,
- * we check the output for any standard references.
+ * This method is called whenever the Parser is done parsing the file. So, we check the output
+ * for any standard references.
*/
@Override
public void endDocument() throws SAXException {
@@ -120,12 +116,11 @@
}
}
-
/**
* The number of characters to store in memory for checking for standards.
*
- * If this is unbounded, the complex regular expressions can take a long time
- * to process some types of data. Only increase this limit with great caution.
+ * <p>If this is unbounded, the complex regular expressions can take a long time to process some
+ * types of data. Only increase this limit with great caution.
*/
public void setMaxBufferLength(int maxBufferLength) {
this.maxBufferLength = maxBufferLength;
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
index 697eede..c6ef61f 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
@@ -24,37 +24,31 @@
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
/**
- * StandardText relies on regular expressions to extract standard references
- * from text.
+ * StandardText relies on regular expressions to extract standard references from text.
*
- * <p>
- * This class helps to find the standard references from text by performing the
- * following steps:
+ * <p>This class helps to find the standard references from text by performing the following steps:
+ *
* <ol>
- * <li>searches for headers;</li>
- * <li>searches for patterns that are supposed to be standard references
- * (basically, every string mostly composed of uppercase letters followed by an
- * alphanumeric characters);</li>
- * <li>each potential standard reference starts with score equal to 0.25;</li>
- * <li>increases by 0.25 the score of references which include the name of a
- * known standard organization ({@link StandardOrganizations});</li>
- * <li>increases by 0.25 the score of references which include the word
- * Publication or Standard;</li>
- * <li>increases by 0.25 the score of references which have been found within
- * "Applicable Documents" and equivalent sections;</li>
- * <li>returns the standard references along with scores.</li>
+ * <li>searches for headers;
+ * <li>searches for patterns that are supposed to be standard references (basically, every string
+ * mostly composed of uppercase letters followed by an alphanumeric characters);
+ * <li>each potential standard reference starts with score equal to 0.25;
+ * <li>increases by 0.25 the score of references which include the name of a known standard
+ * organization ({@link StandardOrganizations});
+ * <li>increases by 0.25 the score of references which include the word Publication or Standard;
+ * <li>increases by 0.25 the score of references which have been found within "Applicable
+ * Documents" and equivalent sections;
+ * <li>returns the standard references along with scores.
* </ol>
- * </p>
*/
public class StandardsText {
// Regular expression to match uppercase headers
private static final String REGEX_HEADER =
- "(\\d{1,10}+\\.(\\d{1,10}+\\.?){0,10}+)\\p{Blank}+([A-Z]{1,64}+(\\s[A-Z]{1,64}+){0," +
- "256}+){5,10}+";
+ "(\\d{1,10}+\\.(\\d{1,10}+\\.?){0,10}+)\\p{Blank}+([A-Z]{1,64}+(\\s[A-Z]{1,64}+){0,"
+ + "256}+){5,10}+";
// Regular expression to match the "APPLICABLE DOCUMENTS" and equivalent
// sections
@@ -63,8 +57,8 @@
// Regular expression to match the alphanumeric identifier of the standard
private static final String REGEX_IDENTIFIER =
- "(?<identifier>([0-9]{3,64}+|([A-Z]{1,64}+(-|_|\\.)?[0-9]{2,64}+))((-|_|\\.)" +
- "?[A-Z0-9]{1,64}+){0,64}+)";
+ "(?<identifier>([0-9]{3,64}+|([A-Z]{1,64}+(-|_|\\.)?[0-9]{2,64}+))((-|_|\\.)"
+ + "?[A-Z0-9]{1,64}+){0,64}+)";
// Regular expression to match the standard organization
private static final String REGEX_ORGANIZATION = StandardOrganizations.getOrganzationsRegex();
@@ -75,10 +69,17 @@
// Regular expression to match a string that is supposed to be a standard
// reference
- private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w{1,64}+)" +
- "\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,64}+\\s)*\\(?" + "(?<secondOrganization>[A-Z" +
- "]\\w{1,64}+)" +
- "\\)?)?" + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER;
+ private static final String REGEX_FALLBACK =
+ "\\(?"
+ + "(?<mainOrganization>[A-Z]\\w{1,64}+)"
+ + "\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,64}+\\s)*\\(?"
+ + "(?<secondOrganization>[A-Z"
+ + "]\\w{1,64}+)"
+ + "\\)?)?"
+ + REGEX_STANDARD_TYPE
+ + "?"
+ + "(-|\\s)?"
+ + REGEX_IDENTIFIER;
// Regular expression to match the standard organization within a string
// that is supposed to be a standard reference
@@ -88,16 +89,15 @@
/**
* Extracts the standard references found within the given text.
*
- * @param text the text from which the standard references are extracted.
- * @param threshold the lower bound limit to be used in order to select only the
- * standard references with score greater than or equal to the
- * threshold. For instance, using a threshold of 0.75 means that
- * only the patterns with score greater than or equal to 0.75
- * will be returned.
+ * @param text the text from which the standard references are extracted.
+ * @param threshold the lower bound limit to be used in order to select only the standard
+ * references with score greater than or equal to the threshold. For instance, using a
+ * threshold of 0.75 means that only the patterns with score greater than or equal to 0.75
+ * will be returned.
* @return the list of standard references extracted from the given text.
*/
- public static ArrayList<StandardReference> extractStandardReferences(String text,
- double threshold) {
+ public static ArrayList<StandardReference> extractStandardReferences(
+ String text, double threshold) {
Map<Integer, String> headers = findHeaders(text);
return findStandards(text, headers, threshold);
@@ -125,16 +125,14 @@
/**
* This method helps to find the standard references within the given text.
*
- * @param text the text from which the standards references are extracted.
- * @param headers the list of headers found within the given text.
- * @param threshold the lower bound limit to be used in order to select only the
- * standard references with score greater than or equal to the
- * threshold.
+ * @param text the text from which the standards references are extracted.
+ * @param headers the list of headers found within the given text.
+ * @param threshold the lower bound limit to be used in order to select only the standard
+ * references with score greater than or equal to the threshold.
* @return the list of standard references extracted from the given text.
*/
- private static ArrayList<StandardReference> findStandards(String text,
- Map<Integer, String> headers,
- double threshold) {
+ private static ArrayList<StandardReference> findStandards(
+ String text, Map<Integer, String> headers, double threshold) {
ArrayList<StandardReference> standards = new ArrayList<>();
double score = 0;
@@ -142,10 +140,12 @@
Matcher matcher = pattern.matcher(text);
while (matcher.find()) {
- StandardReferenceBuilder builder = new StandardReference.StandardReferenceBuilder(
- matcher.group("mainOrganization"), matcher.group("identifier"))
- .setSecondOrganization(matcher.group("separator"),
- matcher.group("secondOrganization"));
+ StandardReferenceBuilder builder =
+ new StandardReference.StandardReferenceBuilder(
+ matcher.group("mainOrganization"), matcher.group("identifier"))
+ .setSecondOrganization(
+ matcher.group("separator"),
+ matcher.group("secondOrganization"));
score = 0.25;
// increases by 0.25 the score of references which include the name of a known
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java b/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java
index c79dd80..2af01d1 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StoppingEarlyException.java
@@ -20,9 +20,8 @@
import org.xml.sax.SAXException;
/**
- * Sentinel exception to stop parsing xml once target is found
- * while SAX parsing. This should be used when the parse
- * can be stopped and the exception ignored.
+ * Sentinel exception to stop parsing xml once target is found while SAX parsing. This should be
+ * used when the parse can be stopped and the exception ignored.
*/
public class StoppingEarlyException extends SAXException {
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java
index fea0b83..0a6f71f 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TaggedContentHandler.java
@@ -20,10 +20,10 @@
import org.xml.sax.SAXException;
/**
- * A content handler decorator that tags potential exceptions so that the
- * handler that caused the exception can easily be identified. This is
- * done by using the {@link TaggedSAXException} class to wrap all thrown
- * {@link SAXException}s. See below for an example of using this class.
+ * A content handler decorator that tags potential exceptions so that the handler that caused the
+ * exception can easily be identified. This is done by using the {@link TaggedSAXException} class to
+ * wrap all thrown {@link SAXException}s. See below for an example of using this class.
+ *
* <pre>
* TaggedContentHandler handler = new TaggedContentHandler(...);
* try {
@@ -39,11 +39,11 @@
* }
* }
* </pre>
- * <p>
- * Alternatively, the {@link #throwIfCauseOf(Exception)} method can be
- * used to let higher levels of code handle the exception caused by this
- * stream while other processing errors are being taken care of at this
- * lower level.
+ *
+ * <p>Alternatively, the {@link #throwIfCauseOf(Exception)} method can be used to let higher levels
+ * of code handle the exception caused by this stream while other processing errors are being taken
+ * care of at this lower level.
+ *
* <pre>
* TaggedContentHandler handler = new TaggedContentHandler(...);
* try {
@@ -71,8 +71,8 @@
* Tests if the given exception was caused by this handler.
*
* @param exception an exception
- * @return <code>true</code> if the exception was thrown by this handler,
- * <code>false</code> otherwise
+ * @return <code>true</code> if the exception was thrown by this handler, <code>false</code>
+ * otherwise
*/
public boolean isCauseOf(SAXException exception) {
if (exception instanceof TaggedSAXException) {
@@ -84,11 +84,10 @@
}
/**
- * Re-throws the original exception thrown by this handler. This method
- * first checks whether the given exception is a {@link TaggedSAXException}
- * wrapper created by this decorator, and then unwraps and throws the
- * original wrapped exception. Returns normally if the exception was
- * not thrown by this handler.
+ * Re-throws the original exception thrown by this handler. This method first checks whether the
+ * given exception is a {@link TaggedSAXException} wrapper created by this decorator, and then
+ * unwraps and throws the original wrapped exception. Returns normally if the exception was not
+ * thrown by this handler.
*
* @param exception an exception
* @throws SAXException original exception, if any, thrown by this handler
@@ -112,5 +111,4 @@
protected void handleException(SAXException e) throws SAXException {
throw new TaggedSAXException(e, this);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java b/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
index 7697cc6..0a6f4ee 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TaggedSAXException.java
@@ -19,22 +19,20 @@
import org.xml.sax.SAXException;
/**
- * A {@link SAXException} wrapper that tags the wrapped exception with
- * a given object reference. Both the tag and the wrapped original exception
- * can be used to determine further processing when this exception is caught.
+ * A {@link SAXException} wrapper that tags the wrapped exception with a given object reference.
+ * Both the tag and the wrapped original exception can be used to determine further processing when
+ * this exception is caught.
*/
public class TaggedSAXException extends SAXException {
- /**
- * The object reference used to tag the exception.
- */
+ /** The object reference used to tag the exception. */
private final Object tag;
/**
* Creates a tagged wrapper for the given exception.
*
* @param original the exception to be tagged
- * @param tag tag object
+ * @param tag tag object
*/
public TaggedSAXException(SAXException original, Object tag) {
super(original.getMessage(), original);
@@ -51,8 +49,8 @@
}
/**
- * Returns the wrapped exception. The only difference to the overridden
- * {@link Throwable#getCause()} method is the narrower return type.
+ * Returns the wrapped exception. The only difference to the overridden {@link
+ * Throwable#getCause()} method is the narrower return type.
*
* @return wrapped exception
*/
@@ -60,5 +58,4 @@
public SAXException getCause() {
return (SAXException) super.getCause();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java
index c54e04f..cc004ae 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TeeContentHandler.java
@@ -23,8 +23,8 @@
import org.xml.sax.helpers.DefaultHandler;
/**
- * Content handler proxy that forwards the received SAX events to zero or
- * more underlying content handlers.
+ * Content handler proxy that forwards the received SAX events to zero or more underlying content
+ * handlers.
*/
public class TeeContentHandler extends DefaultHandler {
@@ -111,5 +111,4 @@
handler.skippedEntity(name);
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java
index ff20829..ffd8b44 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TextAndAttributeContentHandler.java
@@ -29,8 +29,8 @@
this(delegate, false);
}
- public TextAndAttributeContentHandler(ContentHandler delegate,
- boolean addSpaceBetweenElements) {
+ public TextAndAttributeContentHandler(
+ ContentHandler delegate, boolean addSpaceBetweenElements) {
super(delegate, addSpaceBetweenElements);
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
index a510baf..e374501 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/TextContentHandler.java
@@ -22,15 +22,13 @@
import org.xml.sax.helpers.DefaultHandler;
/**
- * Content handler decorator that only passes the
- * {@link #characters(char[], int, int)} and
- * (@link {@link #ignorableWhitespace(char[], int, int)}
- * (plus {@link #startDocument()} and {@link #endDocument()} events to
- * the decorated content handler.
+ * Content handler decorator that only passes the {@link #characters(char[], int, int)} and (@link
+ * {@link #ignorableWhitespace(char[], int, int)} (plus {@link #startDocument()} and {@link
+ * #endDocument()} events to the decorated content handler.
*/
public class TextContentHandler extends DefaultHandler {
- private static final char[] SPACE = new char[]{' '};
+ private static final char[] SPACE = new char[] {' '};
private final ContentHandler delegate;
private final boolean addSpaceBetweenElements;
@@ -81,5 +79,4 @@
public String toString() {
return delegate.toString();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
index 268edb1..e14e57a 100755
--- a/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToHTMLContentHandler.java
@@ -21,21 +21,32 @@
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
-
import org.xml.sax.SAXException;
/**
- * SAX event handler that serializes the HTML document to a character stream.
- * The incoming SAX events are expected to be well-formed (properly nested,
- * etc.) and valid HTML.
+ * SAX event handler that serializes the HTML document to a character stream. The incoming SAX
+ * events are expected to be well-formed (properly nested, etc.) and valid HTML.
*
* @since Apache Tika 0.10
*/
public class ToHTMLContentHandler extends ToXMLContentHandler {
- private static final Set<String> EMPTY_ELEMENTS = new HashSet<>(
- Arrays.asList("area", "base", "basefont", "br", "col", "frame", "hr", "img", "input",
- "isindex", "link", "meta", "param"));
+ private static final Set<String> EMPTY_ELEMENTS =
+ new HashSet<>(
+ Arrays.asList(
+ "area",
+ "base",
+ "basefont",
+ "br",
+ "col",
+ "frame",
+ "hr",
+ "img",
+ "input",
+ "isindex",
+ "link",
+ "meta",
+ "param"));
public ToHTMLContentHandler(OutputStream stream, String encoding)
throws UnsupportedEncodingException {
@@ -47,8 +58,7 @@
}
@Override
- public void startDocument() throws SAXException {
- }
+ public void startDocument() throws SAXException {}
@Override
public void endElement(String uri, String localName, String qName) throws SAXException {
@@ -64,5 +74,4 @@
super.endElement(uri, localName, qName);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
index 868a3bc..69eb419 100755
--- a/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToTextContentHandler.java
@@ -23,19 +23,15 @@
import java.io.UnsupportedEncodingException;
import java.io.Writer;
import java.util.Locale;
-
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
- * SAX event handler that writes all character content out to a character
- * stream. No escaping or other transformations are made on the character
- * content.
- * <p>
- * As of Tika 1.20, this handler ignores content within <script> and
- * <style> tags.
- * </p>
+ * SAX event handler that writes all character content out to a character stream. No escaping or
+ * other transformations are made on the character content.
+ *
+ * <p>As of Tika 1.20, this handler ignores content within <script> and <style> tags.
*
* @since Apache Tika 0.10
*/
@@ -43,16 +39,15 @@
private static final String STYLE = "STYLE";
private static final String SCRIPT = "SCRIPT";
- /**
- * The character stream.
- */
+
+ /** The character stream. */
private final Writer writer;
+
private int styleDepth = 0;
private int scriptDepth = 0;
/**
- * Creates a content handler that writes character events to
- * the given writer.
+ * Creates a content handler that writes character events to the given writer.
*
* @param writer writer
*/
@@ -61,10 +56,10 @@
}
/**
- * Creates a content handler that writes character events to
- * the given output stream using the given encoding.
+ * Creates a content handler that writes character events to the given output stream using the
+ * given encoding.
*
- * @param stream output stream
+ * @param stream output stream
* @param encoding output encoding
* @throws UnsupportedEncodingException if the encoding is unsupported
*/
@@ -74,17 +69,14 @@
}
/**
- * Creates a content handler that writes character events
- * to an internal string buffer. Use the {@link #toString()}
- * method to access the collected character content.
+ * Creates a content handler that writes character events to an internal string buffer. Use the
+ * {@link #toString()} method to access the collected character content.
*/
public ToTextContentHandler() {
this(new StringWriter());
}
- /**
- * Writes the given characters to the given character stream.
- */
+ /** Writes the given characters to the given character stream. */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
@@ -99,11 +91,9 @@
}
}
-
/**
- * Writes the given ignorable characters to the given character stream.
- * The default implementation simply forwards the call to the
- * {@link #characters(char[], int, int)} method.
+ * Writes the given ignorable characters to the given character stream. The default
+ * implementation simply forwards the call to the {@link #characters(char[], int, int)} method.
*/
@Override
public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
@@ -111,8 +101,7 @@
}
/**
- * Flushes the character stream so that no characters are forgotten
- * in internal buffers.
+ * Flushes the character stream so that no characters are forgotten in internal buffers.
*
* @throws SAXException if the stream can not be flushed
* @see <a href="https://issues.apache.org/jira/browse/TIKA-179">TIKA-179</a>
@@ -150,15 +139,12 @@
}
/**
- * Returns the contents of the internal string buffer where
- * all the received characters have been collected. Only works
- * when this object was constructed using the empty default
- * constructor or by passing a {@link StringWriter} to the
- * other constructor.
+ * Returns the contents of the internal string buffer where all the received characters have
+ * been collected. Only works when this object was constructed using the empty default
+ * constructor or by passing a {@link StringWriter} to the other constructor.
*/
@Override
public String toString() {
return writer.toString();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
index 60ab35f..a586ce1 100755
--- a/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/ToXMLContentHandler.java
@@ -21,15 +21,13 @@
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;
-
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
/**
- * SAX event handler that serializes the XML document to a character stream.
- * The incoming SAX events are expected to be well-formed (properly nested,
- * etc.) and to explicitly include namespace declaration attributes and
- * corresponding namespace prefixes in element and attribute names.
+ * SAX event handler that serializes the XML document to a character stream. The incoming SAX events
+ * are expected to be well-formed (properly nested, etc.) and to explicitly include namespace
+ * declaration attributes and corresponding namespace prefixes in element and attribute names.
*
* @since Apache Tika 0.10
*/
@@ -42,10 +40,10 @@
private ElementInfo currentElement;
/**
- * Creates an XML serializer that writes to the given byte stream
- * using the given character encoding.
+ * Creates an XML serializer that writes to the given byte stream using the given character
+ * encoding.
*
- * @param stream output stream
+ * @param stream output stream
* @param encoding output encoding
* @throws UnsupportedEncodingException if the encoding is unsupported
*/
@@ -65,9 +63,7 @@
this.encoding = null;
}
- /**
- * Writes the XML prefix.
- */
+ /** Writes the XML prefix. */
@Override
public void startDocument() throws SAXException {
if (encoding != null) {
@@ -168,7 +164,7 @@
* @throws SAXException if the character could not be written
*/
protected void write(char ch) throws SAXException {
- super.characters(new char[]{ch}, 0, 1);
+ super.characters(new char[] {ch}, 0, 1);
}
/**
@@ -184,12 +180,11 @@
/**
* Writes the given characters as-is followed by the given entity.
*
- * @param ch character array
- * @param from start position in the array
- * @param to end position in the array
+ * @param ch character array
+ * @param from start position in the array
+ * @param to end position in the array
* @param entity entity code
- * @return next position in the array,
- * after the characters plus one entity
+ * @return next position in the array, after the characters plus one entity
* @throws SAXException if the characters could not be written
*/
private int writeCharsAndEntity(char[] ch, int from, int to, String entity)
@@ -204,11 +199,11 @@
/**
* Writes the given characters with XML meta characters escaped.
*
- * @param ch character array
- * @param from start position in the array
- * @param to end position in the array
- * @param attribute whether the characters should be escaped as
- * an attribute value or normal character content
+ * @param ch character array
+ * @param from start position in the array
+ * @param to end position in the array
+ * @param attribute whether the characters should be escaped as an attribute value or normal
+ * character content
* @throws SAXException if the characters could not be written
*/
private void writeEscaped(char[] ch, int from, int to, boolean attribute) throws SAXException {
@@ -265,7 +260,5 @@
return localName;
}
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java b/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java
index d82895a..bc37138 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/WriteLimiter.java
@@ -18,5 +18,6 @@
public interface WriteLimiter {
int getWriteLimit();
+
boolean isThrowOnWriteLimitReached();
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
index 22d69f0..8915493 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/WriteOutContentHandler.java
@@ -18,30 +18,24 @@
import java.io.StringWriter;
import java.io.Writer;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.ParseRecord;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * SAX event handler that writes content up to an optional write
- * limit out to a character stream or other decorated handler.
+ * SAX event handler that writes content up to an optional write limit out to a character stream or
+ * other decorated handler.
*/
public class WriteOutContentHandler extends ContentHandlerDecorator {
-
/**
- * The maximum number of characters to write to the character stream.
- * Set to -1 for no limit.
+ * The maximum number of characters to write to the character stream. Set to -1 for no limit.
*/
private final int writeLimit;
- /**
- * Number of characters written so far.
- */
+ /** Number of characters written so far. */
private int writeCount = 0;
private boolean throwOnWriteLimitReached = true;
@@ -51,10 +45,10 @@
private boolean writeLimitReached;
/**
- * Creates a content handler that writes content up to the given
- * write limit to the given content handler.
+ * Creates a content handler that writes content up to the given write limit to the given
+ * content handler.
*
- * @param handler content handler to be decorated
+ * @param handler content handler to be decorated
* @param writeLimit write limit
* @since Apache Tika 0.10
*/
@@ -64,10 +58,10 @@
}
/**
- * Creates a content handler that writes content up to the given
- * write limit to the given character stream.
+ * Creates a content handler that writes content up to the given write limit to the given
+ * character stream.
*
- * @param writer character stream
+ * @param writer character stream
* @param writeLimit write limit
* @since Apache Tika 0.10
*/
@@ -76,8 +70,7 @@
}
/**
- * Creates a content handler that writes character events to
- * the given writer.
+ * Creates a content handler that writes character events to the given writer.
*
* @param writer writer
*/
@@ -86,17 +79,16 @@
}
/**
- * Creates a content handler that writes character events
- * to an internal string buffer. Use the {@link #toString()}
- * method to access the collected character content.
- * <p>
- * The internal string buffer is bounded at the given number of characters.
- * If this write limit is reached, then a {@link SAXException} is thrown.
- * The {@link WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to
- * detect this case.
+ * Creates a content handler that writes character events to an internal string buffer. Use the
+ * {@link #toString()} method to access the collected character content.
*
- * @param writeLimit maximum number of characters to include in the string,
- * or -1 to disable the write limit
+ * <p>The internal string buffer is bounded at the given number of characters. If this write
+ * limit is reached, then a {@link SAXException} is thrown. The {@link
+ * WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to detect this
+ * case.
+ *
+ * @param writeLimit maximum number of characters to include in the string, or -1 to disable the
+ * write limit
* @since Apache Tika 0.7
*/
public WriteOutContentHandler(int writeLimit) {
@@ -104,14 +96,13 @@
}
/**
- * Creates a content handler that writes character events
- * to an internal string buffer. Use the {@link #toString()}
- * method to access the collected character content.
- * <p>
- * The internal string buffer is bounded at 100k characters. If this
- * write limit is reached, then a {@link SAXException} is thrown. The
- * {@link WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to
- * detect this case.
+ * Creates a content handler that writes character events to an internal string buffer. Use the
+ * {@link #toString()} method to access the collected character content.
+ *
+ * <p>The internal string buffer is bounded at 100k characters. If this write limit is reached,
+ * then a {@link SAXException} is thrown. The {@link
+ * WriteLimitReachedException#isWriteLimitReached(Throwable)} method can be used to detect this
+ * case.
*/
public WriteOutContentHandler() {
this(100 * 1000);
@@ -119,23 +110,24 @@
/**
* The default is to throw a {@link WriteLimitReachedException}
+ *
* @param handler
* @param writeLimit
* @param throwOnWriteLimitReached
* @param parseContext
*/
- public WriteOutContentHandler(ContentHandler handler,
- int writeLimit, boolean throwOnWriteLimitReached,
- ParseContext parseContext) {
+ public WriteOutContentHandler(
+ ContentHandler handler,
+ int writeLimit,
+ boolean throwOnWriteLimitReached,
+ ParseContext parseContext) {
super(handler);
this.writeLimit = writeLimit;
this.throwOnWriteLimitReached = throwOnWriteLimitReached;
this.parseContext = parseContext;
}
- /**
- * Writes the given characters to the given character stream.
- */
+ /** Writes the given characters to the given character stream. */
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (writeLimitReached) {
@@ -176,5 +168,4 @@
}
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
index 6ba4232..a7d4454 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
@@ -20,71 +20,89 @@
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
-
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-
/**
- * Content handler decorator that simplifies the task of producing XHTML
- * events for Tika content parsers.
+ * Content handler decorator that simplifies the task of producing XHTML events for Tika content
+ * parsers.
*/
public class XHTMLContentHandler extends SafeContentHandler {
- /**
- * The XHTML namespace URI
- */
+ /** The XHTML namespace URI */
public static final String XHTML = "http://www.w3.org/1999/xhtml";
- /**
- * The elements that get appended with the {@link #NL} character.
- */
+
+ /** The elements that get appended with the {@link #NL} character. */
public static final Set<String> ENDLINE =
- unmodifiableSet("p", "h1", "h2", "h3", "h4", "h5", "h6", "div", "ul", "ol", "dl", "pre",
- "hr", "blockquote", "address", "fieldset", "table", "form", "noscript", "li",
- "dt", "dd", "noframes", "br", "tr", "select", "option", "link", "script");
- /**
- * The newline character that gets inserted after block elements.
- */
- private static final char[] NL = new char[]{'\n'};
- /**
- * The tab character gets inserted before table cells and list items.
- */
- private static final char[] TAB = new char[]{'\t'};
- /**
- * The elements that are in the <head> section.
- */
+ unmodifiableSet(
+ "p",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "div",
+ "ul",
+ "ol",
+ "dl",
+ "pre",
+ "hr",
+ "blockquote",
+ "address",
+ "fieldset",
+ "table",
+ "form",
+ "noscript",
+ "li",
+ "dt",
+ "dd",
+ "noframes",
+ "br",
+ "tr",
+ "select",
+ "option",
+ "link",
+ "script");
+
+ /** The newline character that gets inserted after block elements. */
+ private static final char[] NL = new char[] {'\n'};
+
+ /** The tab character gets inserted before table cells and list items. */
+ private static final char[] TAB = new char[] {'\t'};
+
+ /** The elements that are in the <head> section. */
private static final Set<String> HEAD =
unmodifiableSet("title", "link", "base", "meta", "script");
+
/**
- * The elements that are automatically emitted by lazyStartHead, so
- * skip them if they get sent to startElement/endElement by mistake.
+ * The elements that are automatically emitted by lazyStartHead, so skip them if they get sent
+ * to startElement/endElement by mistake.
*/
private static final Set<String> AUTO = unmodifiableSet("head", "frameset");
- /**
- * The elements that get prepended with the {@link #TAB} character.
- */
+
+ /** The elements that get prepended with the {@link #TAB} character. */
private static final Set<String> INDENT =
unmodifiableSet("li", "dd", "dt", "td", "th", "frame");
+
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
- /**
- * Metadata associated with the document. Used to fill in the
- * <head/> section.
- */
+
+ /** Metadata associated with the document. Used to fill in the <head/> section. */
private final Metadata metadata;
- /**
- * Flag to indicate whether the document has been started.
- */
+
+ /** Flag to indicate whether the document has been started. */
private boolean documentStarted = false;
- /**
- * Flags to indicate whether the document head element has been started/ended.
- */
+
+ /** Flags to indicate whether the document head element has been started/ended. */
private boolean headStarted = false;
+
private boolean headEnded = false;
private boolean useFrameset = false;
+
public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
super(handler);
this.metadata = metadata;
@@ -95,10 +113,8 @@
}
/**
- * Starts an XHTML document by setting up the namespace mappings
- * when called for the first time.
- * The standard XHTML prefix is generated lazily when the first
- * element is started.
+ * Starts an XHTML document by setting up the namespace mappings when called for the first time.
+ * The standard XHTML prefix is generated lazily when the first element is started.
*/
@Override
public void startDocument() throws SAXException {
@@ -111,6 +127,7 @@
/**
* Generates the following XHTML prefix when called for the first time:
+ *
* <pre>
* <html>
* <head>
@@ -139,6 +156,7 @@
/**
* Generates the following XHTML prefix when called for the first time:
+ *
* <pre>
* <html>
* <head>
@@ -199,8 +217,8 @@
}
/**
- * Ends the XHTML document by writing the following footer and
- * clearing the namespace mappings:
+ * Ends the XHTML document by writing the following footer and clearing the namespace mappings:
+ *
* <pre>
* </body>
* </html>
@@ -223,8 +241,8 @@
}
/**
- * Starts the given element. Table cells and list items are automatically
- * indented by emitting a tab character as ignorable whitespace.
+ * Starts the given element. Table cells and list items are automatically indented by emitting a
+ * tab character as ignorable whitespace.
*/
@Override
public void startElement(String uri, String local, String name, Attributes attributes)
@@ -247,10 +265,7 @@
}
}
- /**
- * Ends the given element. Block elements are automatically followed
- * by a newline character.
- */
+ /** Ends the given element. Block elements are automatically followed by a newline character. */
@Override
public void endElement(String uri, String local, String name) throws SAXException {
if (!AUTO.contains(name)) {
@@ -270,7 +285,7 @@
super.characters(ch, start, length);
}
- //------------------------------------------< public convenience methods >
+ // ------------------------------------------< public convenience methods >
public void startElement(String name) throws SAXException {
startElement(XHTML, name, name, EMPTY_ATTRIBUTES);
@@ -301,10 +316,10 @@
}
/**
- * Emits an XHTML element with the given text content. If the given
- * text value is null or empty, then the element is not written.
+ * Emits an XHTML element with the given text content. If the given text value is null or empty,
+ * then the element is not written.
*
- * @param name XHTML element name
+ * @param name XHTML element name
* @param value element value, possibly <code>null</code>
* @throws SAXException if the content element could not be written
*/
@@ -324,5 +339,4 @@
// These control chars are invalid in XHTML.
return 0x7F <= ch && ch <= 0x9F;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
index 953ad6a..ec0a128 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XMPContentHandler.java
@@ -16,14 +16,13 @@
*/
package org.apache.tika.sax;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-
/**
* Content handler decorator that simplifies the task of producing XMP output.
*
@@ -31,14 +30,10 @@
*/
public class XMPContentHandler extends SafeContentHandler {
- /**
- * The RDF namespace URI
- */
+ /** The RDF namespace URI */
public static final String RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
- /**
- * The XMP namespace URI
- */
+ /** The XMP namespace URI */
public static final String XMP = "http://ns.adobe.com/xap/1.0/";
private static final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
@@ -49,11 +44,12 @@
super(handler);
}
- //------------------------------------------< public convenience methods >
+ // ------------------------------------------< public convenience methods >
/**
- * Starts an XMP document by setting up the namespace mappings and
- * writing out the following header:
+ * Starts an XMP document by setting up the namespace mappings and writing out the following
+ * header:
+ *
* <pre>
* <rdf:RDF>
* </pre>
@@ -69,8 +65,8 @@
}
/**
- * Ends the XMP document by writing the following footer and
- * clearing the namespace mappings:
+ * Ends the XMP document by writing the following footer and clearing the namespace mappings:
+ *
* <pre>
* </rdf:RDF>
* </pre>
@@ -143,5 +139,4 @@
endDescription();
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/package-info.java b/tika-core/src/main/java/org/apache/tika/sax/package-info.java
index 3c0b4ba..dde70a8 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * SAX utilities.
- */
+/** SAX utilities. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.sax;
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java
index 7b1693d..66ef2ec 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/AttributeMatcher.java
@@ -17,8 +17,8 @@
package org.apache.tika.sax.xpath;
/**
- * Final evaluation state of a <code>.../@*</code> XPath expression.
- * Matches all attributes of the current element.
+ * Final evaluation state of a <code>.../@*</code> XPath expression. Matches all attributes of the
+ * current element.
*/
public class AttributeMatcher extends Matcher {
@@ -27,5 +27,4 @@
public boolean matchesAttribute(String namespace, String name) {
return true;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java
index b95983c..6cc9068 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/ChildMatcher.java
@@ -17,8 +17,8 @@
package org.apache.tika.sax.xpath;
/**
- * Intermediate evaluation state of a <code>.../*...</code> XPath expression.
- * Matches nothing, but specifies the evaluation state for all child elements.
+ * Intermediate evaluation state of a <code>.../*...</code> XPath expression. Matches nothing, but
+ * specifies the evaluation state for all child elements.
*/
public class ChildMatcher extends Matcher {
@@ -31,5 +31,4 @@
public Matcher descend(String namespace, String name) {
return then;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java
index b0ef511..7b96a16 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/CompositeMatcher.java
@@ -17,8 +17,8 @@
package org.apache.tika.sax.xpath;
/**
- * Composite XPath evaluation state. Used when XPath evaluation results
- * in two or more branches of independent evaluation states.
+ * Composite XPath evaluation state. Used when XPath evaluation results in two or more branches of
+ * independent evaluation states.
*/
public class CompositeMatcher extends Matcher {
@@ -56,5 +56,4 @@
public boolean matchesText() {
return a.matchesText() || b.matchesText();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java
index 164e08a..ee6f5bc 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/ElementMatcher.java
@@ -17,8 +17,8 @@
package org.apache.tika.sax.xpath;
/**
- * Final evaluation state of an XPath expression that targets an element.
- * Matches the current element.
+ * Final evaluation state of an XPath expression that targets an element. Matches the current
+ * element.
*/
public class ElementMatcher extends Matcher {
@@ -27,5 +27,4 @@
public boolean matchesElement() {
return true;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java
index ab9d21c..a24a6a6 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/Matcher.java
@@ -16,25 +16,21 @@
*/
package org.apache.tika.sax.xpath;
-/**
- * XPath element matcher. A matcher instance encapsulates a specific
- * state in XPath evaluation.
- */
+/** XPath element matcher. A matcher instance encapsulates a specific state in XPath evaluation. */
public class Matcher {
/**
- * State of a failed XPath evaluation, where nothing is matched.
- * This matcher instance is used as a sentinel object whenever an
- * XPath evaluation branch fails.
+ * State of a failed XPath evaluation, where nothing is matched. This matcher instance is used
+ * as a sentinel object whenever an XPath evaluation branch fails.
*/
public static final Matcher FAIL = new Matcher();
/**
- * Returns the XPath evaluation state that results from descending
- * to a child element with the given name.
+ * Returns the XPath evaluation state that results from descending to a child element with the
+ * given name.
*
* @param namespace element namespace or <code>null</code>
- * @param name element name
+ * @param name element name
* @return next XPath evaluation state
*/
public Matcher descend(String namespace, String name) {
@@ -42,8 +38,8 @@
}
/**
- * Returns <code>true</code> if the XPath expression matches
- * the element associated with this evaluation state.
+ * Returns <code>true</code> if the XPath expression matches the element associated with this
+ * evaluation state.
*
* @return XPath evaluation state for this element
*/
@@ -52,11 +48,11 @@
}
/**
- * Returns <code>true</code> if the XPath expression matches the named
- * attribute of the element associated with this evaluation state.
+ * Returns <code>true</code> if the XPath expression matches the named attribute of the element
+ * associated with this evaluation state.
*
* @param namespace attribute namespace or <code>null</code>
- * @param name attribute name
+ * @param name attribute name
* @return XPath evaluation state for named attribute of this element
*/
public boolean matchesAttribute(String namespace, String name) {
@@ -64,14 +60,12 @@
}
/**
- * Returns <code>true</code> if the XPath expression matches all text
- * nodes whose parent is the element associated with this evaluation
- * state.
+ * Returns <code>true</code> if the XPath expression matches all text nodes whose parent is the
+ * element associated with this evaluation state.
*
* @return XPath evaluation state for text children of this element
*/
public boolean matchesText() {
return false;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
index 9f96186..d4e5250 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/MatchingContentHandler.java
@@ -17,17 +17,15 @@
package org.apache.tika.sax.xpath;
import java.util.LinkedList;
-
+import org.apache.tika.sax.ContentHandlerDecorator;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-import org.apache.tika.sax.ContentHandlerDecorator;
-
/**
- * Content handler decorator that only passes the elements, attributes,
- * and text nodes that match the given XPath expression.
+ * Content handler decorator that only passes the elements, attributes, and text nodes that match
+ * the given XPath expression.
*/
public class MatchingContentHandler extends ContentHandlerDecorator {
@@ -50,8 +48,12 @@
String attributeURI = attributes.getURI(i);
String attributeName = attributes.getLocalName(i);
if (matcher.matchesAttribute(attributeURI, attributeName)) {
- matches.addAttribute(attributeURI, attributeName, attributes.getQName(i),
- attributes.getType(i), attributes.getValue(i));
+ matches.addAttribute(
+ attributeURI,
+ attributeName,
+ attributes.getQName(i),
+ attributes.getType(i),
+ attributes.getValue(i));
}
}
@@ -98,5 +100,4 @@
super.skippedEntity(name);
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java
index 46b65a4..0ac8567 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedAttributeMatcher.java
@@ -19,8 +19,8 @@
import java.util.Objects;
/**
- * Final evaluation state of a <code>.../@name</code> XPath expression.
- * Matches the named attributes of the current element.
+ * Final evaluation state of a <code>.../@name</code> XPath expression. Matches the named attributes
+ * of the current element.
*/
public class NamedAttributeMatcher extends Matcher {
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java
index e304789..085935e 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/NamedElementMatcher.java
@@ -19,9 +19,8 @@
import java.util.Objects;
/**
- * Intermediate evaluation state of a <code>.../name...</code> XPath
- * expression. Matches nothing, but specifies the evaluation state
- * for the child elements with the given name.
+ * Intermediate evaluation state of a <code>.../name...</code> XPath expression. Matches nothing,
+ * but specifies the evaluation state for the child elements with the given name.
*/
public class NamedElementMatcher extends ChildMatcher {
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java
index 8c2e45c..f20a1f2 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/NodeMatcher.java
@@ -17,8 +17,8 @@
package org.apache.tika.sax.xpath;
/**
- * Final evaluation state of a <code>.../node()</code> XPath expression.
- * Matches all elements, attributes, and text.
+ * Final evaluation state of a <code>.../node()</code> XPath expression. Matches all elements,
+ * attributes, and text.
*/
public class NodeMatcher extends Matcher {
@@ -38,5 +38,4 @@
public boolean matchesText() {
return true;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java
index 1915dfc..11f7578 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/SubtreeMatcher.java
@@ -17,8 +17,8 @@
package org.apache.tika.sax.xpath;
/**
- * Evaluation state of a <code>...//...</code> XPath expression. Applies the
- * contained evaluation state to the current element and all its descendants.
+ * Evaluation state of a <code>...//...</code> XPath expression. Applies the contained evaluation
+ * state to the current element and all its descendants.
*/
public class SubtreeMatcher extends Matcher {
@@ -52,5 +52,4 @@
public boolean matchesText() {
return then.matchesText();
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java
index caf82f4..efe46ab 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/TextMatcher.java
@@ -17,8 +17,8 @@
package org.apache.tika.sax.xpath;
/**
- * Final evaluation state of a <code>.../text()</code> XPath expression.
- * Matches all text children of the current element.
+ * Final evaluation state of a <code>.../text()</code> XPath expression. Matches all text children
+ * of the current element.
*/
public class TextMatcher extends Matcher {
@@ -27,5 +27,4 @@
public boolean matchesText() {
return true;
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java
index ffa4ccd..d04ba4c 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/XPathParser.java
@@ -20,29 +20,29 @@
import java.util.Map;
/**
- * Parser for a very simple XPath subset. Only the following XPath constructs
- * (with namespaces) are supported:
+ * Parser for a very simple XPath subset. Only the following XPath constructs (with namespaces) are
+ * supported:
+ *
* <ul>
- * <li><code>.../node()</code></li>
- * <li><code>.../text()</code></li>
- * <li><code>.../@*</code></li>
- * <li><code>.../@name</code></li>
- * <li><code>.../*...</code></li>
- * <li><code>.../name...</code></li>
- * <li><code>...//*...</code></li>
- * <li><code>...//name...</code></li>
+ * <li><code>.../node()</code>
+ * <li><code>.../text()</code>
+ * <li><code>.../@*</code>
+ * <li><code>.../@name</code>
+ * <li><code>.../*...</code>
+ * <li><code>.../name...</code>
+ * <li><code>...//*...</code>
+ * <li><code>...//name...</code>
* </ul>
- * <p>
- * In addition the non-abbreviated <code>.../descendant::node()</code>
- * construct can be used for cases where the descendant-or-self axis
- * used by the <code>...//node()</code> construct is not appropriate.
+ *
+ * <p>In addition the non-abbreviated <code>.../descendant::node()</code> construct can be used for
+ * cases where the descendant-or-self axis used by the <code>...//node()</code> construct is not
+ * appropriate.
*/
public class XPathParser {
private final Map<String, String> prefixes = new HashMap<>();
- public XPathParser() {
- }
+ public XPathParser() {}
public XPathParser(String prefix, String namespace) {
addPrefix(prefix, namespace);
@@ -53,9 +53,9 @@
}
/**
- * Parses the given simple XPath expression to an evaluation state
- * initialized at the document node. Invalid expressions are not flagged
- * as errors, they just result in a failing evaluation state.
+ * Parses the given simple XPath expression to an evaluation state initialized at the document
+ * node. Invalid expressions are not flagged as errors, they just result in a failing evaluation
+ * state.
*
* @param xpath simple XPath expression
* @return XPath evaluation state
@@ -65,9 +65,10 @@
return TextMatcher.INSTANCE;
} else if (xpath.equals("/node()")) {
return NodeMatcher.INSTANCE;
- } else if (xpath.equals("/descendant::node()") ||
- xpath.equals("/descendant:node()")) { // for compatibility
- return new CompositeMatcher(TextMatcher.INSTANCE,
+ } else if (xpath.equals("/descendant::node()")
+ || xpath.equals("/descendant:node()")) { // for compatibility
+ return new CompositeMatcher(
+ TextMatcher.INSTANCE,
new ChildMatcher(new SubtreeMatcher(NodeMatcher.INSTANCE)));
} else if (xpath.equals("/@*")) {
return AttributeMatcher.INSTANCE;
@@ -105,8 +106,8 @@
name = name.substring(colon + 1);
}
if (prefixes.containsKey(prefix)) {
- return new NamedElementMatcher(prefixes.get(prefix), name,
- parse(xpath.substring(slash)));
+ return new NamedElementMatcher(
+ prefixes.get(prefix), name, parse(xpath.substring(slash)));
} else {
return Matcher.FAIL;
}
@@ -114,5 +115,4 @@
return Matcher.FAIL;
}
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java b/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java
index f9c1801..2104b97 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/xpath/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * XPath utilities
- */
+/** XPath utilities */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.sax.xpath;
diff --git a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
index b3b8264..e2fcc50 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/AnnotationUtils.java
@@ -25,15 +25,13 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Param;
import org.apache.tika.config.ParamField;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaConfigException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
/**
* This class contains utilities for dealing with tika annotations
@@ -43,25 +41,23 @@
public class AnnotationUtils {
private static final Logger LOG = LoggerFactory.getLogger(AnnotationUtils.class);
- /**
- * Cache for annotations for Bean classes which have {@link Field}
- */
+ /** Cache for annotations for Bean classes which have {@link Field} */
private static final Map<Class<?>, List<ParamField>> PARAM_INFO = new HashMap<>();
/**
* Collects all the fields and methods for an annotation
*
- * @param clazz bean class with annotations
+ * @param clazz bean class with annotations
* @param annotation annotation class
* @return list of accessible objects such as fields and methods
*/
- private static List<AccessibleObject> collectInfo(Class<?> clazz,
- Class<? extends Annotation> annotation) {
+ private static List<AccessibleObject> collectInfo(
+ Class<?> clazz, Class<? extends Annotation> annotation) {
Class superClazz = clazz;
List<AccessibleObject> members = new ArrayList<>();
List<AccessibleObject> annotatedMembers = new ArrayList<>();
- //walk through the inheritance chain
+ // walk through the inheritance chain
while (superClazz != null && superClazz != Object.class) {
members.addAll(Arrays.asList(superClazz.getDeclaredFields()));
members.addAll(Arrays.asList(superClazz.getDeclaredMethods()));
@@ -109,26 +105,39 @@
try {
field.assignValue(bean, param.getValue());
} catch (InvocationTargetException e) {
- LOG.error("Error assigning value '{}' to '{}'", param.getValue(), param.getName());
+ LOG.error(
+ "Error assigning value '{}' to '{}'",
+ param.getValue(),
+ param.getName());
final Throwable cause = e.getCause() == null ? e : e.getCause();
throw new TikaConfigException(cause.getMessage(), cause);
} catch (IllegalAccessException e) {
- LOG.error("Error assigning value '{}' to '{}'", param.getValue(), param.getName());
+ LOG.error(
+ "Error assigning value '{}' to '{}'",
+ param.getValue(),
+ param.getName());
throw new TikaConfigException(e.getMessage(), e);
}
} else {
- String msg = String.format(Locale.ROOT,
- "Value '%s' of type '%s' can't be" +
- " assigned to field '%s' of defined type '%s'",
- param.getValue(),
- param.getValue().getClass(), field.getName(), field.getType());
+ String msg =
+ String.format(
+ Locale.ROOT,
+ "Value '%s' of type '%s' can't be"
+ + " assigned to field '%s' of defined type '%s'",
+ param.getValue(),
+ param.getValue().getClass(),
+ field.getName(),
+ field.getType());
throw new TikaConfigException(msg);
}
} else if (field.isRequired()) {
- //param not supplied but field is declared as required?
- String msg = String.format(Locale.ROOT,
- "Param %s is required for %s," + " but it is not given in config.",
- field.getName(), bean.getClass().getName());
+ // param not supplied but field is declared as required?
+ String msg =
+ String.format(
+ Locale.ROOT,
+ "Param %s is required for %s," + " but it is not given in config.",
+ field.getName(),
+ bean.getClass().getName());
throw new TikaConfigException(msg);
} else {
LOG.debug("Param not supplied, field is not mandatory");
diff --git a/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java b/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
index 5177752..5f94141 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/CharsetUtils.java
@@ -44,12 +44,39 @@
private static Method isSupportedICU = null;
static {
- initCommonCharsets("Big5", "EUC-JP", "EUC-KR", "x-EUC-TW", "GB18030", "IBM855", "IBM866",
- "ISO-2022-CN", "ISO-2022-JP", "ISO-2022-KR", "ISO-8859-1", "ISO-8859-2",
- "ISO-8859-3", "ISO-8859-4", "ISO-8859-5", "ISO-8859-6", "ISO-8859-7", "ISO-8859-8",
- "ISO-8859-9", "ISO-8859-11", "ISO-8859-13", "ISO-8859-15", "KOI8-R",
- "x-MacCyrillic", "SHIFT_JIS", "UTF-8", "UTF-16BE", "UTF-16LE", "windows-1251",
- "windows-1252", "windows-1253", "windows-1255");
+ initCommonCharsets(
+ "Big5",
+ "EUC-JP",
+ "EUC-KR",
+ "x-EUC-TW",
+ "GB18030",
+ "IBM855",
+ "IBM866",
+ "ISO-2022-CN",
+ "ISO-2022-JP",
+ "ISO-2022-KR",
+ "ISO-8859-1",
+ "ISO-8859-2",
+ "ISO-8859-3",
+ "ISO-8859-4",
+ "ISO-8859-5",
+ "ISO-8859-6",
+ "ISO-8859-7",
+ "ISO-8859-8",
+ "ISO-8859-9",
+ "ISO-8859-11",
+ "ISO-8859-13",
+ "ISO-8859-15",
+ "KOI8-R",
+ "x-MacCyrillic",
+ "SHIFT_JIS",
+ "UTF-8",
+ "UTF-16BE",
+ "UTF-16LE",
+ "windows-1251",
+ "windows-1252",
+ "windows-1253",
+ "windows-1255");
// Common aliases/typos not included in standard charset definitions
COMMON_CHARSETS.put("iso-8851-1", COMMON_CHARSETS.get("iso-8859-1"));
@@ -62,7 +89,7 @@
icuCharset =
CharsetUtils.class.getClassLoader().loadClass("com.ibm.icu.charset.CharsetICU");
} catch (ClassNotFoundException e) {
- //swallow
+ // swallow
}
if (icuCharset != null) {
try {
@@ -73,7 +100,7 @@
try {
isSupportedICU = icuCharset.getMethod("isSupported", String.class);
} catch (Throwable t) {
- //swallow
+ // swallow
}
// TODO: would be nice to somehow log that we
// successfully found ICU
@@ -120,8 +147,8 @@
}
/**
- * Handle various common charset name errors, and return something
- * that will be considered valid (and is normalized)
+ * Handle various common charset name errors, and return something that will be considered valid
+ * (and is normalized)
*
* @param charsetName name of charset to process
* @return potentially remapped/cleaned up version of charset name
@@ -135,10 +162,9 @@
}
/**
- * Returns Charset impl, if one exists. This method
- * optionally uses ICU4J's CharsetICU.forNameICU,
- * if it is found on the classpath, else only uses
- * JDK's builtin Charset.forName.
+ * Returns Charset impl, if one exists. This method optionally uses ICU4J's
+ * CharsetICU.forNameICU, if it is found on the classpath, else only uses JDK's builtin
+ * Charset.forName.
*/
public static Charset forName(String name) {
if (name == null) {
@@ -186,9 +212,10 @@
if (cs != null) {
return cs;
}
- } catch (IllegalArgumentException | IllegalAccessException |
- InvocationTargetException e) {
- //ignore
+ } catch (IllegalArgumentException
+ | IllegalAccessException
+ | InvocationTargetException e) {
+ // ignore
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java b/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java
index a4da777..514e389 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/CompareUtils.java
@@ -19,10 +19,9 @@
public class CompareUtils {
/**
- * Compare two classes by class names.
- * If both classes are Tika's or both are not Tika's class, compare by name String.
- * Otherwise one of these two class is Tika's class.
- * Then the non-Tika's class comes before Tika's class.
+ * Compare two classes by class names. If both classes are Tika's or both are not Tika's class,
+ * compare by name String. Otherwise one of these two class is Tika's class. Then the non-Tika's
+ * class comes before Tika's class.
*
* @param o1 the object 1 to be compared
* @param o2 the object 2 to be compared
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
index 8720e74..a26fef0 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ConcurrentUtils.java
@@ -19,7 +19,6 @@
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
import java.util.concurrent.FutureTask;
-
import org.apache.tika.parser.ParseContext;
/**
@@ -30,8 +29,8 @@
public class ConcurrentUtils {
/**
- * Execute a runnable using an ExecutorService from the ParseContext if possible.
- * Otherwise fallback to individual threads.
+ * Execute a runnable using an ExecutorService from the ParseContext if possible. Otherwise
+ * fallback to individual threads.
*
* @param context
* @param runnable
diff --git a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
index a6a68fe..ae937e5 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/DateUtils.java
@@ -28,33 +28,28 @@
import java.util.Locale;
import java.util.TimeZone;
-/**
- * Date related utility methods and constants
- */
+/** Date related utility methods and constants */
public class DateUtils {
/**
- * The UTC time zone. Not sure if {@link TimeZone#getTimeZone(String)}
- * understands "UTC" in all environments, but it'll fall back to GMT
- * in such cases, which is in practice equivalent to UTC.
+ * The UTC time zone. Not sure if {@link TimeZone#getTimeZone(String)} understands "UTC" in all
+ * environments, but it'll fall back to GMT in such cases, which is in practice equivalent to
+ * UTC.
*/
public static final TimeZone UTC = TimeZone.getTimeZone("UTC");
/**
- * Custom time zone used to interpret date values without a time
- * component in a way that most likely falls within the same day
- * regardless of in which time zone it is later interpreted. For
- * example, the "2012-02-17" date would map to "2012-02-17T12:00:00Z"
- * (instead of the default "2012-02-17T00:00:00Z"), which would still
- * map to "2012-02-17" if interpreted in say Pacific time (while the
- * default mapping would result in "2012-02-16" for UTC-8).
+ * Custom time zone used to interpret date values without a time component in a way that most
+ * likely falls within the same day regardless of in which time zone it is later interpreted.
+ * For example, the "2012-02-17" date would map to "2012-02-17T12:00:00Z" (instead of the
+ * default "2012-02-17T00:00:00Z"), which would still map to "2012-02-17" if interpreted in say
+ * Pacific time (while the default mapping would result in "2012-02-16" for UTC-8).
*/
public static final TimeZone MIDDAY = TimeZone.getTimeZone("GMT-12:00");
+
/**
- * So we can return Date objects for these, this is the
- * list (in preference order) of the various ISO-8601
- * variants that we try when processing a date based
- * property.
+ * So we can return Date objects for these, this is the list (in preference order) of the
+ * various ISO-8601 variants that we try when processing a date based property.
*/
private final List<DateFormat> iso8601InputFormats = loadDateFormats();
@@ -67,8 +62,8 @@
}
/**
- * Returns a ISO 8601 representation of the given date in UTC,
- * truncated to the seconds unit. This method is thread safe and non-blocking.
+ * Returns a ISO 8601 representation of the given date in UTC, truncated to the seconds unit.
+ * This method is thread safe and non-blocking.
*
* @param date given date
* @return ISO 8601 date string in UTC, truncated to the seconds unit
@@ -81,8 +76,8 @@
}
/**
- * Returns a ISO 8601 representation of the given date in UTC,
- * truncated to the seconds unit. This method is thread safe and non-blocking.
+ * Returns a ISO 8601 representation of the given date in UTC, truncated to the seconds unit.
+ * This method is thread safe and non-blocking.
*
* @param date given Calendar
* @return ISO 8601 date string in UTC, truncated to the seconds unit
@@ -91,15 +86,15 @@
public static String formatDate(Calendar date) {
return doFormatDate(date);
}
+
/**
- * Returns a ISO 8601 representation of the given date in UTC,
- * truncated to the seconds unit. This method is thread safe and non-blocking.
+ * Returns a ISO 8601 representation of the given date in UTC, truncated to the seconds unit.
+ * This method is thread safe and non-blocking.
*
* @param date given date
* @return ISO 8601 date string in UTC, truncated to the seconds unit
* @see <a href="https://issues.apache.org/jira/browse/TIKA-495">TIKA-495</a>
*/
-
public static String formatDateUnknownTimezone(Date date) {
// Create the Calendar object in the system timezone
Calendar calendar = GregorianCalendar.getInstance(TimeZone.getDefault(), Locale.US);
@@ -110,9 +105,9 @@
return formatted.substring(0, formatted.length() - 1);
}
-
/**
* Returns ISO-8601 formatted time converted to UTC, truncated to the seconds place
+ *
* @param calendar
* @return
*/
@@ -123,26 +118,25 @@
private List<DateFormat> loadDateFormats() {
List<DateFormat> dateFormats = new ArrayList<>();
// yyyy-mm-ddThh...
- dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC)); // UTC/Zulu
- dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null)); // With timezone
- dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null)); // Without timezone
+ dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'", UTC)); // UTC/Zulu
+ dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", null)); // With timezone
+ dateFormats.add(createDateFormat("yyyy-MM-dd'T'HH:mm:ss", null)); // Without timezone
// yyyy-mm-dd hh...
- dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC)); // UTC/Zulu
- dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null)); // With timezone
- dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null)); // Without timezone
+ dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss'Z'", UTC)); // UTC/Zulu
+ dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ssZ", null)); // With timezone
+ dateFormats.add(createDateFormat("yyyy-MM-dd' 'HH:mm:ss", null)); // Without timezone
// Date without time, set to Midday UTC
- dateFormats.add(createDateFormat("yyyy-MM-dd", MIDDAY)); // Normal date format
- dateFormats.add(createDateFormat("yyyy:MM:dd",
- MIDDAY)); // Image (IPTC/EXIF) format
+ dateFormats.add(createDateFormat("yyyy-MM-dd", MIDDAY)); // Normal date format
+ dateFormats.add(createDateFormat("yyyy:MM:dd", MIDDAY)); // Image (IPTC/EXIF) format
return dateFormats;
}
/**
* Tries to parse the date string; returns null if no parse was possible.
- * <p>
- * This is not thread safe! Wrap in synchronized or create new {@link DateUtils}
- * for each class.
+ *
+ * <p>This is not thread safe! Wrap in synchronized or create new {@link DateUtils} for each
+ * class.
*
* @param dateString
* @return
@@ -151,8 +145,8 @@
// Java doesn't like timezones in the form ss+hh:mm
// It only likes the hhmm form, without the colon
int n = dateString.length();
- if (dateString.charAt(n - 3) == ':' &&
- (dateString.charAt(n - 6) == '+' || dateString.charAt(n - 6) == '-')) {
+ if (dateString.charAt(n - 3) == ':'
+ && (dateString.charAt(n - 6) == '+' || dateString.charAt(n - 6) == '-')) {
dateString = dateString.substring(0, n - 3) + dateString.substring(n - 2);
}
@@ -160,7 +154,7 @@
try {
return df.parse(dateString);
} catch (java.text.ParseException e) {
- //swallow
+ // swallow
}
}
return null;
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java
index 8f071e2..dd76ed1 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ExceptionUtils.java
@@ -16,27 +16,25 @@
*/
package org.apache.tika.utils;
-
import java.io.IOException;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.Writer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
import org.apache.tika.exception.TikaException;
public class ExceptionUtils {
- private final static Pattern MSG_PATTERN = Pattern.compile(":[^\r\n]+");
+ private static final Pattern MSG_PATTERN = Pattern.compile(":[^\r\n]+");
/**
* Simple util to get stack trace.
- * <p>
- * This will unwrap a TikaException and return the cause if not null
- * <p>
- * NOTE: If your stacktraces are truncated, make sure to start your jvm
- * with: -XX:-OmitStackTraceInFastThrow
+ *
+ * <p>This will unwrap a TikaException and return the cause if not null
+ *
+ * <p>NOTE: If your stacktraces are truncated, make sure to start your jvm with:
+ * -XX:-OmitStackTraceInFastThrow
*
* @param t throwable
* @return
@@ -66,17 +64,16 @@
writer.close();
result.close();
} catch (IOException e) {
- //swallow
+ // swallow
}
return result.toString();
}
/**
- * Utility method to trim the message from a stack trace
- * string.
- * <p>
- * E.g. <code>java.lang.IllegalStateException: Potential loop detected </code>
- * will be trimmed to <code>java.lang.IllegalStateException</code>
+ * Utility method to trim the message from a stack trace string.
+ *
+ * <p>E.g. <code>java.lang.IllegalStateException: Potential loop detected </code> will be
+ * trimmed to <code>java.lang.IllegalStateException</code>
*
* @param trace string view of stack trace
* @return trimmed stack trace
diff --git a/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java b/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java
index f08ca47..7b4a528 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/FileProcessResult.java
@@ -102,16 +102,27 @@
@Override
public String toString() {
- return "FileProcessResult{" +
- "stderr='" + stderr + '\'' +
- ", stdout='" + stdout + '\'' +
- ", exitValue=" + exitValue +
- ", processTimeMillis=" + processTimeMillis +
- ", isTimeout=" + isTimeout +
- ", stdoutLength=" + stdoutLength +
- ", stderrLength=" + stderrLength +
- ", stderrTruncated=" + stderrTruncated +
- ", stdoutTruncated=" + stdoutTruncated +
- '}';
+ return "FileProcessResult{"
+ + "stderr='"
+ + stderr
+ + '\''
+ + ", stdout='"
+ + stdout
+ + '\''
+ + ", exitValue="
+ + exitValue
+ + ", processTimeMillis="
+ + processTimeMillis
+ + ", isTimeout="
+ + isTimeout
+ + ", stdoutLength="
+ + stdoutLength
+ + ", stderrLength="
+ + stderrLength
+ + ", stderrTruncated="
+ + stderrTruncated
+ + ", stdoutTruncated="
+ + stdoutTruncated
+ + '}';
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
index 837f762..3838763 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ParserUtils.java
@@ -21,7 +21,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
-
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -30,18 +29,14 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
-/**
- * Helper util methods for Parsers themselves.
- */
+/** Helper util methods for Parsers themselves. */
public class ParserUtils {
- public final static Property EMBEDDED_PARSER = Property.internalText(
- TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser");
+ public static final Property EMBEDDED_PARSER =
+ Property.internalText(
+ TikaCoreProperties.TIKA_META_EXCEPTION_PREFIX + "embedded_parser");
-
- /**
- * Does a deep clone of a Metadata object.
- */
+ /** Does a deep clone of a Metadata object. */
public static Metadata cloneMetadata(Metadata m) {
Metadata clone = new Metadata();
@@ -59,8 +54,8 @@
}
/**
- * Identifies the real class name of the {@link Parser}, unwrapping
- * any {@link ParserDecorator} decorations on top of it.
+ * Identifies the real class name of the {@link Parser}, unwrapping any {@link ParserDecorator}
+ * decorations on top of it.
*/
public static String getParserClassname(Parser parser) {
if (parser instanceof ParserDecorator) {
@@ -71,9 +66,8 @@
}
/**
- * Records details of the {@link Parser} used to the {@link Metadata},
- * typically wanted where multiple parsers could be picked between
- * or used.
+ * Records details of the {@link Parser} used to the {@link Metadata}, typically wanted where
+ * multiple parsers could be picked between or used.
*/
public static void recordParserDetails(Parser parser, Metadata metadata) {
String className = getParserClassname(parser);
@@ -81,24 +75,22 @@
}
/**
- * Records details of the {@link Parser} used to the {@link Metadata},
- * typically wanted where multiple parsers could be picked between
- * or used.
+ * Records details of the {@link Parser} used to the {@link Metadata}, typically wanted where
+ * multiple parsers could be picked between or used.
*/
public static void recordParserDetails(String parserClassName, Metadata metadata) {
String[] parsedBys = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
if (parsedBys == null || parsedBys.length == 0) {
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName);
} else if (Arrays.stream(parsedBys).noneMatch(parserClassName::equals)) {
- //only add parser once
+ // only add parser once
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, parserClassName);
}
}
/**
- * Records details of a {@link Parser}'s failure to the
- * {@link Metadata}, so you can check what went wrong even if the
- * {@link Exception} wasn't immediately thrown (eg when several different
+ * Records details of a {@link Parser}'s failure to the {@link Metadata}, so you can check what
+ * went wrong even if the {@link Exception} wasn't immediately thrown (eg when several different
* Parsers are used)
*/
public static void recordParserFailure(Parser parser, Throwable failure, Metadata metadata) {
@@ -108,14 +100,12 @@
}
/**
- * Ensures that the Stream will be able to be re-read, by buffering to
- * a temporary file if required.
- * Streams that are automatically OK include {@link TikaInputStream}s
- * created from Files or InputStreamFactories, and {@link RereadableInputStream}.
+ * Ensures that the Stream will be able to be re-read, by buffering to a temporary file if
+ * required. Streams that are automatically OK include {@link TikaInputStream}s created from
+ * Files or InputStreamFactories, and {@link RereadableInputStream}.
*/
- public static InputStream ensureStreamReReadable(InputStream stream, TemporaryResources tmp,
- Metadata metadata)
- throws IOException {
+ public static InputStream ensureStreamReReadable(
+ InputStream stream, TemporaryResources tmp, Metadata metadata) throws IOException {
// If it's re-readable, we're done
if (stream instanceof RereadableInputStream) {
return stream;
@@ -140,9 +130,9 @@
}
/**
- * Resets the given {@link TikaInputStream} (checked by
- * {@link #ensureStreamReReadable(InputStream, TemporaryResources, Metadata)})
- * so that it can be re-read again.
+ * Resets the given {@link TikaInputStream} (checked by {@link
+ * #ensureStreamReReadable(InputStream, TemporaryResources, Metadata)}) so that it can be
+ * re-read again.
*/
public static InputStream streamResetForReRead(InputStream stream, TemporaryResources tmp)
throws IOException {
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java
index 0120cac..f5c2877 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.utils;
-
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
@@ -26,13 +25,15 @@
public class ProcessUtils {
-
private static final ConcurrentHashMap<String, Process> PROCESS_MAP = new ConcurrentHashMap<>();
static {
- Runtime.getRuntime().addShutdownHook(new Thread(() -> {
- PROCESS_MAP.forEachValue(1, Process::destroyForcibly);
- }));
+ Runtime.getRuntime()
+ .addShutdownHook(
+ new Thread(
+ () -> {
+ PROCESS_MAP.forEachValue(1, Process::destroyForcibly);
+ }));
}
private static String register(Process p) {
@@ -46,9 +47,8 @@
}
/**
- * This should correctly put double-quotes around an argument if
- * ProcessBuilder doesn't seem to work (as it doesn't
- * on paths with spaces on Windows)
+ * This should correctly put double-quotes around an argument if ProcessBuilder doesn't seem to
+ * work (as it doesn't on paths with spaces on Windows)
*
* @param arg
* @return
@@ -57,18 +57,20 @@
if (arg == null) {
return arg;
}
- //need to test for " " on windows, can't just add double quotes
- //across platforms.
- if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS &&
- (!arg.startsWith("\"") && !arg.endsWith("\""))) {
+ // need to test for " " on windows, can't just add double quotes
+ // across platforms.
+ if (arg.contains(" ")
+ && SystemUtils.IS_OS_WINDOWS
+ && (!arg.startsWith("\"") && !arg.endsWith("\""))) {
arg = "\"" + arg + "\"";
}
return arg;
}
public static String unescapeCommandLine(String arg) {
- if (arg.contains(" ") && SystemUtils.IS_OS_WINDOWS &&
- (arg.startsWith("\"") && arg.endsWith("\""))) {
+ if (arg.contains(" ")
+ && SystemUtils.IS_OS_WINDOWS
+ && (arg.startsWith("\"") && arg.endsWith("\""))) {
arg = arg.substring(1, arg.length() - 1);
}
return arg;
@@ -84,9 +86,8 @@
* @return
* @throws IOException
*/
- public static FileProcessResult execute(ProcessBuilder pb,
- long timeoutMillis,
- int maxStdoutBuffer, int maxStdErrBuffer)
+ public static FileProcessResult execute(
+ ProcessBuilder pb, long timeoutMillis, int maxStdoutBuffer, int maxStdErrBuffer)
throws IOException {
Process p = null;
String id = null;
@@ -121,7 +122,7 @@
try {
exitValue = p.exitValue();
} catch (IllegalThreadStateException e) {
- //not finished!
+ // not finished!
}
}
}
@@ -135,7 +136,7 @@
result.processTimeMillis = elapsed;
result.stderrLength = errGobbler.getStreamLength();
result.stdoutLength = outGobbler.getStreamLength();
- result.isTimeout = ! complete;
+ result.isTimeout = !complete;
result.exitValue = exitValue;
result.stdout = StringUtils.joinWith("\n", outGobbler.getLines());
result.stderr = StringUtils.joinWith("\n", errGobbler.getLines());
@@ -162,9 +163,9 @@
* @return
* @throws IOException
*/
- public static FileProcessResult execute(ProcessBuilder pb,
- long timeoutMillis,
- Path stdoutRedirect, int maxStdErrBuffer) throws IOException {
+ public static FileProcessResult execute(
+ ProcessBuilder pb, long timeoutMillis, Path stdoutRedirect, int maxStdErrBuffer)
+ throws IOException {
if (!Files.isDirectory(stdoutRedirect.getParent())) {
Files.createDirectories(stdoutRedirect.getParent());
@@ -214,7 +215,5 @@
}
release(id);
}
-
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java b/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java
index 70d0411..030b011 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/RegexUtils.java
@@ -22,22 +22,20 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-/**
- * Inspired from Nutch code class OutlinkExtractor. Apply regex to extract
- * content
- */
+/** Inspired from Nutch code class OutlinkExtractor. Apply regex to extract content */
public class RegexUtils {
/**
* Regex pattern to get URLs within a plain text.
*
* @see <a
- * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
- * </a>
+ * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html
+ * </a>
*/
- private static final String LINKS_REGEX = "([A-Za-z][A-Za-z0-9+.-]{1,120}:" +
- "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}" +
- "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
+ private static final String LINKS_REGEX =
+ "([A-Za-z][A-Za-z0-9+.-]{1,120}:"
+ + "[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}"
+ + "(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";
private static final Pattern LINKS_PATTERN =
Pattern.compile(LINKS_REGEX, Pattern.CASE_INSENSITIVE + Pattern.MULTILINE);
@@ -59,6 +57,5 @@
extractions.add(matcher.group());
}
return extractions;
-
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java b/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java
index db38977..4f53c7f 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/RereadableInputStream.java
@@ -25,96 +25,75 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
-
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
/**
- * Wraps an input stream, reading it only once, but making it available
- * for rereading an arbitrary number of times. The stream's bytes are
- * stored in memory up to a user specified maximum, and then stored in a
- * temporary file which is deleted when this class's close() method is called.
+ * Wraps an input stream, reading it only once, but making it available for rereading an arbitrary
+ * number of times. The stream's bytes are stored in memory up to a user specified maximum, and then
+ * stored in a temporary file which is deleted when this class's close() method is called.
*/
public class RereadableInputStream extends InputStream {
- /**
- * Default value for buffer size = 500M
- */
+ /** Default value for buffer size = 500M */
private static final int DEFAULT_MAX_BYTES_IN_MEMORY = 512 * 1024 * 1024;
-
- /**
- * Input stream originally passed to the constructor.
- */
+ /** Input stream originally passed to the constructor. */
private final InputStream originalInputStream;
/**
- * The inputStream currently being used by this object to read contents;
- * may be the original stream passed in, or a stream that reads
- * the saved copy from a memory buffer or file.
+ * The inputStream currently being used by this object to read contents; may be the original
+ * stream passed in, or a stream that reads the saved copy from a memory buffer or file.
*/
private InputStream inputStream;
/**
- * Maximum number of bytes that can be stored in memory before
- * storage will be moved to a temporary file.
+ * Maximum number of bytes that can be stored in memory before storage will be moved to a
+ * temporary file.
*/
private final int maxBytesInMemory;
/**
- * Whether or not we are currently reading from the byte buffer in memory
- * Bytes are read until we've exhausted the buffered bytes and then we proceed to read from
- * the original input stream. If the numbers of bytes read from the original stream
- * eventually exceed maxBytesInMemory, then we'll switch to reading from a file.
+     * Whether or not we are currently reading from the byte buffer in memory. Bytes are read
+     * until we've exhausted the buffered bytes and then we proceed to read from the original
+     * input stream. If the number of bytes read from the original stream eventually exceeds
+     * maxBytesInMemory, then we'll switch to reading from a file.
*/
private boolean readingFromBuffer;
-
/**
- * The buffer used to store the stream's content; this storage is moved
- * to a file when the stored data's size exceeds maxBytesInMemory.
- * Set to null once we start writing to a file.
+ * The buffer used to store the stream's content; this storage is moved to a file when the
+ * stored data's size exceeds maxBytesInMemory. Set to null once we start writing to a file.
*/
private byte[] byteBuffer;
- /**
- * The current pointer when reading from memory
- */
+ /** The current pointer when reading from memory */
private int bufferPointer;
- /**
- * Maximum size of the buffer that was written in previous pass(s)
- */
+    /** Maximum size of the buffer that was written in previous pass(es) */
private int bufferHighWaterMark;
/**
- * File used to store the stream's contents; is null until the stored
- * content's size exceeds maxBytesInMemory.
+ * File used to store the stream's contents; is null until the stored content's size exceeds
+ * maxBytesInMemory.
*/
private File storeFile;
- /**
- * Specifies whether the stream has been closed
- */
+ /** Specifies whether the stream has been closed */
private boolean closed;
- /**
- * OutputStream used to save the content of the input stream in a
- * temporary file.
- */
+ /** OutputStream used to save the content of the input stream in a temporary file. */
private OutputStream storeOutputStream;
-
/**
- * Specifies whether or not to close the original input stream
- * when close() is called. Defaults to true.
+ * Specifies whether or not to close the original input stream when close() is called. Defaults
+ * to true.
*/
private final boolean closeOriginalStreamOnClose;
-
/**
- * Creates a rereadable input stream with defaults of 512*1024*1024 bytes (500M) for
- * maxBytesInMemory and both readToEndOfStreamOnFirstRewind and closeOriginalStreamOnClose
- * set to true
+ * Creates a rereadable input stream with defaults of 512*1024*1024 bytes (500M) for
+ * maxBytesInMemory and both readToEndOfStreamOnFirstRewind and closeOriginalStreamOnClose set
+ * to true
*
* @param inputStream stream containing the source of data
*/
@@ -133,16 +112,14 @@
}
/**
- * Creates a rereadable input stream with closeOriginalStreamOnClose set to true
+ * Creates a rereadable input stream with closeOriginalStreamOnClose set to true
*
- * @param inputStream stream containing the source of data
- * @param maxBytesInMemory maximum number of bytes to use to store
- * the stream's contents in memory before switching to disk; note that
- * the instance will preallocate a byte array whose size is
- * maxBytesInMemory. This byte array will be made available for
- * garbage collection (i.e. its reference set to null) when the
- * content size exceeds the array's size, when close() is called, or
- * when there are no more references to the instance.
+ * @param inputStream stream containing the source of data
+ * @param maxBytesInMemory maximum number of bytes to use to store the stream's contents in
+ * memory before switching to disk; note that the instance will preallocate a byte array
+ * whose size is maxBytesInMemory. This byte array will be made available for garbage
+ * collection (i.e. its reference set to null) when the content size exceeds the array's
+ * size, when close() is called, or when there are no more references to the instance.
*/
public RereadableInputStream(InputStream inputStream, int maxBytesInMemory) {
this(inputStream, maxBytesInMemory, true);
@@ -151,17 +128,15 @@
/**
* Creates a rereadable input stream.
*
- * @param inputStream stream containing the source of data
- * @param maxBytesInMemory maximum number of bytes to use to store
- * the stream's contents in memory before switching to disk; note that
- * the instance will preallocate a byte array whose size is
- * maxBytesInMemory. This byte array will be made available for
- * garbage collection (i.e. its reference set to null) when the
- * content size exceeds the array's size, when close() is called, or
- * when there are no more references to the instance.
+ * @param inputStream stream containing the source of data
+ * @param maxBytesInMemory maximum number of bytes to use to store the stream's contents in
+ * memory before switching to disk; note that the instance will preallocate a byte array
+ * whose size is maxBytesInMemory. This byte array will be made available for garbage
+ * collection (i.e. its reference set to null) when the content size exceeds the array's
+ * size, when close() is called, or when there are no more references to the instance.
*/
- public RereadableInputStream(InputStream inputStream, int maxBytesInMemory,
- boolean closeOriginalStreamOnClose) {
+ public RereadableInputStream(
+ InputStream inputStream, int maxBytesInMemory, boolean closeOriginalStreamOnClose) {
this.inputStream = inputStream;
this.originalInputStream = inputStream;
this.maxBytesInMemory = maxBytesInMemory;
@@ -170,9 +145,8 @@
}
/**
- * Reads a byte from the stream, saving it in the store if it is being
- * read from the original stream. Implements the abstract
- * InputStream.read().
+ * Reads a byte from the stream, saving it in the store if it is being read from the original
+ * stream. Implements the abstract InputStream.read().
*
* @return the read byte, or -1 on end of stream.
* @throws IOException
@@ -188,9 +162,9 @@
// the next byte from there instead
if (readingFromBuffer) {
readingFromBuffer = false;
- inputStream.close(); // Close the input byte stream
+ inputStream.close(); // Close the input byte stream
} else {
- inputStream.close(); // Close the input file stream
+ inputStream.close(); // Close the input file stream
// start appending to the file
storeOutputStream = new BufferedOutputStream(new FileOutputStream(storeFile, true));
}
@@ -207,9 +181,7 @@
return inputByte;
}
- /**
- * Saves the bytes read from the original stream to buffer or file
- */
+ /** Saves the bytes read from the original stream to buffer or file */
private void saveByte(int inputByte) throws IOException {
if (byteBuffer != null) {
if (bufferPointer == maxBytesInMemory) {
@@ -257,7 +229,8 @@
// If we have a buffer, then we'll read from it
if (byteBuffer != null) {
readingFromBuffer = true;
- inputStream = new UnsynchronizedByteArrayInputStream(byteBuffer, 0, bufferHighWaterMark);
+ inputStream =
+ new UnsynchronizedByteArrayInputStream(byteBuffer, 0, bufferHighWaterMark);
} else {
// No buffer, which means we've switched to a file
inputStream = new BufferedInputStream(new FileInputStream(storeFile));
@@ -268,8 +241,8 @@
}
/**
- * Closes the input stream currently used for reading (may either be
- * the original stream or a memory or file stream after the first pass).
+ * Closes the input stream currently used for reading (may either be the original stream or a
+ * memory or file stream after the first pass).
*
* @throws IOException
*/
@@ -285,8 +258,7 @@
}
/**
- * Closes the input stream and removes the temporary file if one was
- * created.
+ * Closes the input stream and removes the temporary file if one was created.
*
* @throws IOException
*/
diff --git a/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java
index 1e61c97..c255b41 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/ServiceLoaderUtils.java
@@ -19,16 +19,13 @@
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.util.List;
-
import org.apache.tika.config.ServiceLoader;
-/**
- * Service Loading and Ordering related utils
- */
+/** Service Loading and Ordering related utils */
public class ServiceLoaderUtils {
/**
- * Sorts a list of loaded classes, so that non-Tika ones come
- * before Tika ones, and otherwise in reverse alphabetical order
+ * Sorts a list of loaded classes, so that non-Tika ones come before Tika ones, and otherwise in
+ * reverse alphabetical order
*/
public static <T> void sortLoadedClasses(List<T> loaded) {
loaded.sort(CompareUtils::compareClassName);
@@ -38,7 +35,7 @@
* Loads a class and instantiates it
*
* @param className service class name
- * @param <T> service type
+ * @param <T> service type
* @return instance of service
*/
public static <T> T newInstance(String className) {
@@ -49,27 +46,31 @@
* Loads a class and instantiates it
*
* @param className service class name
- * @param loader class loader
- * @param <T> service type
+ * @param loader class loader
+ * @param <T> service type
* @return instance of service
*/
public static <T> T newInstance(String className, ClassLoader loader) {
try {
- return ((Class<T>) Class.forName(className, true, loader)).getDeclaredConstructor().newInstance();
- } catch (ClassNotFoundException | InstantiationException | IllegalAccessException |
- NoSuchMethodException | InvocationTargetException e) {
+ return ((Class<T>) Class.forName(className, true, loader))
+ .getDeclaredConstructor()
+ .newInstance();
+ } catch (ClassNotFoundException
+ | InstantiationException
+ | IllegalAccessException
+ | NoSuchMethodException
+ | InvocationTargetException e) {
throw new RuntimeException(e);
}
}
/**
- * Loads a class and instantiates it. If the class can be initialized
- * with a ServiceLoader, the ServiceLoader constructor is used.
- * Otherwise, a zero arg newInstance() is called.
+ * Loads a class and instantiates it. If the class can be initialized with a ServiceLoader, the
+ * ServiceLoader constructor is used. Otherwise, a zero arg newInstance() is called.
*
- * @param klass class to build
- * @param loader service loader
- * @param <T> service type
+ * @param klass class to build
+ * @param loader service loader
+ * @param <T> service type
* @return instance of service
*/
public static <T> T newInstance(Class klass, ServiceLoader loader) {
@@ -78,12 +79,14 @@
Constructor<T> constructor = klass.getDeclaredConstructor(ServiceLoader.class);
return constructor.newInstance(loader);
} catch (NoSuchMethodException e) {
- return (T)klass.getDeclaredConstructor().newInstance();
+ return (T) klass.getDeclaredConstructor().newInstance();
} catch (InvocationTargetException e) {
throw new RuntimeException(e);
}
- } catch (InstantiationException | IllegalAccessException | NoSuchMethodException |
- InvocationTargetException e) {
+ } catch (InstantiationException
+ | IllegalAccessException
+ | NoSuchMethodException
+ | InvocationTargetException e) {
throw new RuntimeException(e);
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java b/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java
index effbeb2..266f312 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/StreamGobbler.java
@@ -27,7 +27,6 @@
public class StreamGobbler implements Runnable {
-
private final InputStream is;
private final int maxBufferLength;
List<String> lines = new ArrayList<>();
@@ -39,12 +38,11 @@
this.maxBufferLength = maxBufferLength;
}
-
@Override
public void run() {
- try (BufferedReader r = new BufferedReader(
- new InputStreamReader(is, StandardCharsets.UTF_8))) {
+ try (BufferedReader r =
+ new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
String line = r.readLine();
while (line != null) {
if (maxBufferLength >= 0) {
diff --git a/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java b/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java
index 462cceb..8adb3d2 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/StringUtils.java
@@ -45,9 +45,9 @@
}
/**
- * <p>Left pad a String with a specified String.</p>
+ * Left pad a String with a specified String.
*
- * <p>Pad to a size of {@code size}.</p>
+ * <p>Pad to a size of {@code size}.
*
* <pre>
* StringUtils.leftPad(null, *, *) = null
@@ -61,11 +61,11 @@
* StringUtils.leftPad("bat", 5, "") = " bat"
* </pre>
*
- * @param str the String to pad out, may be null
- * @param size the size to pad to
+ * @param str the String to pad out, may be null
+ * @param size the size to pad to
* @param padStr the String to pad with, null or empty treated as single space
- * @return left padded String or original String if no padding is necessary,
- * {@code null} if null String input
+ * @return left padded String or original String if no padding is necessary, {@code null} if
+ * null String input
*/
public static String leftPad(final String str, final int size, String padStr) {
if (str == null) {
@@ -98,7 +98,6 @@
}
}
-
public static String leftPad(final String str, final int size, final char padChar) {
if (str == null) {
return null;
@@ -114,8 +113,7 @@
}
/**
- * <p>Returns padding using the specified delimiter repeated
- * to a given length.</p>
+ * Returns padding using the specified delimiter repeated to a given length.
*
* <pre>
* StringUtils.repeat('e', 0) = ""
@@ -123,14 +121,13 @@
* StringUtils.repeat('e', -2) = ""
* </pre>
*
- * <p>Note: this method does not support padding with
- * <a href="http://www.unicode.org/glossary/#supplementary_character">Unicode Supplementary Characters</a>
- * as they require a pair of {@code char}s to be represented.
- * If you are needing to support full I18N of your applications
- * consider using {@link #repeat(String, int)} instead.
- * </p>
+ * <p>Note: this method does not support padding with <a
+ * href="http://www.unicode.org/glossary/#supplementary_character">Unicode Supplementary
+ * Characters</a> as they require a pair of {@code char}s to be represented. If you are needing
+ * to support full I18N of your applications consider using {@link #repeat(String, int)}
+ * instead.
*
- * @param ch character to repeat
+ * @param ch character to repeat
* @param repeat number of times to repeat char, negative treated as zero
* @return String with repeated character
* @see #repeat(String, int)
@@ -147,11 +144,10 @@
}
// Padding
- //-----------------------------------------------------------------------
+ // -----------------------------------------------------------------------
/**
- * <p>Repeat a String {@code repeat} times to form a
- * new String.</p>
+ * Repeat a String {@code repeat} times to form a new String.
*
* <pre>
* StringUtils.repeat(null, 2) = null
@@ -162,10 +158,10 @@
* StringUtils.repeat("a", -2) = ""
* </pre>
*
- * @param str the String to repeat, may be null
+ * @param str the String to repeat, may be null
* @param repeat number of times to repeat str, negative treated as zero
- * @return a new String consisting of the original String repeated,
- * {@code null} if null String input
+ * @return a new String consisting of the original String repeated, {@code null} if null String
+ * input
*/
public static String repeat(final String str, final int repeat) {
// Performance tuned for 2.0 (JDK1.4)
@@ -206,7 +202,6 @@
}
}
-
public static String joinWith(String delimiter, List<String> lines) {
if (lines.size() == 0) {
return EMPTY;
diff --git a/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java b/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java
index 027b677..7e75e58 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/SystemUtils.java
@@ -16,9 +16,7 @@
*/
package org.apache.tika.utils;
-/**
- * Copied from commons-lang to avoid requiring the dependency
- */
+/** Copied from commons-lang to avoid requiring the dependency */
public class SystemUtils {
public static final String OS_NAME = getSystemProperty("os.name");
@@ -40,8 +38,14 @@
private static final String OS_VERSION_WSL = "WSL";
static {
- IS_OS_UNIX = IS_OS_AIX || IS_OS_HP_UX || IS_OS_IRIX || IS_OS_LINUX || IS_OS_MAC_OSX ||
- IS_OS_SOLARIS || IS_OS_SUN_OS;
+ IS_OS_UNIX =
+ IS_OS_AIX
+ || IS_OS_HP_UX
+ || IS_OS_IRIX
+ || IS_OS_LINUX
+ || IS_OS_MAC_OSX
+ || IS_OS_SOLARIS
+ || IS_OS_SUN_OS;
IS_OS_WINDOWS = getOSMatchesName(OS_NAME_WINDOWS_PREFIX);
IS_OS_VERSION_WSL = getOSContainsVersion(OS_VERSION_WSL);
}
@@ -69,5 +73,4 @@
static boolean doesOSVersionContain(String osVersion, String osVersionSearch) {
return osVersion != null && osVersion.contains(osVersionSearch);
}
-
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 262ebfe..4b9bfca 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -42,7 +42,9 @@
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
-
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.OfflineContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
@@ -57,26 +59,17 @@
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.OfflineContentHandler;
-
-
-/**
- * Utility functions for reading XML.
- */
+/** Utility functions for reading XML. */
public class XMLReaderUtils implements Serializable {
- /**
- * Default size for the pool of SAX Parsers
- * and the pool of DOM builders
- */
+ /** Default size for the pool of SAX Parsers and the pool of DOM builders */
public static final int DEFAULT_POOL_SIZE = 10;
+
public static final int DEFAULT_MAX_ENTITY_EXPANSIONS = 20;
- /**
- * Serial version UID
- */
+
+ /** Serial version UID */
private static final long serialVersionUID = 6110455808615143122L;
+
private static final Logger LOG = LoggerFactory.getLogger(XMLReaderUtils.class);
private static final String XERCES_SECURITY_MANAGER = "org.apache.xerces.util.SecurityManager";
private static final String XERCES_SECURITY_MANAGER_PROPERTY =
@@ -84,37 +77,30 @@
private static final AtomicBoolean HAS_WARNED_STAX = new AtomicBoolean(false);
private static final ContentHandler IGNORING_CONTENT_HANDLER = new DefaultHandler();
- private static final DTDHandler IGNORING_DTD_HANDLER = new DTDHandler() {
- @Override
- public void notationDecl(String name, String publicId, String systemId)
- throws SAXException {
+ private static final DTDHandler IGNORING_DTD_HANDLER =
+ new DTDHandler() {
+ @Override
+ public void notationDecl(String name, String publicId, String systemId)
+ throws SAXException {}
- }
+ @Override
+ public void unparsedEntityDecl(
+ String name, String publicId, String systemId, String notationName)
+ throws SAXException {}
+ };
+ private static final ErrorHandler IGNORING_ERROR_HANDLER =
+ new ErrorHandler() {
+ @Override
+ public void warning(SAXParseException exception) throws SAXException {}
- @Override
- public void unparsedEntityDecl(String name, String publicId, String systemId,
- String notationName) throws SAXException {
+ @Override
+ public void error(SAXParseException exception) throws SAXException {}
- }
- };
- private static final ErrorHandler IGNORING_ERROR_HANDLER = new ErrorHandler() {
- @Override
- public void warning(SAXParseException exception) throws SAXException {
-
- }
-
- @Override
- public void error(SAXParseException exception) throws SAXException {
-
- }
-
- @Override
- public void fatalError(SAXParseException exception) throws SAXException {
-
- }
- };
+ @Override
+ public void fatalError(SAXParseException exception) throws SAXException {}
+ };
private static final String JAXP_ENTITY_EXPANSION_LIMIT_KEY = "jdk.xml.entityExpansionLimit";
- //TODO: figure out if the rw lock is any better than a simple lock
+ // TODO: figure out if the rw lock is any better than a simple lock
private static final ReentrantReadWriteLock SAX_READ_WRITE_LOCK = new ReentrantReadWriteLock();
private static final ReentrantReadWriteLock DOM_READ_WRITE_LOCK = new ReentrantReadWriteLock();
private static final AtomicInteger POOL_GENERATION = new AtomicInteger();
@@ -122,10 +108,10 @@
(publicId, systemId) -> new InputSource(new StringReader(""));
private static final XMLResolver IGNORING_STAX_ENTITY_RESOLVER =
(publicID, systemID, baseURI, namespace) -> "";
- /**
- * Parser pool size
- */
+
+ /** Parser pool size */
private static int POOL_SIZE = DEFAULT_POOL_SIZE;
+
private static long LAST_LOG = -1;
private static volatile int MAX_ENTITY_EXPANSIONS = determineMaxEntityExpansions();
private static ArrayBlockingQueue<PoolSAXParser> SAX_PARSERS =
@@ -148,18 +134,18 @@
return Integer.parseInt(expansionLimit);
} catch (NumberFormatException e) {
LOG.warn(
- "Couldn't parse an integer for the entity expansion limit: {}; " +
- "backing off to default: {}",
- expansionLimit, DEFAULT_MAX_ENTITY_EXPANSIONS);
+ "Couldn't parse an integer for the entity expansion limit: {}; "
+ + "backing off to default: {}",
+ expansionLimit,
+ DEFAULT_MAX_ENTITY_EXPANSIONS);
}
}
return DEFAULT_MAX_ENTITY_EXPANSIONS;
}
/**
- * Returns the XMLReader specified in this parsing context. If a reader
- * is not explicitly specified, then one is created using the specified
- * or the default SAX parser.
+ * Returns the XMLReader specified in this parsing context. If a reader is not explicitly
+ * specified, then one is created using the specified or the default SAX parser.
*
* @return XMLReader
* @throws TikaException
@@ -178,13 +164,11 @@
}
/**
- * Returns the SAX parser specified in this parsing context. If a parser
- * is not explicitly specified, then one is created using the specified
- * or the default SAX parser factory.
- * <p>
- * If you call reset() on the parser, make sure to replace the
- * SecurityManager which will be cleared by xerces2 on reset().
- * </p>
+ * Returns the SAX parser specified in this parsing context. If a parser is not explicitly
+ * specified, then one is created using the specified or the default SAX parser factory.
+ *
+ * <p>If you call reset() on the parser, make sure to replace the SecurityManager which will be
+ * cleared by xerces2 on reset().
*
* @return SAX parser
* @throws TikaException if a SAX parser could not be created
@@ -204,11 +188,10 @@
}
/**
- * Returns the SAX parser factory specified in this parsing context.
- * If a factory is not explicitly specified, then a default factory
- * instance is created and returned. The default factory instance is
- * configured to be namespace-aware, not validating, and to use
- * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
+ * Returns the SAX parser factory specified in this parsing context. If a factory is not
+ * explicitly specified, then a default factory instance is created and returned. The default
+ * factory instance is configured to be namespace-aware, not validating, and to use {@link
+ * XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
*
* @return SAX parser factory
* @since Apache Tika 0.8
@@ -223,26 +206,25 @@
trySetSAXFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
trySetSAXFeature(factory, "http://xml.org/sax/features/external-general-entities", false);
trySetSAXFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
- trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd",
- false);
- trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar",
- false);
+ trySetSAXFeature(
+ factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+ trySetSAXFeature(
+ factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
return factory;
}
/**
- * Returns the DOM builder factory specified in this parsing context.
- * If a factory is not explicitly specified, then a default factory
- * instance is created and returned. The default factory instance is
- * configured to be namespace-aware and to apply reasonable security
+ * Returns the DOM builder factory specified in this parsing context. If a factory is not
+ * explicitly specified, then a default factory instance is created and returned. The default
+ * factory instance is configured to be namespace-aware and to apply reasonable security
* features.
*
* @return DOM parser factory
* @since Apache Tika 1.13
*/
public static DocumentBuilderFactory getDocumentBuilderFactory() {
- //borrowed from Apache POI
+ // borrowed from Apache POI
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
if (LOG.isDebugEnabled()) {
LOG.debug("DocumentBuilderFactory class {}", factory.getClass());
@@ -255,20 +237,20 @@
trySetSAXFeature(factory, XMLConstants.FEATURE_SECURE_PROCESSING, true);
trySetSAXFeature(factory, "http://xml.org/sax/features/external-general-entities", false);
trySetSAXFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
- trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd",
- false);
- trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar",
- false);
+ trySetSAXFeature(
+ factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+ trySetSAXFeature(
+ factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
trySetXercesSecurityManager(factory);
return factory;
}
/**
- * Returns the DOM builder specified in this parsing context.
- * If a builder is not explicitly specified, then a builder
- * instance is created and returned. The builder instance is
- * configured to apply an {@link #IGNORING_SAX_ENTITY_RESOLVER},
- * and it sets the ErrorHandler to <code>null</code>.
+ * Returns the DOM builder specified in this parsing context. If a builder is not explicitly
+ * specified, then a builder instance is created and returned. The builder instance is
+ * configured to apply an {@link #IGNORING_SAX_ENTITY_RESOLVER}, and it sets the ErrorHandler to
+ * <code>null
+ * </code>.
*
* @return DOM Builder
* @since Apache Tika 1.13
@@ -286,11 +268,10 @@
}
/**
- * Returns the StAX input factory specified in this parsing context.
- * If a factory is not explicitly specified, then a default factory
- * instance is created and returned. The default factory instance is
- * configured to be namespace-aware and to apply reasonable security
- * using the {@link #IGNORING_STAX_ENTITY_RESOLVER}.
+ * Returns the StAX input factory specified in this parsing context. If a factory is not
+ * explicitly specified, then a default factory instance is created and returned. The default
+ * factory instance is configured to be namespace-aware and to apply reasonable security using
+ * the {@link #IGNORING_STAX_ENTITY_RESOLVER}.
*
* @return StAX input factory
* @since Apache Tika 1.13
@@ -309,8 +290,8 @@
return factory;
}
- private static void trySetTransformerAttribute(TransformerFactory transformerFactory,
- String attribute, String value) {
+ private static void trySetTransformerAttribute(
+ TransformerFactory transformerFactory, String attribute, String value) {
try {
transformerFactory.setAttribute(attribute, value);
} catch (SecurityException e) {
@@ -320,12 +301,13 @@
} catch (AbstractMethodError ame) {
LOG.warn(
"Cannot set Transformer attribute because outdated XML parser in classpath: {}",
- attribute, ame);
+ attribute,
+ ame);
}
}
- private static void trySetSAXFeature(SAXParserFactory saxParserFactory, String feature,
- boolean enabled) {
+ private static void trySetSAXFeature(
+ SAXParserFactory saxParserFactory, String feature, boolean enabled) {
try {
saxParserFactory.setFeature(feature, enabled);
} catch (SecurityException e) {
@@ -333,19 +315,23 @@
} catch (Exception e) {
LOG.warn("SAX Feature unsupported: {}", feature, e);
} catch (AbstractMethodError ame) {
- LOG.warn("Cannot set SAX feature because outdated XML parser in classpath: {}", feature,
+ LOG.warn(
+ "Cannot set SAX feature because outdated XML parser in classpath: {}",
+ feature,
ame);
}
}
- private static void trySetSAXFeature(DocumentBuilderFactory documentBuilderFactory,
- String feature, boolean enabled) {
+ private static void trySetSAXFeature(
+ DocumentBuilderFactory documentBuilderFactory, String feature, boolean enabled) {
try {
documentBuilderFactory.setFeature(feature, enabled);
} catch (Exception e) {
LOG.warn("SAX Feature unsupported: {}", feature, e);
} catch (AbstractMethodError ame) {
- LOG.warn("Cannot set SAX feature because outdated XML parser in classpath: {}", feature,
+ LOG.warn(
+ "Cannot set SAX feature because outdated XML parser in classpath: {}",
+ feature,
ame);
}
}
@@ -360,9 +346,9 @@
/**
* Returns a new transformer
- * <p>
- * The transformer instance is configured to to use
- * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
+ *
+ * <p>The transformer instance is configured to use {@link
+ * XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}.
*
* @return Transformer
* @throws TikaException when the transformer can not be created
@@ -373,8 +359,8 @@
TransformerFactory transformerFactory = TransformerFactory.newInstance();
transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_DTD, "");
- trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET,
- "");
+ trySetTransformerAttribute(
+ transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
return transformerFactory.newTransformer();
} catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
throw new TikaException("Transformer not available", e);
@@ -382,10 +368,10 @@
}
/**
- * This checks context for a user specified {@link DocumentBuilder}.
- * If one is not found, this reuses a DocumentBuilder from the pool.
+ * This checks context for a user specified {@link DocumentBuilder}. If one is not found, this
+ * reuses a DocumentBuilder from the pool.
*
- * @param is InputStream to parse
+ * @param is InputStream to parse
* @param context context to use
* @return a document
* @throws TikaException
@@ -412,10 +398,10 @@
}
/**
- * This checks context for a user specified {@link DocumentBuilder}.
- * If one is not found, this reuses a DocumentBuilder from the pool.
+ * This checks context for a user specified {@link DocumentBuilder}. If one is not found, this
+ * reuses a DocumentBuilder from the pool.
*
- * @param reader reader (character stream) to parse
+ * @param reader reader (character stream) to parse
* @param context context to use
* @return a document
* @throws TikaException
@@ -497,14 +483,13 @@
}
/**
- * This checks context for a user specified {@link SAXParser}.
- * If one is not found, this reuses a SAXParser from the pool.
+ * This checks context for a user specified {@link SAXParser}. If one is not found, this reuses
+ * a SAXParser from the pool.
*
- * @param is InputStream to parse
- * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler}
- * to the content handler as an extra layer of defense against
- * external entity vulnerabilities
- * @param context context to use
+ * @param is InputStream to parse
+ * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler} to the
+ * content handler as an extra layer of defense against external entity vulnerabilities
+ * @param context context to use
* @return
* @throws TikaException
* @throws IOException
@@ -529,14 +514,13 @@
}
/**
- * This checks context for a user specified {@link SAXParser}.
- * If one is not found, this reuses a SAXParser from the pool.
+ * This checks context for a user specified {@link SAXParser}. If one is not found, this reuses
+ * a SAXParser from the pool.
*
- * @param reader reader (character stream) to parse
- * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler}
- * to the content handler as an extra layer of defense against
- * external entity vulnerabilities
- * @param context context to use
+ * @param reader reader (character stream) to parse
+ * @param contentHandler handler to use; this wraps a {@link OfflineContentHandler} to the
+ * content handler as an extra layer of defense against external entity vulnerabilities
+ * @param context context to use
* @return
* @throws TikaException
* @throws IOException
@@ -561,8 +545,7 @@
}
/**
- * Acquire a SAXParser from the pool. Make sure to
- * {@link #releaseDOMBuilder(PoolDOMBuilder)} in
+ * Acquire a SAXParser from the pool. Make sure to {@link #releaseDOMBuilder(PoolDOMBuilder)} in
* a <code>finally</code> block every time you call this.
*
* @return a DocumentBuilder
@@ -585,22 +568,23 @@
return builder;
}
if (lastWarn < 0 || System.currentTimeMillis() - lastWarn > 1000) {
- //avoid spamming logs
- LOG.warn("Contention waiting for a DOMParser. " +
- "Consider increasing the XMLReaderUtils.POOL_SIZE");
+ // avoid spamming logs
+ LOG.warn(
+ "Contention waiting for a DOMParser. "
+ + "Consider increasing the XMLReaderUtils.POOL_SIZE");
lastWarn = System.currentTimeMillis();
}
waiting++;
if (waiting > 3000) {
- //freshen the pool. Something went very wrong...
+ // freshen the pool. Something went very wrong...
setPoolSize(POOL_SIZE);
- //better to get an exception than have permahang by a bug in one of our parsers
- throw new TikaException("Waited more than 5 minutes for a DocumentBuilder; " +
- "This could indicate that a parser has not correctly released its " +
- "DocumentBuilder. " +
- "Please report this to the Tika team: dev@tika.apache.org");
-
+ // better to get an exception than have permahang by a bug in one of our parsers
+ throw new TikaException(
+ "Waited more than 5 minutes for a DocumentBuilder; "
+ + "This could indicate that a parser has not correctly released its "
+ + "DocumentBuilder. "
+ + "Please report this to the Tika team: dev@tika.apache.org");
}
}
}
@@ -617,18 +601,18 @@
try {
builder.reset();
} catch (UnsupportedOperationException e) {
- //ignore
+ // ignore
}
DOM_READ_WRITE_LOCK.readLock().lock();
try {
- //if there are extra parsers (e.g. after a reset of the pool to a smaller size),
+ // if there are extra parsers (e.g. after a reset of the pool to a smaller size),
// this parser will not be added and will then be gc'd
boolean success = DOM_BUILDERS.offer(builder);
if (!success) {
LOG.warn(
- "DocumentBuilder not taken back into pool. If you haven't resized the " +
- "pool, this could be a sign that there are more calls to " +
- "'acquire' than to 'release'");
+ "DocumentBuilder not taken back into pool. If you haven't resized the "
+ + "pool, this could be a sign that there are more calls to "
+ + "'acquire' than to 'release'");
}
} finally {
DOM_READ_WRITE_LOCK.readLock().unlock();
@@ -636,9 +620,8 @@
}
/**
- * Acquire a SAXParser from the pool. Make sure to
- * {@link #releaseParser(PoolSAXParser)} in
- * a <code>finally</code> block every time you call this.
+ * Acquire a SAXParser from the pool. Make sure to {@link #releaseParser(PoolSAXParser)} in a
+ * <code>finally</code> block every time you call this.
*
* @return a SAXParser
* @throws TikaException
@@ -660,20 +643,21 @@
return parser;
}
if (lastWarn < 0 || System.currentTimeMillis() - lastWarn > 1000) {
- //avoid spamming logs
- LOG.warn("Contention waiting for a SAXParser. " +
- "Consider increasing the XMLReaderUtils.POOL_SIZE");
+ // avoid spamming logs
+ LOG.warn(
+ "Contention waiting for a SAXParser. "
+ + "Consider increasing the XMLReaderUtils.POOL_SIZE");
lastWarn = System.currentTimeMillis();
}
waiting++;
if (waiting > 3000) {
- //freshen the pool. Something went very wrong...
+ // freshen the pool. Something went very wrong...
setPoolSize(POOL_SIZE);
- //better to get an exception than have permahang by a bug in one of our parsers
- throw new TikaException("Waited more than 5 minutes for a SAXParser; " +
- "This could indicate that a parser has not correctly released its " +
- "SAXParser. Please report this to the Tika team: dev@tika.apache.org");
-
+ // better to get an exception than have permahang by a bug in one of our parsers
+ throw new TikaException(
+ "Waited more than 5 minutes for a SAXParser; "
+ + "This could indicate that a parser has not correctly released its "
+ + "SAXParser. Please report this to the Tika team: dev@tika.apache.org");
}
}
}
@@ -687,23 +671,23 @@
try {
parser.reset();
} catch (UnsupportedOperationException e) {
- //TIKA-3009 -- we really shouldn't have to do this... :(
+ // TIKA-3009 -- we really shouldn't have to do this... :(
}
- //if this is a different generation, don't put it back
- //in the pool
+ // if this is a different generation, don't put it back
+ // in the pool
if (parser.getGeneration() != POOL_GENERATION.get()) {
return;
}
SAX_READ_WRITE_LOCK.readLock().lock();
try {
- //if there are extra parsers (e.g. after a reset of the pool to a smaller size),
+ // if there are extra parsers (e.g. after a reset of the pool to a smaller size),
// this parser will not be added and will then be gc'd
boolean success = SAX_PARSERS.offer(parser);
if (!success) {
LOG.warn(
- "SAXParser not taken back into pool. If you haven't resized the pool " +
- "this could be a sign that there are more calls to 'acquire' " +
- "than to 'release'");
+ "SAXParser not taken back into pool. If you haven't resized the pool "
+ + "this could be a sign that there are more calls to 'acquire' "
+ + "than to 'release'");
}
} finally {
SAX_READ_WRITE_LOCK.readLock().unlock();
@@ -711,28 +695,31 @@
}
private static void trySetXercesSecurityManager(DocumentBuilderFactory factory) {
- //from POI
+ // from POI
// Try built-in JVM one first, standalone if not
- for (String securityManagerClassName : new String[]{
- //"com.sun.org.apache.xerces.internal.util.SecurityManager",
- XERCES_SECURITY_MANAGER}) {
+ for (String securityManagerClassName :
+ new String[] {
+ // "com.sun.org.apache.xerces.internal.util.SecurityManager",
+ XERCES_SECURITY_MANAGER
+ }) {
try {
Object mgr =
- Class.forName(securityManagerClassName).getDeclaredConstructor().newInstance();
- Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit",
- Integer.TYPE);
+ Class.forName(securityManagerClassName)
+ .getDeclaredConstructor()
+ .newInstance();
+ Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
factory.setAttribute(XERCES_SECURITY_MANAGER_PROPERTY, mgr);
// Stop once one can be setup without error
return;
} catch (ClassNotFoundException e) {
// continue without log, this is expected in some setups
- } catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here
+ } catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here
// throttle the log somewhat as it can spam the log otherwise
if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
LOG.warn(
- "SAX Security Manager could not be setup [log suppressed for 5 " +
- "minutes]",
+ "SAX Security Manager could not be setup [log suppressed for 5 "
+ + "minutes]",
e);
LAST_LOG = System.currentTimeMillis();
}
@@ -741,13 +728,15 @@
// separate old version of Xerces not found => use the builtin way of setting the property
try {
- factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
+ factory.setAttribute(
+ "http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
MAX_ENTITY_EXPANSIONS);
} catch (IllegalArgumentException e) {
// NOSONAR - also catch things like NoClassDefError here
// throttle the log somewhat as it can spam the log otherwise
if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
- LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]",
+ LOG.warn(
+ "SAX Security Manager could not be setup [log suppressed for 5 minutes]",
e);
LAST_LOG = System.currentTimeMillis();
}
@@ -755,14 +744,18 @@
}
private static void trySetXercesSecurityManager(SAXParser parser) {
- //from POI
+ // from POI
// Try built-in JVM one first, standalone if not
- for (String securityManagerClassName : new String[]{
- //"com.sun.org.apache.xerces.internal.util.SecurityManager",
- XERCES_SECURITY_MANAGER}) {
+ for (String securityManagerClassName :
+ new String[] {
+ // "com.sun.org.apache.xerces.internal.util.SecurityManager",
+ XERCES_SECURITY_MANAGER
+ }) {
try {
Object mgr =
- Class.forName(securityManagerClassName).getDeclaredConstructor().newInstance();
+ Class.forName(securityManagerClassName)
+ .getDeclaredConstructor()
+ .newInstance();
Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
@@ -776,8 +769,8 @@
// throttle the log somewhat as it can spam the log otherwise
if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
LOG.warn(
- "SAX Security Manager could not be setup [log suppressed for 5 " +
- "minutes]",
+ "SAX Security Manager could not be setup [log suppressed for 5 "
+ + "minutes]",
e);
LAST_LOG = System.currentTimeMillis();
}
@@ -786,12 +779,14 @@
// separate old version of Xerces not found => use the builtin way of setting the property
try {
- parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
+ parser.setProperty(
+ "http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
MAX_ENTITY_EXPANSIONS);
- } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here
+ } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here
// throttle the log somewhat as it can spam the log otherwise
if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
- LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]",
+ LOG.warn(
+ "SAX Security Manager could not be setup [log suppressed for 5 minutes]",
e);
LAST_LOG = System.currentTimeMillis();
}
@@ -799,19 +794,21 @@
}
private static void trySetStaxSecurityManager(XMLInputFactory inputFactory) {
- //try default java entity expansion, then fallback to woodstox, then warn...once.
+ // try default java entity expansion, then fallback to woodstox, then warn...once.
try {
- inputFactory.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
+ inputFactory.setProperty(
+ "http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
MAX_ENTITY_EXPANSIONS);
} catch (IllegalArgumentException e) {
try {
inputFactory.setProperty("com.ctc.wstx.maxEntityCount", MAX_ENTITY_EXPANSIONS);
} catch (IllegalArgumentException e2) {
if (HAS_WARNED_STAX.getAndSet(true) == false) {
- LOG.warn("Could not set limit on maximum entity expansions for: " + inputFactory.getClass());
+ LOG.warn(
+ "Could not set limit on maximum entity expansions for: "
+ + inputFactory.getClass());
}
}
-
}
}
@@ -820,23 +817,23 @@
}
/**
- * Set the pool size for cached XML parsers. This has a side
- * effect of locking the pool, and rebuilding the pool from
- * scratch with the most recent settings, such as {@link #MAX_ENTITY_EXPANSIONS}
+ * Set the pool size for cached XML parsers. This has a side effect of locking the pool, and
+ * rebuilding the pool from scratch with the most recent settings, such as {@link
+ * #MAX_ENTITY_EXPANSIONS}
*
* @param poolSize
* @since Apache Tika 1.19
*/
public static void setPoolSize(int poolSize) throws TikaException {
- //stop the world with a write lock.
- //parsers that are currently in use will be offered later (once the lock is released),
- //but not accepted and will be gc'd. We have to do this locking and
- //the read locking in case one thread resizes the pool when the
- //parsers have already started. We could have an NPE on SAX_PARSERS
- //if we didn't lock.
+ // stop the world with a write lock.
+ // parsers that are currently in use will be offered later (once the lock is released),
+ // but not accepted and will be gc'd. We have to do this locking and
+ // the read locking in case one thread resizes the pool when the
+ // parsers have already started. We could have an NPE on SAX_PARSERS
+ // if we didn't lock.
SAX_READ_WRITE_LOCK.writeLock().lock();
try {
- //free up any resources before emptying SAX_PARSERS
+ // free up any resources before emptying SAX_PARSERS
for (PoolSAXParser parser : SAX_PARSERS) {
parser.reset();
}
@@ -845,8 +842,8 @@
int generation = POOL_GENERATION.incrementAndGet();
for (int i = 0; i < poolSize; i++) {
try {
- SAX_PARSERS.offer(buildPoolParser(generation,
- getSAXParserFactory().newSAXParser()));
+ SAX_PARSERS.offer(
+ buildPoolParser(generation, getSAXParserFactory().newSAXParser()));
} catch (SAXException | ParserConfigurationException e) {
throw new TikaException("problem creating sax parser", e);
}
@@ -873,15 +870,13 @@
}
/**
- * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing.
- * <b>NOTE:</b>A value less than or equal to zero indicates no limit.
- * This will override the system property {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY}
- * and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value for allowable entity expansions
- * <p>
- * <b>NOTE:</b> To trigger a rebuild of the pool of parsers with this setting,
- * the client must call {@link #setPoolSize(int)} to rebuild the SAX and DOM parsers
- * with this setting.
- * </p>
+ * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing. <b>NOTE:</b>A
+ * value less than or equal to zero indicates no limit. This will override the system property
+ * {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY} and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value
+ * for allowable entity expansions
+ *
+ * <p><b>NOTE:</b> To trigger a rebuild of the pool of parsers with this setting, the client
+ * must call {@link #setPoolSize(int)} to rebuild the SAX and DOM parsers with this setting.
*
* @param maxEntityExpansions -- maximum number of allowable entity expansions
* @since Apache Tika 1.19
@@ -922,7 +917,7 @@
parser.setProperty(XERCES_SECURITY_MANAGER_PROPERTY, mgr);
hasSecurityManager = true;
} catch (SecurityException e) {
- //don't swallow security exceptions
+ // don't swallow security exceptions
throw e;
} catch (ClassNotFoundException e) {
// continue without log, this is expected in some setups
@@ -930,7 +925,8 @@
// NOSONAR - also catch things like NoClassDefError here
// throttle the log somewhat as it can spam the log otherwise
if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
- LOG.warn("SAX Security Manager could not be setup [log suppressed for 5 minutes]",
+ LOG.warn(
+ "SAX Security Manager could not be setup [log suppressed for 5 minutes]",
e);
LAST_LOG = System.currentTimeMillis();
}
@@ -940,15 +936,16 @@
if (!hasSecurityManager) {
// use the builtin way of setting the property
try {
- parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
+ parser.setProperty(
+ "http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit",
MAX_ENTITY_EXPANSIONS);
canSetJaxPEntity = true;
- } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here
+ } catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here
// throttle the log somewhat as it can spam the log otherwise
if (System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
LOG.warn(
- "SAX Security Manager could not be setup [log suppressed for 5 " +
- "minutes]",
+ "SAX Security Manager could not be setup [log suppressed for 5 "
+ + "minutes]",
e);
LAST_LOG = System.currentTimeMillis();
}
@@ -964,7 +961,6 @@
} else {
return new UnrecognizedPoolSAXParser(generation, parser);
}
-
}
private static void clearReader(XMLReader reader) {
@@ -1028,12 +1024,12 @@
@Override
public void reset() {
- //don't do anything
+ // don't do anything
try {
XMLReader reader = saxParser.getXMLReader();
clearReader(reader);
} catch (SAXException e) {
- //swallow
+ // swallow
}
}
}
@@ -1079,8 +1075,8 @@
}
private static class UnrecognizedPoolSAXParser extends PoolSAXParser {
- //if unrecognized, try to set all protections
- //and try to reset every time
+ // if unrecognized, try to set all protections
+ // and try to reset every time
public UnrecognizedPoolSAXParser(int generation, SAXParser parser) {
super(generation, parser);
}
diff --git a/tika-core/src/main/java/org/apache/tika/utils/package-info.java b/tika-core/src/main/java/org/apache/tika/utils/package-info.java
index 04ea52e..833c117 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/package-info.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/package-info.java
@@ -15,8 +15,6 @@
* limitations under the License.
*/
-/**
- * Utilities.
- */
+/** Utilities. */
@aQute.bnd.annotation.Version("1.0.0")
package org.apache.tika.utils;
diff --git a/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java b/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java
index 7237d11..2e5efd5 100644
--- a/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java
+++ b/tika-core/src/test/java/org/apache/custom/detect/MyCustomDetector.java
@@ -18,7 +18,6 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
diff --git a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
index fd3f381..4bcbab6 100644
--- a/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/MultiThreadedTikaTest.java
@@ -42,7 +42,6 @@
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
-
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.XmlRootExtractor;
import org.apache.tika.exception.TikaException;
@@ -57,32 +56,34 @@
import org.apache.tika.utils.XMLReaderUtils;
public class MultiThreadedTikaTest extends TikaTest {
- //TODO: figure out how to make failures reproducible a la Lucene/Solr with a seed
- //TODO: Consider randomizing the Locale and timezone, like Lucene/Solr...
+ // TODO: figure out how to make failures reproducible a la Lucene/Solr with a seed
+ // TODO: Consider randomizing the Locale and timezone, like Lucene/Solr...
XmlRootExtractor ex = new XmlRootExtractor();
public static Path[] getTestFiles(final FileFilter fileFilter)
throws URISyntaxException, IOException {
Path root = Paths.get(MultiThreadedTikaTest.class.getResource("/test-documents").toURI());
final List<Path> files = new ArrayList<>();
- Files.walkFileTree(root, new SimpleFileVisitor<Path>() {
- @Override
- public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
- throws IOException {
- if (fileFilter != null && !fileFilter.accept(file.toFile())) {
- return FileVisitResult.CONTINUE;
- }
- if (!attrs.isDirectory()) {
- files.add(file);
- }
- return FileVisitResult.CONTINUE;
- }
- });
+ Files.walkFileTree(
+ root,
+ new SimpleFileVisitor<Path>() {
+ @Override
+ public FileVisitResult visitFile(Path file, BasicFileAttributes attrs)
+ throws IOException {
+ if (fileFilter != null && !fileFilter.accept(file.toFile())) {
+ return FileVisitResult.CONTINUE;
+ }
+ if (!attrs.isDirectory()) {
+ files.add(file);
+ }
+ return FileVisitResult.CONTINUE;
+ }
+ });
return files.toArray(new Path[0]);
}
- private static ConcurrentHashMap<Path, MediaType> getBaselineDetection(Detector detector,
- Path[] files) {
+ private static ConcurrentHashMap<Path, MediaType> getBaselineDetection(
+ Detector detector, Path[] files) {
ConcurrentHashMap<Path, MediaType> baseline = new ConcurrentHashMap<>();
XmlRootExtractor extractor = new XmlRootExtractor();
@@ -98,8 +99,8 @@
return baseline;
}
- private static ConcurrentHashMap<Path, Extract> getBaseline(Parser parser, Path[] files,
- ParseContext parseContext) {
+ private static ConcurrentHashMap<Path, Extract> getBaseline(
+ Parser parser, Path[] files, ParseContext parseContext) {
ConcurrentHashMap<Path, Extract> baseline = new ConcurrentHashMap<>();
for (Path f : files) {
@@ -110,69 +111,89 @@
} catch (Exception e) {
e.printStackTrace();
- //swallow
+ // swallow
}
}
return baseline;
}
- private static List<Metadata> getRecursiveMetadata(InputStream is, Parser parser,
- ParseContext parseContext) throws Exception {
- //different from parent TikaTest in that this extracts text.
- //can't extract xhtml because "tmp" file names wind up in
- //content's metadata and they'll differ by file.
+ private static List<Metadata> getRecursiveMetadata(
+ InputStream is, Parser parser, ParseContext parseContext) throws Exception {
+ // different from parent TikaTest in that this extracts text.
+ // can't extract xhtml because "tmp" file names wind up in
+ // content's metadata and they'll differ by file.
parseContext = new ParseContext();
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
- -1);
+ RecursiveParserWrapperHandler handler =
+ new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
+ -1);
parser.parse(is, handler, new Metadata(), parseContext);
return handler.getMetadataList();
}
private static void assertExtractEquals(Extract extractA, Extract extractB) {
- //this currently only checks the basics
- //might want to add more checks
+ // this currently only checks the basics
+ // might want to add more checks
- assertEquals(extractA.metadataList.size(), extractB.metadataList.size(),
+ assertEquals(
+ extractA.metadataList.size(),
+ extractB.metadataList.size(),
"number of embedded files");
for (int i = 0; i < extractA.metadataList.size(); i++) {
- assertEquals(extractA.metadataList.get(i).size(), extractB.metadataList.get(i).size(),
+ assertEquals(
+ extractA.metadataList.get(i).size(),
+ extractB.metadataList.get(i).size(),
"number of metadata elements in attachment: " + i);
- assertEquals(extractA.metadataList.get(i).get(TikaCoreProperties.TIKA_CONTENT),
+ assertEquals(
+ extractA.metadataList.get(i).get(TikaCoreProperties.TIKA_CONTENT),
extractB.metadataList.get(i).get(TikaCoreProperties.TIKA_CONTENT),
"content in attachment: " + i);
}
}
/**
- * This calls {@link #testEach(Parser parser, Path[], ParseContext[], int, int)} and
- * then {@link #testAll(Parser parser, Path[], ParseContext[], int, int)}
+ * This calls {@link #testEach(Parser parser, Path[], ParseContext[], int, int)} and then {@link
+ * #testAll(Parser parser, Path[], ParseContext[], int, int)}
*
- * @param numThreads number of threads to use
+ * @param numThreads number of threads to use
* @param numIterations number of iterations per thread
- * @param filter file filter to select files from "/test-documents"; if
- * <code>null</code>,
- * all files will be used
+ * @param filter file filter to select files from "/test-documents"; if <code>null</code>, all
+ * files will be used
* @throws Exception
*/
- protected void testMultiThreaded(Parser parser, ParseContext[] parseContext, int numThreads,
- int numIterations, FileFilter filter) throws Exception {
+ protected void testMultiThreaded(
+ Parser parser,
+ ParseContext[] parseContext,
+ int numThreads,
+ int numIterations,
+ FileFilter filter)
+ throws Exception {
Path[] allFiles = getTestFiles(filter);
testEach(parser, allFiles, parseContext, numThreads, numIterations);
testAll(parser, allFiles, parseContext, numThreads, numIterations);
}
- public void testDetector(Detector detector, int numThreads, int numIterations,
- FileFilter filter, int randomlyResizeSAXPool) throws Exception {
+ public void testDetector(
+ Detector detector,
+ int numThreads,
+ int numIterations,
+ FileFilter filter,
+ int randomlyResizeSAXPool)
+ throws Exception {
Path[] files = getTestFiles(filter);
testDetectorEach(detector, files, numThreads, numIterations, randomlyResizeSAXPool);
testDetectorOnAll(detector, files, numThreads, numIterations, randomlyResizeSAXPool);
}
- void testDetectorEach(Detector detector, Path[] files, int numThreads, int numIterations,
- int randomlyResizeSAXPool) {
+ void testDetectorEach(
+ Detector detector,
+ Path[] files,
+ int numThreads,
+ int numIterations,
+ int randomlyResizeSAXPool) {
for (Path p : files) {
Path[] toTest = new Path[1];
toTest[0] = p;
@@ -180,14 +201,18 @@
}
}
- private void testDetectorOnAll(Detector detector, Path[] toTest, int numThreads,
- int numIterations, int randomlyResizeSAXPool) {
+ private void testDetectorOnAll(
+ Detector detector,
+ Path[] toTest,
+ int numThreads,
+ int numIterations,
+ int randomlyResizeSAXPool) {
Map<Path, MediaType> truth = getBaselineDetection(detector, toTest);
- //if all files caused an exception
+ // if all files caused an exception
if (truth.size() == 0) {
return;
}
- //only those that parsed without exception
+ // only those that parsed without exception
Path[] testFiles = new Path[truth.size()];
int j = 0;
for (Path testFile : truth.keySet()) {
@@ -196,7 +221,13 @@
int actualThreadCount = numThreads + Math.max(randomlyResizeSAXPool, 0);
ExecutorService ex = Executors.newFixedThreadPool(actualThreadCount);
try {
- _testDetectorOnAll(detector, testFiles, numThreads, numIterations, truth, ex,
+ _testDetectorOnAll(
+ detector,
+ testFiles,
+ numThreads,
+ numIterations,
+ truth,
+ ex,
randomlyResizeSAXPool);
} finally {
ex.shutdown();
@@ -204,27 +235,32 @@
}
}
- private void _testDetectorOnAll(Detector detector, Path[] testFiles, int numThreads,
- int numIterations, Map<Path, MediaType> truth,
- ExecutorService ex, int randomlyResizeSAXPool) {
+ private void _testDetectorOnAll(
+ Detector detector,
+ Path[] testFiles,
+ int numThreads,
+ int numIterations,
+ Map<Path, MediaType> truth,
+ ExecutorService ex,
+ int randomlyResizeSAXPool) {
ExecutorCompletionService<Integer> executorCompletionService =
new ExecutorCompletionService<>(ex);
executorCompletionService.submit(new SAXPoolResizer(randomlyResizeSAXPool));
for (int i = 0; i < numThreads; i++) {
- executorCompletionService
- .submit(new TikaDetectorRunner(detector, numIterations, testFiles, truth));
+ executorCompletionService.submit(
+ new TikaDetectorRunner(detector, numIterations, testFiles, truth));
}
int completed = 0;
while (completed < numThreads) {
- //TODO: add a maximum timeout threshold
+ // TODO: add a maximum timeout threshold
Future<Integer> future = null;
try {
future = executorCompletionService.poll(1000, TimeUnit.MILLISECONDS);
if (future != null) {
- future.get();//trigger exceptions from thread
+ future.get(); // trigger exceptions from thread
completed++;
}
} catch (InterruptedException | ExecutionException e) {
@@ -236,21 +272,23 @@
}
/**
- * Test each file, one at a time in multiple threads.
- * This was required to test TIKA-2519 in a reasonable
- * amount of time. This forced the parser to use the
- * same underlying memory structures because it was the same file.
- * This is stricter than I think our agreement with clients is
- * because this run tests on literally the same file and
- * not a copy of the file per thread. Let's leave this as is
- * unless there's a good reason to create a separate copy per thread.
+ * Test each file, one at a time in multiple threads. This was required to test TIKA-2519 in a
+ * reasonable amount of time. This forced the parser to use the same underlying memory
+ * structures because it was the same file. This is stricter than I think our agreement with
+ * clients is because this run tests on literally the same file and not a copy of the file per
+ * thread. Let's leave this as is unless there's a good reason to create a separate copy per
+ * thread.
*
- * @param files files to test, one at a time
- * @param numThreads number of threads to use
+ * @param files files to test, one at a time
+ * @param numThreads number of threads to use
* @param numIterations number of iterations per thread
*/
- protected void testEach(Parser parser, Path[] files, ParseContext[] parseContext,
- int numThreads, int numIterations) {
+ protected void testEach(
+ Parser parser,
+ Path[] files,
+ ParseContext[] parseContext,
+ int numThreads,
+ int numIterations) {
for (Path p : files) {
Path[] toTest = new Path[1];
toTest[0] = p;
@@ -259,27 +297,29 @@
}
/**
- * This tests all files together. Each parser randomly selects
- * a file from the array. Two parsers could wind up parsing the
- * same file at the same time. Good.
- * <p>
- * In the current implementation, this gets ground truth only
- * from files that do not throw exceptions. This will ignore
- * files that cause exceptions.
+ * This tests all files together. Each parser randomly selects a file from the array. Two
+ * parsers could wind up parsing the same file at the same time. Good.
*
- * @param files files to parse
- * @param numThreads number of parser threads
+ * <p>In the current implementation, this gets ground truth only from files that do not throw
+ * exceptions. This will ignore files that cause exceptions.
+ *
+ * @param files files to parse
+ * @param numThreads number of parser threads
* @param numIterations number of iterations per parser
*/
- protected void testAll(Parser parser, Path[] files, ParseContext[] parseContext, int numThreads,
- int numIterations) {
+ protected void testAll(
+ Parser parser,
+ Path[] files,
+ ParseContext[] parseContext,
+ int numThreads,
+ int numIterations) {
Map<Path, Extract> truth = getBaseline(parser, files, parseContext[0]);
- //if all files caused an exception
+ // if all files caused an exception
if (truth.size() == 0) {
- //return;
+ // return;
}
- //only those that parsed without exception
+ // only those that parsed without exception
Path[] testFiles = new Path[truth.size()];
int j = 0;
for (Path testFile : truth.keySet()) {
@@ -295,29 +335,33 @@
}
}
- private void _testAll(Parser parser, Path[] testFiles, ParseContext[] parseContext,
- int numThreads, int numIterations, Map<Path, Extract> truth,
- ExecutorService ex) {
+ private void _testAll(
+ Parser parser,
+ Path[] testFiles,
+ ParseContext[] parseContext,
+ int numThreads,
+ int numIterations,
+ Map<Path, Extract> truth,
+ ExecutorService ex) {
ExecutorCompletionService<Integer> executorCompletionService =
new ExecutorCompletionService<>(ex);
- //use the same parser in all threads
+ // use the same parser in all threads
for (int i = 0; i < numThreads; i++) {
- executorCompletionService
- .submit(new TikaRunner(parser, parseContext[i], numIterations, testFiles,
- truth));
+ executorCompletionService.submit(
+ new TikaRunner(parser, parseContext[i], numIterations, testFiles, truth));
}
int completed = 0;
while (completed < numThreads) {
- //TODO: add a maximum timeout threshold
+ // TODO: add a maximum timeout threshold
Future<Integer> future = null;
try {
future = executorCompletionService.poll(1000, TimeUnit.MILLISECONDS);
if (future != null) {
- future.get();//trigger exceptions from thread
+ future.get(); // trigger exceptions from thread
completed++;
}
} catch (InterruptedException | ExecutionException e) {
@@ -326,7 +370,7 @@
}
}
- //TODO: make this return something useful besides an integer
+ // TODO: make this return something useful besides an integer
private static class TikaRunner implements Callable<Integer> {
private static final AtomicInteger threadCount = new AtomicInteger(0);
private final Parser parser;
@@ -337,8 +381,12 @@
private final Random random = new Random();
private final int threadNumber;
- private TikaRunner(Parser parser, ParseContext parseContext, int iterations, Path[] files,
- Map<Path, Extract> truth) {
+ private TikaRunner(
+ Parser parser,
+ ParseContext parseContext,
+ int iterations,
+ Path[] files,
+ Map<Path, Extract> truth) {
this.parser = parser;
this.iterations = iterations;
this.files = files;
@@ -358,8 +406,8 @@
metadataList = getRecursiveMetadata(is, parser, new ParseContext());
success = true;
} catch (Exception e) {
- //swallow
- //throw new RuntimeException(testFile + " triggered this exception", e);
+ // swallow
+ // throw new RuntimeException(testFile + " triggered this exception", e);
}
if (success) {
assertExtractEquals(truth.get(testFile), new Extract(metadataList));
@@ -367,7 +415,6 @@
}
return 1;
}
-
}
private static class Extract {
@@ -411,8 +458,8 @@
private final Map<Path, MediaType> truth;
private final Random random = new Random();
- private TikaDetectorRunner(Detector detector, int iterations, Path[] files,
- Map<Path, MediaType> truth) {
+ private TikaDetectorRunner(
+ Detector detector, int iterations, Path[] files, Map<Path, MediaType> truth) {
this.detector = detector;
this.iterations = iterations;
this.files = files;
@@ -427,12 +474,11 @@
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(testFile, metadata)) {
MediaType mediaType = detector.detect(tis, metadata);
- assertEquals(truth.get(testFile), mediaType,
- "failed on: " + testFile.getFileName());
+ assertEquals(
+ truth.get(testFile), mediaType, "failed on: " + testFile.getFileName());
}
}
return 1;
}
-
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java b/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java
index 1a6d454..e7d88f1 100644
--- a/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java
+++ b/tika-core/src/test/java/org/apache/tika/ResourceLoggingClassLoader.java
@@ -26,10 +26,8 @@
import java.util.Map;
/**
- * A wrapper around a {@link ClassLoader} that logs all
- * the Resources loaded through it.
- * Used to check that a specific ClassLoader was used
- * when unit testing
+ * A wrapper around a {@link ClassLoader} that logs all the Resources loaded through it. Used to
+ * check that a specific ClassLoader was used when unit testing
*/
public class ResourceLoggingClassLoader extends ClassLoader {
private final Map<String, List<URL>> loadedResources = new HashMap<>();
diff --git a/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java b/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java
index 05fdb53..c31c5cd 100644
--- a/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java
+++ b/tika-core/src/test/java/org/apache/tika/TestRereadableInputStream.java
@@ -25,12 +25,10 @@
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
-
+import org.apache.tika.utils.RereadableInputStream;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
-import org.apache.tika.utils.RereadableInputStream;
-
public class TestRereadableInputStream {
private final int DEFAULT_TEST_SIZE = 3;
@@ -48,54 +46,60 @@
// This size of data exactly equals memory threshold
private final int TEST_SIZE_MAX = MEMORY_THRESHOLD;
- @TempDir
- private Path tempDir;
+ @TempDir private Path tempDir;
@Test
public void testInMemory() throws IOException {
readEntireStream((TEST_SIZE_MEMORY));
}
-// @Test
-// public void testInFile() throws IOException {
-// readData(TEST_SIZE_FILE);
-// }
-//
-// @Test
-// public void testMemoryThreshold() throws IOException {
-// readData(TEST_SIZE_MAX);
-// }
-//
-// @Test
-// public void testInMemory2() throws IOException {
-// readData2((TEST_SIZE_MEMORY));
-// }
-//
-// @Test
-// public void testInFile2() throws IOException {
-// readData2(TEST_SIZE_FILE);
-// }
+ // @Test
+ // public void testInFile() throws IOException {
+ // readData(TEST_SIZE_FILE);
+ // }
+ //
+ // @Test
+ // public void testMemoryThreshold() throws IOException {
+ // readData(TEST_SIZE_MAX);
+ // }
+ //
+ // @Test
+ // public void testInMemory2() throws IOException {
+ // readData2((TEST_SIZE_MEMORY));
+ // }
+ //
+ // @Test
+ // public void testInFile2() throws IOException {
+ // readData2(TEST_SIZE_FILE);
+ // }
@Test
public void testMemoryThreshold2() throws IOException {
readPartialStream(TEST_SIZE_MAX);
}
- /**
- * Read entire stream of various sizes
- */
+ /** Read entire stream of various sizes */
private void readEntireStream(int testSize) throws IOException {
InputStream is = createTestInputStream(testSize);
try (RereadableInputStream ris = new RereadableInputStream(is, MEMORY_THRESHOLD, true)) {
for (int pass = 0; pass < NUM_PASSES; pass++) {
for (int byteNum = 0; byteNum < testSize; byteNum++) {
int byteRead = ris.read();
- assertEquals(byteNum, byteRead,
- "Pass = " + pass + ", byte num should be " + byteNum + " but is " +
- byteRead + ".");
+ assertEquals(
+ byteNum,
+ byteRead,
+ "Pass = "
+ + pass
+ + ", byte num should be "
+ + byteNum
+ + " but is "
+ + byteRead
+ + ".");
}
int eof = ris.read();
- assertEquals(-1, eof,
+ assertEquals(
+ -1,
+ eof,
"Pass = " + pass + ", byte num should be " + -1 + " but is " + eof + ".");
ris.rewind();
}
@@ -103,8 +107,8 @@
}
/**
- * Read increasingly more of the stream, but not all, with each pass before rewinding to
- * make sure we pick up at the correct point
+ * Read increasingly more of the stream, but not all, with each pass before rewinding to make
+ * sure we pick up at the correct point
*/
private void readPartialStream(int testSize) throws IOException {
InputStream is = createTestInputStream(20);
@@ -114,8 +118,16 @@
for (int pass = 0; pass < NUM_PASSES; pass++) {
for (int byteNum = 0; byteNum < iterations; byteNum++) {
int byteRead = ris.read();
- assertEquals(byteNum, byteRead,
- "Pass = " + pass + ", byte num should be " + byteNum + " but is " + byteRead + ".");
+ assertEquals(
+ byteNum,
+ byteRead,
+ "Pass = "
+ + pass
+ + ", byte num should be "
+ + byteNum
+ + " but is "
+ + byteRead
+ + ".");
}
ris.rewind();
iterations++;
@@ -123,20 +135,21 @@
}
}
-
@Test
public void testRewind() throws IOException {
InputStream is = createTestInputStream(DEFAULT_TEST_SIZE);
try (RereadableInputStream ris = new RereadableInputStream(is, MEMORY_THRESHOLD, true)) {
- ris.rewind(); // rewind before we've done anything
+ ris.rewind(); // rewind before we've done anything
for (int byteNum = 0; byteNum < 1; byteNum++) {
int byteRead = ris.read();
- assertEquals(byteNum, byteRead, "Byte num should be " + byteNum + " but is " + byteRead + ".");
+ assertEquals(
+ byteNum,
+ byteRead,
+ "Byte num should be " + byteNum + " but is " + byteRead + ".");
}
}
}
-
private TestInputStream createTestInputStream(int testSize) throws IOException {
return new TestInputStream(
new BufferedInputStream(Files.newInputStream(createTestFile(testSize))));
@@ -175,15 +188,14 @@
TestInputStream tis = createTestInputStream(DEFAULT_TEST_SIZE);
RereadableInputStream ris = new RereadableInputStream(tis, DEFAULT_TEST_SIZE);
ris.close();
- assertThrows(IOException.class, () -> {
- ris.read();
- });
+ assertThrows(
+ IOException.class,
+ () -> {
+ ris.read();
+ });
}
-
- /**
- * Adds isClosed() to a BufferedInputStream.
- */
+ /** Adds isClosed() to a BufferedInputStream. */
static class TestInputStream extends BufferedInputStream {
private boolean closed;
diff --git a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
index f52482c..6924029 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaDetectionTest.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika;
-
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.junit.jupiter.api.Test;
@@ -26,9 +25,10 @@
private final Tika tika = new Tika();
/**
- * This test checks that Tika correctly detects all the file extensions
- * defined in the mime.types file (revision 819245) of the Apache HTTP
- * Server project. The tests were created with:
+ * This test checks that Tika correctly detects all the file extensions defined in the
+ * mime.types file (revision 819245) of the Apache HTTP Server project. The tests were created
+ * with:
+ *
* <pre>
* cat docs/conf/mime.types | grep -v '#' | perl -lne '/\S\s+\S/ and do {
* my ($type, @ext) = split /\s+/;
@@ -92,11 +92,11 @@
assertEquals("application/oebps-package+xml", tika.detect("x.opf"));
assertEquals("application/ogg", tika.detect("x.ogx"));
// Differ from httpd - We have subtypes they lack
- //assertEquals("application/onenote", tika.detect("x.one"));
- //assertEquals("application/onenote", tika.detect("x.onetoc"));
- //assertEquals("application/onenote", tika.detect("x.onetoc2"));
- //assertEquals("application/onenote", tika.detect("x.onetmp"));
- //assertEquals("application/onenote", tika.detect("x.onepkg"));
+ // assertEquals("application/onenote", tika.detect("x.one"));
+ // assertEquals("application/onenote", tika.detect("x.onetoc"));
+ // assertEquals("application/onenote", tika.detect("x.onetoc2"));
+ // assertEquals("application/onenote", tika.detect("x.onetmp"));
+ // assertEquals("application/onenote", tika.detect("x.onepkg"));
assertEquals("application/patch-ops-error+xml", tika.detect("x.xer"));
assertEquals("application/pdf", tika.detect("x.pdf"));
assertEquals("application/pgp-encrypted", tika.detect("x.pgp"));
@@ -154,7 +154,8 @@
assertEquals("application/vnd.acucobol", tika.detect("x.acu"));
assertEquals("application/vnd.acucorp", tika.detect("x.atc"));
assertEquals("application/vnd.acucorp", tika.detect("x.acutc"));
- assertEquals("application/vnd.adobe.air-application-installer-package+zip",
+ assertEquals(
+ "application/vnd.adobe.air-application-installer-package+zip",
tika.detect("x.air"));
assertEquals("application/vnd.adobe.xdp+xml", tika.detect("x.xdp"));
assertEquals("application/vnd.adobe.xfdf", tika.detect("x.xfdf"));
@@ -164,14 +165,14 @@
assertEquals("application/vnd.americandynamics.acc", tika.detect("x.acc"));
assertEquals("application/vnd.amiga.ami", tika.detect("x.ami"));
assertEquals("application/vnd.android.package-archive", tika.detect("x.apk"));
- assertEquals("application/vnd.anser-web-certificate-issue-initiation",
- tika.detect("x.cii"));
+ assertEquals(
+ "application/vnd.anser-web-certificate-issue-initiation", tika.detect("x.cii"));
assertEquals("application/vnd.anser-web-funds-transfer-initiation", tika.detect("x.fti"));
assertEquals("application/vnd.antix.game-component", tika.detect("x.atx"));
assertEquals("application/vnd.apple.installer+xml", tika.detect("x.mpkg"));
assertEquals("application/vnd.arastra.swi", tika.detect("x.swi"));
// Differ from httpd - Adobe After Effects is a much more common user of .AEP these days
- //assertEquals("application/vnd.audiograph", tika.detect("x.aep"));
+ // assertEquals("application/vnd.audiograph", tika.detect("x.aep"));
assertEquals("application/vnd.blueice.multipass", tika.detect("x.mpm"));
assertEquals("application/vnd.bmi", tika.detect("x.bmi"));
assertEquals("application/vnd.businessobjects", tika.detect("x.rep"));
@@ -309,8 +310,8 @@
assertEquals("application/vnd.koan", tika.detect("x.skm"));
assertEquals("application/vnd.kodak-descriptor", tika.detect("x.sse"));
assertEquals("application/vnd.llamagraphics.life-balance.desktop", tika.detect("x.lbd"));
- assertEquals("application/vnd.llamagraphics.life-balance.exchange+xml",
- tika.detect("x.lbe"));
+ assertEquals(
+ "application/vnd.llamagraphics.life-balance.exchange+xml", tika.detect("x.lbe"));
assertEquals("application/vnd.lotus-1-2-3", tika.detect("x.123"));
assertEquals("application/vnd.lotus-approach", tika.detect("x.apr"));
assertEquals("application/vnd.lotus-freelance", tika.detect("x.pre"));
@@ -346,8 +347,8 @@
assertEquals("application/vnd.ms-excel", tika.detect("x.xlt"));
assertEquals("application/vnd.ms-excel", tika.detect("x.xlw"));
assertEquals("application/vnd.ms-excel.addin.macroenabled.12", tika.detect("x.xlam"));
- assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12",
- tika.detect("x.xlsb"));
+ assertEquals(
+ "application/vnd.ms-excel.sheet.binary.macroenabled.12", tika.detect("x.xlsb"));
assertEquals("application/vnd.ms-excel.sheet.macroenabled.12", tika.detect("x.xlsm"));
assertEquals("application/vnd.ms-excel.template.macroenabled.12", tika.detect("x.xltm"));
assertEquals("application/vnd.ms-fontobject", tika.detect("x.eot"));
@@ -360,13 +361,14 @@
assertEquals("application/vnd.ms-powerpoint", tika.detect("x.pps"));
assertEquals("application/vnd.ms-powerpoint", tika.detect("x.pot"));
assertEquals("application/vnd.ms-powerpoint.addin.macroenabled.12", tika.detect("x.ppam"));
- assertEquals("application/vnd.ms-powerpoint.presentation.macroenabled.12",
+ assertEquals(
+ "application/vnd.ms-powerpoint.presentation.macroenabled.12",
tika.detect("x.pptm"));
assertEquals("application/vnd.ms-powerpoint.slide.macroenabled.12", tika.detect("x.sldm"));
- assertEquals("application/vnd.ms-powerpoint.slideshow.macroenabled.12",
- tika.detect("x.ppsm"));
- assertEquals("application/vnd.ms-powerpoint.template.macroenabled.12",
- tika.detect("x.potm"));
+ assertEquals(
+ "application/vnd.ms-powerpoint.slideshow.macroenabled.12", tika.detect("x.ppsm"));
+ assertEquals(
+ "application/vnd.ms-powerpoint.template.macroenabled.12", tika.detect("x.potm"));
assertEquals("application/vnd.ms-project", tika.detect("x.mpp"));
assertEquals("application/vnd.ms-project", tika.detect("x.mpt"));
assertEquals("application/vnd.ms-word.document.macroenabled.12", tika.detect("x.docm"));
@@ -394,7 +396,7 @@
assertEquals("application/vnd.oasis.opendocument.chart", tika.detect("x.odc"));
assertEquals("application/vnd.oasis.opendocument.chart-template", tika.detect("x.otc"));
// Differ from httpd - Mimetype embedded in file is .base not .database
- //assertEquals("application/vnd.oasis.opendocument.database", tika.detect("x.odb"));
+ // assertEquals("application/vnd.oasis.opendocument.database", tika.detect("x.odb"));
assertEquals("application/vnd.oasis.opendocument.formula", tika.detect("x.odf"));
assertEquals("application/vnd.oasis.opendocument.formula-template", tika.detect("x.odft"));
assertEquals("application/vnd.oasis.opendocument.graphics", tika.detect("x.odg"));
@@ -402,11 +404,11 @@
assertEquals("application/vnd.oasis.opendocument.image", tika.detect("x.odi"));
assertEquals("application/vnd.oasis.opendocument.image-template", tika.detect("x.oti"));
assertEquals("application/vnd.oasis.opendocument.presentation", tika.detect("x.odp"));
- assertEquals("application/vnd.oasis.opendocument.presentation-template",
- tika.detect("x.otp"));
+ assertEquals(
+ "application/vnd.oasis.opendocument.presentation-template", tika.detect("x.otp"));
assertEquals("application/vnd.oasis.opendocument.spreadsheet", tika.detect("x.ods"));
- assertEquals("application/vnd.oasis.opendocument.spreadsheet-template",
- tika.detect("x.ots"));
+ assertEquals(
+ "application/vnd.oasis.opendocument.spreadsheet-template", tika.detect("x.ots"));
assertEquals("application/vnd.oasis.opendocument.text", tika.detect("x.odt"));
assertEquals("application/vnd.oasis.opendocument.text-master", tika.detect("x.otm"));
assertEquals("application/vnd.oasis.opendocument.text-template", tika.detect("x.ott"));
@@ -414,21 +416,29 @@
assertEquals("application/vnd.olpc-sugar", tika.detect("x.xo"));
assertEquals("application/vnd.oma.dd2+xml", tika.detect("x.dd2"));
assertEquals("application/vnd.openofficeorg.extension", tika.detect("x.oxt"));
- assertEquals("application/vnd.openxmlformats-officedocument.presentationml.presentation",
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.presentationml.presentation",
tika.detect("x.pptx"));
- assertEquals("application/vnd.openxmlformats-officedocument.presentationml.slide",
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.presentationml.slide",
tika.detect("x.sldx"));
- assertEquals("application/vnd.openxmlformats-officedocument.presentationml.slideshow",
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
tika.detect("x.ppsx"));
- assertEquals("application/vnd.openxmlformats-officedocument.presentationml.template",
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.presentationml.template",
tika.detect("x.potx"));
- assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
tika.detect("x.xlsx"));
- assertEquals("application/vnd.openxmlformats-officedocument.spreadsheetml.template",
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.template",
tika.detect("x.xltx"));
- assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
tika.detect("x.docx"));
- assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.template",
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
tika.detect("x.dotx"));
assertEquals("application/vnd.osgi.dp", tika.detect("x.dp"));
assertEquals("chemical/x-pdb", tika.detect("x.pdb"));
@@ -582,8 +592,8 @@
assertEquals("application/x-font-type1", tika.detect("x.pfa"));
assertEquals("application/x-font-type1", tika.detect("x.pfb"));
// TODO Get these fixed upstream too
- //assertEquals("application/x-font-type1", tika.detect("x.pfm"));
- //assertEquals("application/x-font-type1", tika.detect("x.afm"));
+ // assertEquals("application/x-font-type1", tika.detect("x.pfm"));
+ // assertEquals("application/x-font-type1", tika.detect("x.afm"));
assertEquals("application/x-font-printer-metric", tika.detect("x.pfm"));
assertEquals("application/x-font-adobe-metric", tika.detect("x.afm"));
assertEquals("application/x-futuresplash", tika.detect("x.spl"));
@@ -606,14 +616,14 @@
assertEquals("application/x-msdownload", tika.detect("x.dll"));
assertEquals("application/x-msdownload", tika.detect("x.com"));
// Differ from httpd - BAT is different from normal windows executables
- //assertEquals("application/x-msdownload", tika.detect("x.bat"));
+ // assertEquals("application/x-msdownload", tika.detect("x.bat"));
// Differ from httpd - MSI is different from normal windows executables
- //assertEquals("application/x-msdownload", tika.detect("x.msi"));
+ // assertEquals("application/x-msdownload", tika.detect("x.msi"));
assertEquals("application/x-msmediaview", tika.detect("x.mvb"));
assertEquals("application/x-msmediaview", tika.detect("x.m13"));
assertEquals("application/x-msmediaview", tika.detect("x.m14"));
// Differ from httpd - wmf was properly registered in RFC 7903
- //assertEquals("application/x-msmetafile", tika.detect("x.wmf"));
+ // assertEquals("application/x-msmetafile", tika.detect("x.wmf"));
assertEquals("application/x-msmoney", tika.detect("x.mny"));
assertEquals("application/x-mspublisher", tika.detect("x.pub"));
assertEquals("application/x-msschedule", tika.detect("x.scd"));
@@ -644,8 +654,8 @@
assertEquals("application/x-ustar", tika.detect("x.ustar"));
assertEquals("application/x-wais-source", tika.detect("x.src"));
// Differ from httpd - use a common parent for CA and User certs
- //assertEquals("application/x-x509-ca-cert", tika.detect("x.der"));
- //assertEquals("application/x-x509-ca-cert", tika.detect("x.crt"));
+ // assertEquals("application/x-x509-ca-cert", tika.detect("x.der"));
+ // assertEquals("application/x-x509-ca-cert", tika.detect("x.crt"));
assertEquals("application/x-xfig", tika.detect("x.fig"));
assertEquals("application/x-xpinstall", tika.detect("x.xpi"));
assertEquals("application/xenc+xml", tika.detect("x.xenc"));
@@ -678,9 +688,9 @@
assertEquals("audio/mpeg", tika.detect("x.m3a"));
assertEquals("audio/ogg", tika.detect("x.oga"));
// Differ from httpd - Use a dedicated mimetype of Vorbis
- //assertEquals("audio/ogg", tika.detect("x.ogg"));
+ // assertEquals("audio/ogg", tika.detect("x.ogg"));
// Differ from httpd - Speex more commonly uses its own mimetype
- //assertEquals("audio/ogg", tika.detect("x.spx"));
+ // assertEquals("audio/ogg", tika.detect("x.spx"));
assertEquals("audio/vnd.digital-winds", tika.detect("x.eol"));
assertEquals("audio/vnd.dts", tika.detect("x.dts"));
assertEquals("audio/vnd.dts.hd", tika.detect("x.dtshd"));
@@ -700,7 +710,7 @@
assertEquals("audio/x-pn-realaudio", tika.detect("x.ra"));
assertEquals("audio/x-pn-realaudio-plugin", tika.detect("x.rmp"));
// Differ from httpd - wav was properly registered in RFC 2361
- //assertEquals("audio/x-wav", tika.detect("x.wav"));
+ // assertEquals("audio/x-wav", tika.detect("x.wav"));
assertEquals("chemical/x-cdx", tika.detect("x.cdx"));
assertEquals("chemical/x-cif", tika.detect("x.cif"));
assertEquals("chemical/x-cmdf", tika.detect("x.cmdf"));
@@ -708,7 +718,7 @@
assertEquals("chemical/x-csml", tika.detect("x.csml"));
assertEquals("chemical/x-xyz", tika.detect("x.xyz"));
// Differ from httpd - bmp was properly registered in RFC 7903
- //assertEquals("image/x-ms-bmp", tika.detect("x.bmp"));
+ // assertEquals("image/x-ms-bmp", tika.detect("x.bmp"));
assertEquals("image/cgm", tika.detect("x.cgm"));
assertEquals("image/g3fax", tika.detect("x.g3"));
assertEquals("image/gif", tika.detect("x.gif"));
@@ -747,10 +757,10 @@
assertEquals("image/x-freehand", tika.detect("x.fh7"));
// Differ from httpd - An official mimetype has subsequently been issued
// favicon.ico +friends should now be image/vnd.microsoft.icon
- //assertEquals("image/x-icon", tika.detect("x.ico"));
+ // assertEquals("image/x-icon", tika.detect("x.ico"));
// Differ from httpd - An official mimetype has subsequently been issued
// pcx PiCture eXchange files should now be image/vnd.zbrush.pcx
- //assertEquals("image/x-pcx", tika.detect("x.pcx"));
+ // assertEquals("image/x-pcx", tika.detect("x.pcx"));
assertEquals("image/x-pict", tika.detect("x.pic"));
assertEquals("image/x-pict", tika.detect("x.pct"));
assertEquals("image/x-portable-anymap", tika.detect("x.pnm"));
@@ -784,7 +794,7 @@
assertEquals("text/plain", tika.detect("x.txt"));
assertEquals("text/plain", tika.detect("x.text"));
// Differ from httpd - Use a dedicated mimetype for Config files
- //assertEquals("text/plain", tika.detect("x.conf"));
+ // assertEquals("text/plain", tika.detect("x.conf"));
assertEquals("text/plain", tika.detect("x.def"));
assertEquals("text/plain", tika.detect("x.list"));
assertEquals("text/x-log", tika.detect("x.log"));
@@ -877,5 +887,4 @@
assertEquals("application/x-grib", tika.detect("x.grb2"));
assertEquals("application/dif+xml", tika.detect("x.dif"));
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/TikaIT.java b/tika-core/src/test/java/org/apache/tika/TikaIT.java
index 1604818..db6f99c 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaIT.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaIT.java
@@ -30,5 +30,4 @@
assertTrue(
version.matches("Apache Tika \\d+\\.\\d+\\.\\d+(-(?:ALPHA|BETA))?(?:-SNAPSHOT)?"));
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index a0a6377..275ed8d 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -39,10 +39,7 @@
import java.util.HashSet;
import java.util.List;
import java.util.Set;
-
import org.apache.commons.io.IOUtils;
-import org.xml.sax.ContentHandler;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.FilenameUtils;
@@ -59,10 +56,9 @@
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
import org.apache.tika.sax.ToXMLContentHandler;
+import org.xml.sax.ContentHandler;
-/**
- * Parent class of Tika tests
- */
+/** Parent class of Tika tests */
public abstract class TikaTest {
protected static TikaConfig DEFAULT_TIKA_CONFIG;
@@ -75,6 +71,7 @@
throw new RuntimeException(e);
}
}
+
public static void assertContainsCount(String needle, String haystack, int targetCount) {
int i = haystack.indexOf(needle);
int count = 0;
@@ -82,8 +79,8 @@
count++;
i = haystack.indexOf(needle, i + 1);
}
- assertEquals(targetCount, count,
- "found " + count + " but should have found: " + targetCount);
+ assertEquals(
+ targetCount, count, "found " + count + " but should have found: " + targetCount);
}
public static void assertContains(String needle, String haystack) {
@@ -102,9 +99,10 @@
assertFalse(haystack.contains(needle), needle + " unexpectedly found in:\n" + haystack);
}
- public static void assertMetadataListEquals(List<Metadata> metadataListA,
- List<Metadata> metadataListB,
- Set<String> fieldsToIgnore) {
+ public static void assertMetadataListEquals(
+ List<Metadata> metadataListA,
+ List<Metadata> metadataListB,
+ Set<String> fieldsToIgnore) {
assertEquals(metadataListA.size(), metadataListB.size(), "different sizes");
for (int i = 0; i < metadataListA.size(); i++) {
Metadata mA = metadataListA.get(i);
@@ -115,8 +113,10 @@
continue;
}
mAFields.add(n);
- assertArrayEquals(mA.getValues(n), mB.getValues(n), "problem with " + n +
- " in metadata index=" + i);
+ assertArrayEquals(
+ mA.getValues(n),
+ mB.getValues(n),
+ "problem with " + n + " in metadata index=" + i);
}
Set<String> mBFields = new HashSet<>();
for (String n : mB.names()) {
@@ -130,14 +130,13 @@
}
/**
- * Test that in at least one item in metadataList, all keys and values
- * in minExpected are contained.
- * <p>
- * The values in minExpected are tested for whether they are contained
- * within a value in the target. If minExpected=&dquot;text/vbasic&dquot; and
- * what was actually found in the target within metadatalist is
- * &dquot;text/vbasic; charset=windows-1252&dquot;,
- * that is counted as a hit.
+ * Test that in at least one item in metadataList, all keys and values in minExpected are
+ * contained.
+ *
+ * <p>The values in minExpected are tested for whether they are contained within a value in the
+ * target. If minExpected=&quot;text/vbasic&quot; and what was actually found in the target
+ * within metadatalist is &quot;text/vbasic; charset=windows-1252&quot;, that is counted as a
+ * hit.
*
* @param minExpected
* @param metadataList
@@ -160,11 +159,11 @@
}
}
if (foundPropertyCount == minExpected.names().length) {
- //found everything!
+ // found everything!
return;
}
}
- //TODO: figure out how to have more informative error message
+ // TODO: figure out how to have more informative error message
fail("Couldn't find everything within a single metadata item");
}
@@ -221,8 +220,8 @@
*
* @param name name of the desired resource
* @return A {@link java.net.URI} object or null
- * @throws URISyntaxException if this URL is not formatted strictly according to
- * RFC2396 and cannot be converted to a URI.
+ * @throws URISyntaxException if this URL is not formatted strictly according to RFC2396 and
+ * cannot be converted to a URI.
*/
public URI getResourceAsUri(String name) throws URISyntaxException {
URL url = getResourceAsUrl(name);
@@ -233,13 +232,12 @@
}
/**
- * This method will give you back the filename incl. the absolute path name
- * to the resource. If the resource does not exist it will give you back the
- * resource name incl. the path.
+ * This method will give you back the filename incl. the absolute path name to the resource. If
+ * the resource does not exist it will give you back the resource name incl. the path.
*
* @param name The named resource to search for.
- * @return an absolute path incl. the name which is in the same directory as
- * the the class you've called it from.
+ * @return an absolute path incl. the name which is in the same directory as the class
+ * you've called it from.
*/
public File getResourceAsFile(String name) throws URISyntaxException {
URI uri = getResourceAsUri(name);
@@ -267,7 +265,10 @@
protected XMLResult getXML(String filePath, Parser parser, ParseContext context)
throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), parser, new Metadata(),
+ return getXML(
+ getResourceAsStream("/test-documents/" + filePath),
+ parser,
+ new Metadata(),
context);
}
@@ -279,22 +280,28 @@
return getXML(filePath, AUTO_DETECT_PARSER, parseContext);
}
- protected XMLResult getXML(String filePath, Parser parser, Metadata metadata,
- ParseContext parseContext)
+ protected XMLResult getXML(
+ String filePath, Parser parser, Metadata metadata, ParseContext parseContext)
throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), parser,
- metadata, parseContext);
+ return getXML(
+ getResourceAsStream("/test-documents/" + filePath), parser, metadata, parseContext);
}
protected XMLResult getXML(String filePath, Metadata metadata, ParseContext parseContext)
throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), AUTO_DETECT_PARSER,
- metadata, parseContext);
+ return getXML(
+ getResourceAsStream("/test-documents/" + filePath),
+ AUTO_DETECT_PARSER,
+ metadata,
+ parseContext);
}
protected XMLResult getXML(String filePath, Metadata metadata) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), AUTO_DETECT_PARSER,
- metadata, null);
+ return getXML(
+ getResourceAsStream("/test-documents/" + filePath),
+ AUTO_DETECT_PARSER,
+ metadata,
+ null);
}
protected XMLResult getXML(String filePath, Parser parser) throws Exception {
@@ -304,8 +311,11 @@
}
protected XMLResult getXML(String filePath) throws Exception {
- return getXML(getResourceAsStream("/test-documents/" + filePath), AUTO_DETECT_PARSER,
- new Metadata(), null);
+ return getXML(
+ getResourceAsStream("/test-documents/" + filePath),
+ AUTO_DETECT_PARSER,
+ new Metadata(),
+ null);
}
protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata)
@@ -313,8 +323,9 @@
return getXML(input, parser, metadata, null);
}
- protected XMLResult getXML(InputStream input, Parser parser, Metadata metadata,
- ParseContext context) throws Exception {
+ protected XMLResult getXML(
+ InputStream input, Parser parser, Metadata metadata, ParseContext context)
+ throws Exception {
if (context == null) {
context = new ParseContext();
}
@@ -334,12 +345,13 @@
protected List<Metadata> getRecursiveMetadata(String filePath, boolean suppressException)
throws Exception {
- return getRecursiveMetadata(filePath, new Metadata(), new ParseContext(),
- suppressException);
+ return getRecursiveMetadata(
+ filePath, new Metadata(), new ParseContext(), suppressException);
}
- protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext parseContext,
- boolean suppressException) throws Exception {
+ protected List<Metadata> getRecursiveMetadata(
+ String filePath, ParseContext parseContext, boolean suppressException)
+ throws Exception {
return getRecursiveMetadata(filePath, new Metadata(), parseContext, suppressException);
}
@@ -347,11 +359,14 @@
return getRecursiveMetadata(filePath, new ParseContext());
}
- protected List<Metadata> getRecursiveMetadata(String filePath,
- BasicContentHandlerFactory.HANDLER_TYPE handlerType)
- throws Exception {
- return getRecursiveMetadata(filePath, TikaTest.AUTO_DETECT_PARSER, new Metadata(),
- new ParseContext(), true,
+ protected List<Metadata> getRecursiveMetadata(
+ String filePath, BasicContentHandlerFactory.HANDLER_TYPE handlerType) throws Exception {
+ return getRecursiveMetadata(
+ filePath,
+ TikaTest.AUTO_DETECT_PARSER,
+ new Metadata(),
+ new ParseContext(),
+ true,
handlerType);
}
@@ -360,51 +375,59 @@
return getRecursiveMetadata(filePath, metadata, new ParseContext());
}
- protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata,
- ParseContext context) throws Exception {
+ protected List<Metadata> getRecursiveMetadata(
+ String filePath, Metadata metadata, ParseContext context) throws Exception {
return getRecursiveMetadata(filePath, metadata, context, false);
}
- protected List<Metadata> getRecursiveMetadata(String filePath, Metadata metadata,
- ParseContext context, boolean suppressException)
+ protected List<Metadata> getRecursiveMetadata(
+ String filePath, Metadata metadata, ParseContext context, boolean suppressException)
throws Exception {
- return getRecursiveMetadata(filePath, AUTO_DETECT_PARSER, metadata, context,
- suppressException);
+ return getRecursiveMetadata(
+ filePath, AUTO_DETECT_PARSER, metadata, context, suppressException);
}
- protected List<Metadata> getRecursiveMetadata(String filePath, Parser wrapped,
- Metadata metadata, ParseContext context,
- boolean suppressException) throws Exception {
+ protected List<Metadata> getRecursiveMetadata(
+ String filePath,
+ Parser wrapped,
+ Metadata metadata,
+ ParseContext context,
+ boolean suppressException)
+ throws Exception {
try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
return getRecursiveMetadata(is, wrapped, metadata, context, suppressException);
}
}
- protected List<Metadata> getRecursiveMetadata(String filePath, Parser wrapped,
- Metadata metadata, ParseContext context,
- boolean suppressException,
- BasicContentHandlerFactory.HANDLER_TYPE handlerType)
+ protected List<Metadata> getRecursiveMetadata(
+ String filePath,
+ Parser wrapped,
+ Metadata metadata,
+ ParseContext context,
+ boolean suppressException,
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType)
throws Exception {
try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
- return getRecursiveMetadata(is, wrapped, metadata, context, suppressException, handlerType);
+ return getRecursiveMetadata(
+ is, wrapped, metadata, context, suppressException, handlerType);
}
}
- protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context,
- boolean suppressException) throws Exception {
+ protected List<Metadata> getRecursiveMetadata(
+ Path path, ParseContext context, boolean suppressException) throws Exception {
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
- return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, metadata, context,
- suppressException);
+ return getRecursiveMetadata(
+ tis, AUTO_DETECT_PARSER, metadata, context, suppressException);
}
}
- protected List<Metadata> getRecursiveMetadata(Path path, Parser parser,
- boolean suppressException) throws Exception {
+ protected List<Metadata> getRecursiveMetadata(
+ Path path, Parser parser, boolean suppressException) throws Exception {
Metadata metadata = new Metadata();
try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
- return getRecursiveMetadata(tis, parser, metadata, new ParseContext(),
- suppressException);
+ return getRecursiveMetadata(
+ tis, parser, metadata, new ParseContext(), suppressException);
}
}
@@ -428,24 +451,30 @@
return getRecursiveMetadata(is, new Metadata(), new ParseContext(), suppressException);
}
- protected List<Metadata> getRecursiveMetadata(InputStream is, Parser parser,
- boolean suppressException) throws Exception {
- return getRecursiveMetadata(is, parser, new Metadata(), new ParseContext(),
- suppressException);
+ protected List<Metadata> getRecursiveMetadata(
+ InputStream is, Parser parser, boolean suppressException) throws Exception {
+ return getRecursiveMetadata(
+ is, parser, new Metadata(), new ParseContext(), suppressException);
}
- protected List<Metadata> getRecursiveMetadata(InputStream is, Metadata metadata,
- ParseContext context, boolean suppressException)
+ protected List<Metadata> getRecursiveMetadata(
+ InputStream is, Metadata metadata, ParseContext context, boolean suppressException)
throws Exception {
return getRecursiveMetadata(is, AUTO_DETECT_PARSER, metadata, context, suppressException);
}
- protected List<Metadata> getRecursiveMetadata(InputStream is, Parser p, Metadata metadata,
- ParseContext context, boolean suppressException)
+ protected List<Metadata> getRecursiveMetadata(
+ InputStream is,
+ Parser p,
+ Metadata metadata,
+ ParseContext context,
+ boolean suppressException)
throws Exception {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ RecursiveParserWrapperHandler handler =
+ new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
try {
wrapper.parse(is, handler, metadata, context);
} catch (Exception e) {
@@ -456,13 +485,17 @@
return handler.getMetadataList();
}
- protected List<Metadata> getRecursiveMetadata(InputStream is, Parser p, Metadata metadata,
- ParseContext context, boolean suppressException,
- BasicContentHandlerFactory.HANDLER_TYPE handlerType)
+ protected List<Metadata> getRecursiveMetadata(
+ InputStream is,
+ Parser p,
+ Metadata metadata,
+ ParseContext context,
+ boolean suppressException,
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType)
throws Exception {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p);
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(handlerType, -1));
+ RecursiveParserWrapperHandler handler =
+ new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(handlerType, -1));
try {
wrapper.parse(is, handler, metadata, context);
} catch (Exception e) {
@@ -477,8 +510,10 @@
throws Exception {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ RecursiveParserWrapperHandler handler =
+ new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
wrapper.parse(is, handler, new Metadata(), context);
}
@@ -487,21 +522,24 @@
protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap)
throws Exception {
- return getRecursiveMetadata(filePath, parserToWrap,
- BasicContentHandlerFactory.HANDLER_TYPE.XML);
+ return getRecursiveMetadata(
+ filePath, parserToWrap, BasicContentHandlerFactory.HANDLER_TYPE.XML);
}
- protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap,
- BasicContentHandlerFactory.HANDLER_TYPE
- handlerType)
+ protected List<Metadata> getRecursiveMetadata(
+ String filePath,
+ Parser parserToWrap,
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType)
throws Exception {
return getRecursiveMetadata(filePath, parserToWrap, handlerType, new ParseContext());
}
- protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap,
- BasicContentHandlerFactory.HANDLER_TYPE
- handlerType,
- ParseContext context) throws Exception {
+ protected List<Metadata> getRecursiveMetadata(
+ String filePath,
+ Parser parserToWrap,
+ BasicContentHandlerFactory.HANDLER_TYPE handlerType,
+ ParseContext context)
+ throws Exception {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap);
RecursiveParserWrapperHandler handler =
new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(handlerType, -1));
@@ -513,11 +551,13 @@
return handler.getMetadataList();
}
- protected List<Metadata> getRecursiveMetadata(String filePath, Parser parserToWrap,
- ParseContext parseContext) throws Exception {
+ protected List<Metadata> getRecursiveMetadata(
+ String filePath, Parser parserToWrap, ParseContext parseContext) throws Exception {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parserToWrap);
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ RecursiveParserWrapperHandler handler =
+ new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
try (InputStream is = getResourceAsStream("/test-documents/" + filePath)) {
wrapper.parse(is, handler, new Metadata(), parseContext);
@@ -546,16 +586,17 @@
return getText(filePath, AUTO_DETECT_PARSER, metadata, parseContext);
}
- protected String getText(String filePath, Parser parser, Metadata metadata,
- ParseContext parseContext) throws Exception {
- return getText(getResourceAsStream("/test-documents/" + filePath), parser, parseContext,
- metadata);
+ protected String getText(
+ String filePath, Parser parser, Metadata metadata, ParseContext parseContext)
+ throws Exception {
+ return getText(
+ getResourceAsStream("/test-documents/" + filePath), parser, parseContext, metadata);
}
/**
* Basic text extraction.
- * <p>
- * Tries to close input stream after processing.
+ *
+ * <p>Tries to close input stream after processing.
*/
public String getText(InputStream is, Parser parser, ParseContext context, Metadata metadata)
throws Exception {
@@ -595,8 +636,8 @@
}
public List<Path> getAllTestFiles() {
- //for now, just get main files
- //TODO: fix this to be recursive
+ // for now, just get main files
+ // TODO: fix this to be recursive
try {
File[] pathArray = Paths.get(getResourceAsUri("/test-documents")).toFile().listFiles();
List<Path> paths = new ArrayList<>();
@@ -619,9 +660,7 @@
}
}
- /**
- * Keeps track of media types and file names recursively.
- */
+ /** Keeps track of media types and file names recursively. */
public static class TrackingHandler implements EmbeddedResourceHandler {
private final Set<MediaType> skipTypes;
public List<String> filenames = new ArrayList<>();
@@ -645,9 +684,7 @@
}
}
- /**
- * Copies byte[] of embedded documents into a List.
- */
+ /** Copies byte[] of embedded documents into a List. */
public static class ByteCopyingHandler implements EmbeddedResourceHandler {
public List<byte[]> bytes = new ArrayList<>();
@@ -664,7 +701,7 @@
bytes.add(os.toByteArray());
stream.reset();
} catch (IOException e) {
- //swallow
+ // swallow
}
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java b/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java
index 9ff104c..f44b7c8 100644
--- a/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java
+++ b/tika-core/src/test/java/org/apache/tika/TypeDetectionBenchmark.java
@@ -21,7 +21,6 @@
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.Locale;
-
import org.apache.commons.io.IOUtils;
public class TypeDetectionBenchmark {
@@ -51,8 +50,12 @@
for (int i = 0; i < 1000; i++) {
tika.detect(new ByteArrayInputStream(content));
}
- System.out.printf(Locale.ROOT, "%6dns per Tika.detect(%s) = %s%n",
- System.currentTimeMillis() - start, file, type);
+ System.out.printf(
+ Locale.ROOT,
+ "%6dns per Tika.detect(%s) = %s%n",
+ System.currentTimeMillis() - start,
+ file,
+ type);
}
} else if (file.isDirectory()) {
for (File child : file.listFiles()) {
@@ -60,5 +63,4 @@
}
}
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
index 763cb43..459a1cd 100644
--- a/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/AbstractTikaConfigTest.java
@@ -21,17 +21,14 @@
import java.net.URL;
import java.nio.file.Path;
import java.nio.file.Paths;
-
-import org.junit.jupiter.api.AfterEach;
-
import org.apache.tika.TikaTest;
import org.apache.tika.parser.ParseContext;
+import org.junit.jupiter.api.AfterEach;
/**
- * Parent of Junit test classes for {@link TikaConfig}, including
- * Tika Core based ones, and ones in Tika Parsers that do things
- * that {@link TikaConfigTest} can't, do due to a need for the
- * full set of "real" classes of parsers / detectors
+ * Parent of Junit test classes for {@link TikaConfig}, including Tika Core based ones, and ones in
+ * Tika Parsers that do things that {@link TikaConfigTest} can't do, due to a need for the full set
+ * of "real" classes of parsers / detectors
*/
public abstract class AbstractTikaConfigTest extends TikaTest {
protected static ParseContext context = new ParseContext();
@@ -42,7 +39,6 @@
return Paths.get(url.toURI());
}
-
protected static String getConfigPath(String config) throws Exception {
URL url = TikaConfig.class.getResource(config);
assertNotNull(url, "Test Tika Config not found: " + config);
diff --git a/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java b/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
index 185387c..72e4aee 100644
--- a/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
+++ b/tika-core/src/test/java/org/apache/tika/config/DummyExecutor.java
@@ -19,7 +19,6 @@
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
-
import org.apache.tika.concurrent.ConfigurableThreadPoolExecutor;
public class DummyExecutor extends ThreadPoolExecutor implements ConfigurableThreadPoolExecutor {
diff --git a/tika-core/src/test/java/org/apache/tika/config/DummyParser.java b/tika-core/src/test/java/org/apache/tika/config/DummyParser.java
index cea6c2f..3557de4 100644
--- a/tika-core/src/test/java/org/apache/tika/config/DummyParser.java
+++ b/tika-core/src/test/java/org/apache/tika/config/DummyParser.java
@@ -17,7 +17,6 @@
package org.apache.tika.config;
import java.util.Collection;
-
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
@@ -27,8 +26,10 @@
private final ServiceLoader loader;
- public DummyParser(MediaTypeRegistry registry, ServiceLoader loader,
- Collection<Class<? extends Parser>> excludeParsers) {
+ public DummyParser(
+ MediaTypeRegistry registry,
+ ServiceLoader loader,
+ Collection<Class<? extends Parser>> excludeParsers) {
this.loader = loader;
}
diff --git a/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java
index 9bbe8ed..56ae304 100644
--- a/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/MockConfigTest.java
@@ -1,17 +1,15 @@
/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * <p>http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * <p>Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config;
@@ -22,10 +20,8 @@
import java.io.InputStream;
import java.util.List;
import java.util.Map;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.exception.TikaException;
+import org.junit.jupiter.api.Test;
public class MockConfigTest {
@@ -44,7 +40,6 @@
assertEquals("two", config.getMyStrings().get(1));
}
-
public static class MockConfig extends ConfigBase {
private Map<String, String> mappings;
diff --git a/tika-core/src/test/java/org/apache/tika/config/ParamTest.java b/tika-core/src/test/java/org/apache/tika/config/ParamTest.java
index 5f5321d..0393540 100644
--- a/tika-core/src/test/java/org/apache/tika/config/ParamTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/ParamTest.java
@@ -1,17 +1,15 @@
/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * <p>http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * <p>Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config;
@@ -26,7 +24,6 @@
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
-
import org.junit.jupiter.api.Test;
public class ParamTest {
@@ -39,11 +36,20 @@
list.add("brown");
list.add("fox");
Object[] objects =
- new Object[]{list, Integer.MAX_VALUE, 2.5f, 4000.57576, true, false, Long.MAX_VALUE,
- "Hello this is a boring string", new URL("http://apache.org"),
- new URI("tika://org.apache.tika.ner.parser?impl=xyz"),
- new BigInteger(Long.MAX_VALUE + "").add(
- new BigInteger(Long.MAX_VALUE + "")), new File("."),};
+ new Object[] {
+ list,
+ Integer.MAX_VALUE,
+ 2.5f,
+ 4000.57576,
+ true,
+ false,
+ Long.MAX_VALUE,
+ "Hello this is a boring string",
+ new URL("http://apache.org"),
+ new URI("tika://org.apache.tika.ner.parser?impl=xyz"),
+ new BigInteger(Long.MAX_VALUE + "").add(new BigInteger(Long.MAX_VALUE + "")),
+ new File("."),
+ };
for (Object object : objects) {
String name = "name" + System.currentTimeMillis();
@@ -64,5 +70,4 @@
assertEquals(loaded.getType(), object.getClass());
}
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
index dafdd64..81db088 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigSerializerTest.java
@@ -17,7 +17,6 @@
package org.apache.tika.config;
-
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -25,49 +24,51 @@
import java.io.InputStream;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
-
-import org.junit.jupiter.api.Disabled;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.parser.CompositeParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.parser.mock.MockParser;
import org.apache.tika.parser.multiple.FallbackParser;
+import org.junit.jupiter.api.Disabled;
+import org.junit.jupiter.api.Test;
public class TikaConfigSerializerTest extends TikaConfigTest {
/**
- * TIKA-1445 It should be possible to exclude DefaultParser from
- * certain types, so another parser explicitly listed will take them
+ * TIKA-1445 It should be possible to exclude DefaultParser from certain types, so another
+ * parser explicitly listed will take them
*/
@Test
public void defaultParserWithExcludes() throws Exception {
String xml =
loadAndSerialize("TIKA-1445-default-except.xml", TikaConfigSerializer.Mode.STATIC);
assertContains(
- "<parser class=\"org.apache.tika.parser.ErrorParser\">" + " <mime>fail/world" +
- "</mime> " +
- "</parser>", xml);
+ "<parser class=\"org.apache.tika.parser.ErrorParser\">"
+ + " <mime>fail/world"
+ + "</mime> "
+ + "</parser>",
+ xml);
}
@Test
public void testEncodingDetectors() throws Exception {
String xml = loadAndSerialize("TIKA-1762-executors.xml", TikaConfigSerializer.Mode.STATIC);
- assertContains("<encodingDetectors> " +
- "<encodingDetector class=\"org.apache.tika.detect" +
- ".NonDetectingEncodingDetector\"/> " +
- "</encodingDetectors>", xml);
+ assertContains(
+ "<encodingDetectors> "
+ + "<encodingDetector class=\"org.apache.tika.detect"
+ + ".NonDetectingEncodingDetector\"/> "
+ + "</encodingDetectors>",
+ xml);
}
@Test
public void testMultipleWithFallback() throws Exception {
TikaConfig config = getConfig("TIKA-1509-multiple-fallback.xml");
StringWriter writer = new StringWriter();
- TikaConfigSerializer.serialize(config,
- TikaConfigSerializer.Mode.STATIC_FULL, writer, StandardCharsets.UTF_8);
+ TikaConfigSerializer.serialize(
+ config, TikaConfigSerializer.Mode.STATIC_FULL, writer, StandardCharsets.UTF_8);
try (InputStream is =
- new ByteArrayInputStream(writer.toString().getBytes(StandardCharsets.UTF_8))) {
+ new ByteArrayInputStream(writer.toString().getBytes(StandardCharsets.UTF_8))) {
config = new TikaConfig(is);
}
@@ -90,9 +91,12 @@
@Disabled("TODO: executor-service info needs to be stored in TikaConfig for serialization")
public void testExecutors() throws Exception {
String xml = loadAndSerialize("TIKA-1762-executors.xml", TikaConfigSerializer.Mode.STATIC);
- assertContains("<executor-service class=\"org.apache.tika.config.DummyExecutor\">" +
- " <core-threads>3</core-threads>" + " <max-threads>10</max-threads>" +
- "</executor-service>", xml);
+ assertContains(
+ "<executor-service class=\"org.apache.tika.config.DummyExecutor\">"
+ + " <core-threads>3</core-threads>"
+ + " <max-threads>10</max-threads>"
+ + "</executor-service>",
+ xml);
}
String loadAndSerialize(String configFile, TikaConfigSerializer.Mode mode) throws Exception {
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
index 7fa0217..9178dd7 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaConfigTest.java
@@ -31,9 +31,6 @@
import java.util.List;
import java.util.Map;
import java.util.concurrent.ThreadPoolExecutor;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.ResourceLoggingClassLoader;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
@@ -51,18 +48,17 @@
import org.apache.tika.parser.mock.MockParser;
import org.apache.tika.parser.multiple.FallbackParser;
import org.apache.tika.utils.XMLReaderUtils;
+import org.junit.jupiter.api.Test;
/**
- * Tests for the Tika Config, which don't require real parsers /
- * detectors / etc.
- * There's also {@link TikaParserConfigTest} and {@link TikaDetectorConfigTest}
- * over in the Tika Parsers project, which do further Tika Config
- * testing using real parsers and detectors.
+ * Tests for the Tika Config, which don't require real parsers / detectors / etc. There's also
+ * {@link TikaParserConfigTest} and {@link TikaDetectorConfigTest} over in the Tika Parsers project,
+ * which do further Tika Config testing using real parsers and detectors.
*/
public class TikaConfigTest extends AbstractTikaConfigTest {
/**
- * Make sure that a configuration file can't reference the
- * {@link AutoDetectParser} class a <parser> configuration element.
+ * Make sure that a configuration file can't reference the {@link AutoDetectParser} class in a
+ * <parser> configuration element.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
*/
@@ -76,9 +72,8 @@
}
/**
- * Make sure that with a service loader given, we can
- * get different configurable behaviour on parser classes
- * which can't be found.
+ * Make sure that with a service loader given, we can get different configurable behaviour on
+ * parser classes which can't be found.
*/
@Test
public void testUnknownParser() throws Exception {
@@ -108,9 +103,8 @@
}
/**
- * Make sure that a configuration file can reference also a composite
- * parser class like {@link DefaultParser} in a <parser>
- * configuration element.
+ * Make sure that a configuration file can reference also a composite parser class like {@link
+ * DefaultParser} in a <parser> configuration element.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
*/
@@ -124,8 +118,8 @@
}
/**
- * Make sure that a valid configuration file without mimetypes or
- * detector entries can be loaded without problems.
+ * Make sure that a valid configuration file without mimetypes or detector entries can be loaded
+ * without problems.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-866">TIKA-866</a>
*/
@@ -139,9 +133,8 @@
}
/**
- * TIKA-1145 If the TikaConfig has a ClassLoader set on it,
- * that should be used when loading the mimetypes and when
- * discovering services
+ * TIKA-1145 If the TikaConfig has a ClassLoader set on it, that should be used when loading the
+ * mimetypes and when discovering services
*/
@Test
public void ensureClassLoaderUsedEverywhere() throws Exception {
@@ -162,14 +155,15 @@
Map<String, List<URL>> resources = customLoader.getLoadedResources();
int resourcesCount = resources.size();
- assertTrue(resourcesCount > 3,
+ assertTrue(
+ resourcesCount > 3,
"Not enough things used the classloader, found only " + resourcesCount);
// Ensure everything that should do, did use it
// - Parsers
assertNotNull(resources.get("META-INF/services/org.apache.tika.parser.Parser"));
// - Detectors
- //assertNotNull(resources.get("META-INF/services/org.apache.tika.detect.Detector"));
+ // assertNotNull(resources.get("META-INF/services/org.apache.tika.detect.Detector"));
// - Built-In Mimetypes
assertNotNull(resources.get("org/apache/tika/mime/tika-mimetypes.xml"));
// - Custom Mimetypes
@@ -177,8 +171,8 @@
}
/**
- * TIKA-1445 It should be possible to exclude DefaultParser from
- * certain types, so another parser explicitly listed will take them
+ * TIKA-1445 It should be possible to exclude DefaultParser from certain types, so another
+ * parser explicitly listed will take them
*/
@Test
public void defaultParserWithExcludes() throws Exception {
@@ -215,8 +209,8 @@
}
/**
- * TIKA-1653 If one parser has child parsers, those child parsers shouldn't
- * show up at the top level as well
+ * TIKA-1653 If one parser has child parsers, those child parsers shouldn't show up at the top
+ * level as well
*/
@Test
public void parserWithChildParsers() throws Exception {
@@ -268,41 +262,46 @@
assertTrue((executorService instanceof DummyExecutor), "Should use Dummy Executor");
assertEquals(3, executorService.getCorePoolSize(), "Should have configured Core Threads");
- assertEquals(10, executorService.getMaximumPoolSize(),
- "Should have configured Max Threads");
+ assertEquals(
+ 10, executorService.getMaximumPoolSize(), "Should have configured Max Threads");
}
@Test
public void testInitializerBadValue() throws Exception {
- assertThrows(TikaConfigException.class, () -> {
- TikaConfig config = getConfig("TIKA-2389-illegal.xml");
- });
+ assertThrows(
+ TikaConfigException.class,
+ () -> {
+ TikaConfig config = getConfig("TIKA-2389-illegal.xml");
+ });
}
-
@Test
public void testInitializerPerParserThrow() throws Exception {
- assertThrows(TikaConfigException.class, () -> {
- TikaConfig config = getConfig("TIKA-2389-throw-per-parser.xml");
- });
+ assertThrows(
+ TikaConfigException.class,
+ () -> {
+ TikaConfig config = getConfig("TIKA-2389-throw-per-parser.xml");
+ });
}
@Test
public void testInitializerServiceLoaderThrow() throws Exception {
- assertThrows(TikaConfigException.class, () -> {
- TikaConfig config = getConfig("TIKA-2389-throw-default.xml");
- });
+ assertThrows(
+ TikaConfigException.class,
+ () -> {
+ TikaConfig config = getConfig("TIKA-2389-throw-default.xml");
+ });
}
@Test
public void testInitializerServiceLoaderThrowButOverridden() throws Exception {
- //TODO: test that this was logged at INFO level
+ // TODO: test that this was logged at INFO level
TikaConfig config = getConfig("TIKA-2389-throw-default-overridden.xml");
}
@Test
public void testInitializerPerParserWarn() throws Exception {
- //TODO: test that this was logged at WARN level
+ // TODO: test that this was logged at WARN level
TikaConfig config = getConfig("TIKA-2389-warn-per-parser.xml");
}
@@ -327,20 +326,22 @@
@Test
public void testXMLReaderUtils() throws Exception {
- //pool size may have been reset already by an
- //earlier test. Can't test for default here.
- assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS,
+ // pool size may have been reset already by an
+ // earlier test. Can't test for default here.
+ assertEquals(
+ XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS,
XMLReaderUtils.getMaxEntityExpansions());
- //make sure that detection on this file actually works with
- //default expansions
- assertEquals("application/rdf+xml",
+ // make sure that detection on this file actually works with
+ // default expansions
+ assertEquals(
+ "application/rdf+xml",
detect("test-difficult-rdf1.xml", TikaConfig.getDefaultConfig()).toString());
TikaConfig tikaConfig = getConfig("TIKA-2732-xmlreaderutils.xml");
try {
assertEquals(33, XMLReaderUtils.getPoolSize());
assertEquals(5, XMLReaderUtils.getMaxEntityExpansions());
- //make sure that there's actually a change in behavior
+ // make sure that there's actually a change in behavior
assertEquals("text/plain", detect("test-difficult-rdf1.xml", tikaConfig).toString());
} finally {
XMLReaderUtils.setMaxEntityExpansions(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS);
@@ -356,27 +357,33 @@
@Test
public void testXMLReaderUtilsException() throws Exception {
- assertThrows(NumberFormatException.class, () -> {
- getConfig("TIKA-2732-xmlreaderutils-exc.xml");
- });
+ assertThrows(
+ NumberFormatException.class,
+ () -> {
+ getConfig("TIKA-2732-xmlreaderutils-exc.xml");
+ });
}
@Test
public void testXMLReaderUtilsUnspecifiedAttribute() throws Exception {
TikaConfig tikaConfig = getConfig("TIKA-3551-xmlreaderutils.xml");
- assertEquals(XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS, XMLReaderUtils.getMaxEntityExpansions());
+ assertEquals(
+ XMLReaderUtils.DEFAULT_MAX_ENTITY_EXPANSIONS,
+ XMLReaderUtils.getMaxEntityExpansions());
}
@Test
public void testBadExclude() throws Exception {
- assertThrows(TikaConfigException.class, () -> {
- getConfig("TIKA-3268-bad-parser-exclude.xml");
- });
+ assertThrows(
+ TikaConfigException.class,
+ () -> {
+ getConfig("TIKA-3268-bad-parser-exclude.xml");
+ });
}
@Test
public void testTimesInitiated() throws Exception {
- //this prevents multi-threading tests, but we aren't doing that now...
+ // this prevents multi-threading tests, but we aren't doing that now...
MockParser.resetTimesInitiated();
TikaConfig tikaConfig =
new TikaConfig(TikaConfigTest.class.getResourceAsStream("mock-exclude.xml"));
diff --git a/tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java b/tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java
index 3ea1e53..2cae67a 100644
--- a/tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/config/TikaPipesConfigTest.java
@@ -1,17 +1,15 @@
/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- * <p>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * <p>http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * <p>Unless required by applicable law or agreed to in writing, software distributed under the
+ * License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
+ * express or implied. See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.config;
@@ -24,9 +22,6 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.pipes.CompositePipesReporter;
import org.apache.tika.pipes.PipesReporter;
@@ -38,9 +33,10 @@
import org.apache.tika.pipes.fetcher.FetcherManager;
import org.apache.tika.pipes.fetcher.fs.FileSystemFetcher;
import org.apache.tika.pipes.pipesiterator.PipesIterator;
+import org.junit.jupiter.api.Test;
public class TikaPipesConfigTest extends AbstractTikaConfigTest {
- //this handles tests for the newer pipes type configs.
+ // this handles tests for the newer pipes type configs.
@Test
public void testFetchers() throws Exception {
@@ -54,27 +50,31 @@
@Test
public void testDuplicateFetchers() throws Exception {
- //can't have two fetchers with the same name
- assertThrows(TikaConfigException.class, () -> {
- FetcherManager.load(getConfigFilePath("fetchers-duplicate-config.xml"));
- });
+ // can't have two fetchers with the same name
+ assertThrows(
+ TikaConfigException.class,
+ () -> {
+ FetcherManager.load(getConfigFilePath("fetchers-duplicate-config.xml"));
+ });
}
@Test
public void testNoNameFetchers() throws Exception {
- //can't have two fetchers with an empty name
- assertThrows(TikaConfigException.class, () -> {
- FetcherManager.load(getConfigFilePath("fetchers-noname-config.xml"));
- });
+ // can't have two fetchers with an empty name
+ assertThrows(
+ TikaConfigException.class,
+ () -> {
+ FetcherManager.load(getConfigFilePath("fetchers-noname-config.xml"));
+ });
}
@Test
public void testNoBasePathFetchers() throws Exception {
- //no basepath is allowed as of > 2.3.0
- //test that this does not throw an exception.
+ // no basepath is allowed as of > 2.3.0
+ // test that this does not throw an exception.
- FetcherManager fetcherManager = FetcherManager.load(
- getConfigFilePath("fetchers-nobasepath-config.xml"));
+ FetcherManager fetcherManager =
+ FetcherManager.load(getConfigFilePath("fetchers-nobasepath-config.xml"));
}
@Test
@@ -89,36 +89,41 @@
@Test
public void testDuplicateEmitters() throws Exception {
- assertThrows(TikaConfigException.class, () -> {
- EmitterManager.load(getConfigFilePath("emitters-duplicate-config.xml"));
- });
+ assertThrows(
+ TikaConfigException.class,
+ () -> {
+ EmitterManager.load(getConfigFilePath("emitters-duplicate-config.xml"));
+ });
}
@Test
public void testPipesIterator() throws Exception {
- PipesIterator it =
- PipesIterator.build(getConfigFilePath("pipes-iterator-config.xml"));
+ PipesIterator it = PipesIterator.build(getConfigFilePath("pipes-iterator-config.xml"));
assertEquals("fs1", it.getFetcherName());
}
@Test
public void testMultiplePipesIterators() throws Exception {
- assertThrows(TikaConfigException.class, () -> {
- PipesIterator it =
- PipesIterator.build(getConfigFilePath("pipes-iterator-multiple-config.xml"));
- assertEquals("fs1", it.getFetcherName());
- });
+ assertThrows(
+ TikaConfigException.class,
+ () -> {
+ PipesIterator it =
+ PipesIterator.build(
+ getConfigFilePath("pipes-iterator-multiple-config.xml"));
+ assertEquals("fs1", it.getFetcherName());
+ });
}
+
@Test
public void testParams() throws Exception {
- //This test makes sure that pre 2.7.x configs that still contain <params/> element
- //in ConfigBase derived objects still work.
+ // This test makes sure that pre 2.7.x configs that still contain <params/> element
+ // in ConfigBase derived objects still work.
Path configPath = getConfigFilePath("TIKA-3865-params.xml");
AsyncConfig asyncConfig = AsyncConfig.load(configPath);
PipesReporter reporter = asyncConfig.getPipesReporter();
assertTrue(reporter instanceof CompositePipesReporter);
- List<PipesReporter> reporters = ((CompositePipesReporter)reporter).getPipesReporters();
- assertEquals("somethingOrOther1", ((MockReporter)reporters.get(0)).getEndpoint());
- assertEquals("somethingOrOther2", ((MockReporter)reporters.get(1)).getEndpoint());
+ List<PipesReporter> reporters = ((CompositePipesReporter) reporter).getPipesReporters();
+ assertEquals("somethingOrOther1", ((MockReporter) reporters.get(0)).getEndpoint());
+ assertEquals("somethingOrOther2", ((MockReporter) reporters.get(1)).getEndpoint());
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java
index dbb8220..8b8912c 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/FileCommandDetectorTest.java
@@ -20,14 +20,12 @@
import static org.junit.jupiter.api.Assumptions.assumeTrue;
import java.io.InputStream;
-
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
public class FileCommandDetectorTest {
@@ -44,28 +42,30 @@
public void testBasic() throws Exception {
assumeTrue(FileCommandDetector.checkHasFile());
- try (InputStream is = getClass()
- .getResourceAsStream("/test-documents/basic_embedded.xml")) {
- //run more than once to ensure that the input stream is reset
+ try (InputStream is =
+ getClass().getResourceAsStream("/test-documents/basic_embedded.xml")) {
+ // run more than once to ensure that the input stream is reset
for (int i = 0; i < 2; i++) {
Metadata metadata = new Metadata();
MediaType answer = DETECTOR.detect(is, metadata);
String fileMime = metadata.get(FileCommandDetector.FILE_MIME);
- assertTrue(MediaType.text("xml").equals(answer) ||
- MediaType.application("xml").equals(answer));
- assertTrue("application/xml".equals(fileMime) ||
- "text/xml".equals(fileMime));
+ assertTrue(
+ MediaType.text("xml").equals(answer)
+ || MediaType.application("xml").equals(answer));
+ assertTrue("application/xml".equals(fileMime) || "text/xml".equals(fileMime));
}
}
- //now try with TikaInputStream
- try (InputStream is = TikaInputStream
- .get(getClass().getResourceAsStream("/test-documents/basic_embedded.xml"))) {
- //run more than once to ensure that the input stream is reset
+ // now try with TikaInputStream
+ try (InputStream is =
+ TikaInputStream.get(
+ getClass().getResourceAsStream("/test-documents/basic_embedded.xml"))) {
+ // run more than once to ensure that the input stream is reset
for (int i = 0; i < 2; i++) {
MediaType answer = DETECTOR.detect(is, new Metadata());
- assertTrue(MediaType.text("xml").equals(answer) ||
- MediaType.application("xml").equals(answer));
+ assertTrue(
+ MediaType.text("xml").equals(answer)
+ || MediaType.application("xml").equals(answer));
}
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
index 3a86a53..dc9cfed 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/MagicDetectorTest.java
@@ -26,17 +26,13 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-
import org.apache.commons.io.IOUtils;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.junit.jupiter.api.Test;
-/**
- * Test cases for the {@link MagicDetector} class.
- */
+/** Test cases for the {@link MagicDetector} class. */
public class MagicDetectorTest {
@Test
@@ -73,9 +69,13 @@
assertDetect(detector, MediaType.OCTET_STREAM, " html");
assertDetect(detector, MediaType.OCTET_STREAM, "<HTML");
- assertDetect(detector, html,
+ assertDetect(
+ detector,
+ html,
"0........1.........2.........3.........4.........5.........6" + "1234<html");
- assertDetect(detector, MediaType.OCTET_STREAM,
+ assertDetect(
+ detector,
+ MediaType.OCTET_STREAM,
"0........1.........2.........3.........4.........5.........6" + "12345<html");
assertDetect(detector, MediaType.OCTET_STREAM, "");
@@ -85,8 +85,13 @@
public void testDetectMask() throws Exception {
MediaType html = new MediaType("text", "html");
byte up = (byte) 0xdf;
- Detector detector = new MagicDetector(html, new byte[]{'<', 'H', 'T', 'M', 'L'},
- new byte[]{(byte) 0xff, up, up, up, up}, 0, 64);
+ Detector detector =
+ new MagicDetector(
+ html,
+ new byte[] {'<', 'H', 'T', 'M', 'L'},
+ new byte[] {(byte) 0xff, up, up, up, up},
+ 0,
+ 64);
assertDetect(detector, html, "<html");
assertDetect(detector, html, "<HTML><head/><body/></html>");
@@ -96,9 +101,13 @@
assertDetect(detector, MediaType.OCTET_STREAM, "<htm");
assertDetect(detector, MediaType.OCTET_STREAM, " html");
- assertDetect(detector, html,
+ assertDetect(
+ detector,
+ html,
"0 1 2 3 4 5 6" + "1234<html");
- assertDetect(detector, MediaType.OCTET_STREAM,
+ assertDetect(
+ detector,
+ MediaType.OCTET_STREAM,
"0 1 2 3 4 5 6" + "12345<html");
assertDetect(detector, MediaType.OCTET_STREAM, "");
@@ -111,44 +120,63 @@
new MagicDetector(pdf, "(?s)\\A.{0,144}%PDF-".getBytes(US_ASCII), null, true, 0, 0);
assertDetect(detector, pdf, "%PDF-1.0");
- assertDetect(detector, pdf, "0 10 20 30 40 50 6" +
- "0 70 80 90 100 110 1" +
- "20 130 140" + "34%PDF-1.0");
- assertDetect(detector, MediaType.OCTET_STREAM,
- "0 10 20 30 40 50 6" +
- "0 70 80 90 100 110 1" +
- "20 130 140" + "345%PDF-1.0");
+ assertDetect(
+ detector,
+ pdf,
+ "0 10 20 30 40 50 6"
+ + "0 70 80 90 100 110 1"
+ + "20 130 140"
+ + "34%PDF-1.0");
+ assertDetect(
+ detector,
+ MediaType.OCTET_STREAM,
+ "0 10 20 30 40 50 6"
+ + "0 70 80 90 100 110 1"
+ + "20 130 140"
+ + "345%PDF-1.0");
assertDetect(detector, MediaType.OCTET_STREAM, "");
}
@Test
public void testDetectRegExGreedy() throws Exception {
- String pattern = "(?s)\\x3chtml xmlns=\"http://www\\.w3\\.org/1999/xhtml" +
- "\".*\\x3ctitle\\x3e.*\\x3c/title\\x3e";
+ String pattern =
+ "(?s)\\x3chtml xmlns=\"http://www\\.w3\\.org/1999/xhtml"
+ + "\".*\\x3ctitle\\x3e.*\\x3c/title\\x3e";
MediaType xhtml = new MediaType("application", "xhtml+xml");
Detector detector =
new MagicDetector(xhtml, pattern.getBytes(US_ASCII), null, true, 0, 8192);
- assertDetect(detector, xhtml, "<html xmlns=\"http://www.w3.org/1999/xhtml\">" +
- "<head><title>XHTML test document</title></head>");
+ assertDetect(
+ detector,
+ xhtml,
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">"
+ + "<head><title>XHTML test document</title></head>");
}
@Test
public void testDetectRegExOptions() throws Exception {
- String pattern = "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) " +
- "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}" + "(?:HTML|html) 4\\.01";
+ String pattern =
+ "(?s)\\A.{0,1024}\\x3c\\!(?:DOCTYPE|doctype) (?:HTML|html) "
+ + "(?:PUBLIC|public) \"-//.{1,16}//(?:DTD|dtd) .{0,64}"
+ + "(?:HTML|html) 4\\.01";
- String data = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\"" +
- "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>" +
- "<HEAD><TITLE>HTML document</TITLE></HEAD>" + "<BODY><P>Hello world!</BODY></HTML>";
+ String data =
+ "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\""
+ + "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
+ + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
+ + "<BODY><P>Hello world!</BODY></HTML>";
- String data1 = "<!DOCTYPE html PUBLIC \"-//W3C//dtd html 4.01//EN\"" +
- "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>" +
- "<HEAD><TITLE>HTML document</TITLE></HEAD>" + "<BODY><P>Hello world!</BODY></HTML>";
+ String data1 =
+ "<!DOCTYPE html PUBLIC \"-//W3C//dtd html 4.01//EN\""
+ + "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
+ + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
+ + "<BODY><P>Hello world!</BODY></HTML>";
- String data2 = "<!DoCtYpE hTmL pUbLiC \"-//W3C//dTd HtMl 4.01//EN\"" +
- "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>" +
- "<HEAD><TITLE>HTML document</TITLE></HEAD>" + "<BODY><P>Hello world!</BODY></HTML>";
+ String data2 =
+ "<!DoCtYpE hTmL pUbLiC \"-//W3C//dTd HtMl 4.01//EN\""
+ + "\"http://www.w3.org/TR/html4/strict.dtd\"><HTML>"
+ + "<HEAD><TITLE>HTML document</TITLE></HEAD>"
+ + "<BODY><P>Hello world!</BODY></HTML>";
MediaType html = new MediaType("text", "html");
Detector detector = new MagicDetector(html, pattern.getBytes(US_ASCII), null, true, 0, 0);
@@ -171,8 +199,9 @@
@Test
public void testDetectApplicationEnviHdr() throws Exception {
- InputStream iStream = MagicDetectorTest.class
- .getResourceAsStream("/test-documents/ang20150420t182050_corr_v1e_img.hdr");
+ InputStream iStream =
+ MagicDetectorTest.class.getResourceAsStream(
+ "/test-documents/ang20150420t182050_corr_v1e_img.hdr");
byte[] data = IOUtils.toByteArray(iStream);
MediaType testMT = new MediaType("application", "envi.hdr");
Detector detector = new MagicDetector(testMT, data, null, false, 0, 0);
@@ -225,19 +254,13 @@
}
}
- /**
- * InputStream class that does not read in all available bytes in
- * one go.
- */
+ /** InputStream class that does not read in all available bytes in one go. */
private static class RestrictiveInputStream extends ByteArrayInputStream {
public RestrictiveInputStream(byte[] buf) {
super(buf);
}
- /**
- * Prevent reading the entire len of bytes if requesting more
- * than 10 bytes.
- */
+ /** Prevent reading the entire len of bytes if requesting more than 10 bytes. */
public int read(byte[] b, int off, int len) {
if (len > 10) {
return super.read(b, off, len - 10);
@@ -250,18 +273,23 @@
@Test
public void testBZ2Detection() throws Exception {
Detector detector = new TikaConfig().getDetector();
- for (String bz2 : new String[]{"bzip2-8-file.txt.bz2",
- "empty-file.txt.bz2", "lbzip2-8-file.txt.bz2",
- "small-file.txt.bz2", "test-file-1.csv.bz2",
- "test-file-2.csv.bz2"}) {
+ for (String bz2 :
+ new String[] {
+ "bzip2-8-file.txt.bz2",
+ "empty-file.txt.bz2",
+ "lbzip2-8-file.txt.bz2",
+ "small-file.txt.bz2",
+ "test-file-1.csv.bz2",
+ "test-file-2.csv.bz2"
+ }) {
assertEquals("application/x-bzip2", detect(detector, bz2));
}
}
- private String detect(Detector detector, String bz2Name) throws IOException {
- try (InputStream is = new BufferedInputStream(
- this.getClass().getResourceAsStream(
- "/test-documents/bz2/" + bz2Name))) {
+ private String detect(Detector detector, String bz2Name) throws IOException {
+ try (InputStream is =
+ new BufferedInputStream(
+ this.getClass().getResourceAsStream("/test-documents/bz2/" + bz2Name))) {
return detector.detect(is, new Metadata()).toString();
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
index 293f423..30d2319 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/MimeDetectionWithNNTest.java
@@ -22,13 +22,11 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MimeDetectionTest;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
public class MimeDetectionWithNNTest {
@@ -43,10 +41,10 @@
}
/**
- * The test case only works on the detector that only has grb model as
- * currently the grb model is used as an example; if more models are added
- * in the TrainedModelDetector, the following tests will need to modified to reflect
- * the corresponding type instead of test-equal with the "OCTET_STREAM";
+ * The test case only works on the detector that only has grb model as currently the grb model
+ * is used as an example; if more models are added in the TrainedModelDetector, the following
+ * tests will need to modified to reflect the corresponding type instead of test-equal with the
+ * "OCTET_STREAM";
*
* @throws Exception
*/
@@ -69,7 +67,9 @@
testFile(octetStream_str, "test-utf16be.xml");
testFile(octetStream_str, "test-long-comment.xml");
testFile(octetStream_str, "stylesheet.xsl");
- testUrl(octetStream_str, "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
+ testUrl(
+ octetStream_str,
+ "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
"test-difficult-rdf1.xml");
testUrl(octetStream_str, "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml");
// add evil test from TIKA-327
@@ -108,27 +108,25 @@
try {
Metadata metadata = new Metadata();
String mime = this.detector.detect(in, metadata).toString();
- assertEquals(expected, mime,
- urlOrFileName + " is not properly detected: detected.");
+ assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected.");
// Add resource name and test again
// metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName);
mime = this.detector.detect(in, metadata).toString();
- assertEquals(expected, mime,
+ assertEquals(
+ expected,
+ mime,
urlOrFileName + " is not properly detected after adding resource name.");
} finally {
in.close();
}
}
- /**
- * Test for type detection of empty documents.
- */
+ /** Test for type detection of empty documents. */
@Test
public void testEmptyDocument() throws IOException {
- assertEquals(MediaType.OCTET_STREAM,
+ assertEquals(
+ MediaType.OCTET_STREAM,
detector.detect(new ByteArrayInputStream(new byte[0]), new Metadata()));
-
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java
index dc15299..f232025 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/NameDetectorTest.java
@@ -23,17 +23,13 @@
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
-
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
-/**
- * Test cases for the {@link NameDetector} class.
- */
+/** Test cases for the {@link NameDetector} class. */
public class NameDetectorTest {
private Detector detector;
@@ -50,27 +46,27 @@
@Test
public void testDetect() {
assertDetect(MediaType.TEXT_PLAIN, "text.txt");
- assertDetect(MediaType.TEXT_PLAIN, "text.txt "); // trailing space
- assertDetect(MediaType.TEXT_PLAIN, "text.txt\n"); // trailing newline
+ assertDetect(MediaType.TEXT_PLAIN, "text.txt "); // trailing space
+ assertDetect(MediaType.TEXT_PLAIN, "text.txt\n"); // trailing newline
assertDetect(MediaType.TEXT_PLAIN, "text.txt?a=b"); // URL query
assertDetect(MediaType.TEXT_PLAIN, "text.txt#abc"); // URL fragment
- assertDetect(MediaType.TEXT_PLAIN, "text%2Etxt"); // URL encoded
- assertDetect(MediaType.TEXT_PLAIN, "text.TXT"); // case insensitive
+ assertDetect(MediaType.TEXT_PLAIN, "text%2Etxt"); // URL encoded
+ assertDetect(MediaType.TEXT_PLAIN, "text.TXT"); // case insensitive
assertDetect(MediaType.OCTET_STREAM, "text.txt.gz");
assertDetect(MediaType.TEXT_PLAIN, "README");
- assertDetect(MediaType.TEXT_PLAIN, " README "); // space around
- assertDetect(MediaType.TEXT_PLAIN, "\tREADME\n"); // other whitespace
- assertDetect(MediaType.TEXT_PLAIN, "/a/README"); // leading path
- assertDetect(MediaType.TEXT_PLAIN, "\\b\\README"); // windows path
- assertDetect(MediaType.OCTET_STREAM, "ReadMe"); // case sensitive
+ assertDetect(MediaType.TEXT_PLAIN, " README "); // space around
+ assertDetect(MediaType.TEXT_PLAIN, "\tREADME\n"); // other whitespace
+ assertDetect(MediaType.TEXT_PLAIN, "/a/README"); // leading path
+ assertDetect(MediaType.TEXT_PLAIN, "\\b\\README"); // windows path
+ assertDetect(MediaType.OCTET_STREAM, "ReadMe"); // case sensitive
assertDetect(MediaType.OCTET_STREAM, "README.NOW");
// TIKA-1928 # in the filename
assertDetect(MediaType.TEXT_PLAIN, "text.txt");
- assertDetect(MediaType.TEXT_PLAIN, "text#.txt"); // # before extension
- assertDetect(MediaType.TEXT_PLAIN, "text#123.txt");// # before extension
- assertDetect(MediaType.TEXT_PLAIN, "text.txt#pdf");// # after extension
+ assertDetect(MediaType.TEXT_PLAIN, "text#.txt"); // # before extension
+ assertDetect(MediaType.TEXT_PLAIN, "text#123.txt"); // # before extension
+ assertDetect(MediaType.TEXT_PLAIN, "text.txt#pdf"); // # after extension
// TIKA-3783 # before the final .
assertDetect(MediaType.TEXT_PLAIN, "ABC#192.168.0.1#2.txt");
@@ -82,7 +78,7 @@
// tough one
assertDetect(MediaType.TEXT_PLAIN, " See http://www.example.com:1234/README.txt?a=b#c \n");
assertDetect(MediaType.TEXT_PLAIN, "See README.txt"); // even this!
- assertDetect(MediaType.OCTET_STREAM, "See README"); // but not this
+ assertDetect(MediaType.OCTET_STREAM, "See README"); // but not this
assertDetect(MediaType.application("envi.hdr"), "ang20150420t182050_corr_v1e_img.hdr");
diff --git a/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
index 1870033..377adc9 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/TextDetectorTest.java
@@ -24,15 +24,11 @@
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.junit.jupiter.api.Test;
-/**
- * Test cases for the {@link TextDetector} class.
- */
+/** Test cases for the {@link TextDetector} class. */
public class TextDetectorTest {
private final Detector detector = new TextDetector();
@@ -56,9 +52,9 @@
public void testDetectText() throws Exception {
assertText("Hello, World!".getBytes(UTF_8));
assertText(" \t\r\n".getBytes(UTF_8));
- assertNotText(new byte[]{-1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B});
- assertNotText(new byte[]{0});
- assertNotText(new byte[]{'H', 'e', 'l', 'l', 'o', 0});
+ assertNotText(new byte[] {-1, -2, -3, 0x09, 0x0A, 0x0C, 0x0D, 0x1B});
+ assertNotText(new byte[] {0});
+ assertNotText(new byte[] {'H', 'e', 'l', 'l', 'o', 0});
byte[] data = new byte[512];
Arrays.fill(data, (byte) '.');
@@ -99,11 +95,11 @@
private void assertNotText(byte[] data) {
try {
- assertEquals(MediaType.OCTET_STREAM,
+ assertEquals(
+ MediaType.OCTET_STREAM,
detector.detect(new ByteArrayInputStream(data), new Metadata()));
} catch (IOException e) {
fail("Unexpected exception from TextDetector");
}
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java
index d79e9b7..5ad556f 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/TypeDetectorTest.java
@@ -22,15 +22,11 @@
import java.io.IOException;
import java.util.Map;
import java.util.TreeMap;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.junit.jupiter.api.Test;
-/**
- * Test cases for the {@link TypeDetector} class.
- */
+/** Test cases for the {@link TypeDetector} class. */
public class TypeDetectorTest {
private static Map<String, String> params = new TreeMap<>();
@@ -41,7 +37,6 @@
private static MediaType TEXT_PLAIN_A_EQ_B = new MediaType("text", "plain", params);
-
private final Detector detector = new TypeDetector();
@Test
@@ -73,5 +68,4 @@
fail("TypeDetector should never throw an IOException");
}
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java b/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java
index 8527116..68bbb29 100644
--- a/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/detect/ZeroSizeFileDetectorTest.java
@@ -23,12 +23,10 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
public class ZeroSizeFileDetectorTest {
@@ -59,5 +57,4 @@
fail("Unexpected exception from ZeroSizeFileDetector");
}
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
index b233b44..ab24f15 100644
--- a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTest.java
@@ -45,14 +45,7 @@
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;
-
import org.apache.commons.io.IOUtils;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -67,16 +60,20 @@
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.ContentHandlerFactory;
import org.apache.tika.sax.RecursiveParserWrapperHandler;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
public class ForkParserTest extends TikaTest {
- @TempDir
- Path tempDir;
+ @TempDir Path tempDir;
@Test
public void testHelloWorld() throws Exception {
- try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(),
- new ForkTestParser())) {
+ try (ForkParser parser =
+ new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser())) {
Metadata metadata = new Metadata();
ContentHandler output = new BodyContentHandler();
InputStream stream = new ByteArrayInputStream(new byte[0]);
@@ -89,8 +86,8 @@
@Test
public void testSerialParsing() throws Exception {
- try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(),
- new ForkTestParser())) {
+ try (ForkParser parser =
+ new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser())) {
ParseContext context = new ParseContext();
for (int i = 0; i < 10; i++) {
ContentHandler output = new BodyContentHandler();
@@ -103,8 +100,8 @@
@Test
public void testParallelParsing() throws Exception {
- try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(),
- new ForkTestParser())) {
+ try (ForkParser parser =
+ new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser())) {
final ParseContext context = new ParseContext();
Thread[] threads = new Thread[10];
@@ -112,14 +109,16 @@
for (int i = 0; i < threads.length; i++) {
final ContentHandler o = new BodyContentHandler();
output[i] = o;
- threads[i] = new Thread(() -> {
- try {
- InputStream stream = new ByteArrayInputStream(new byte[0]);
- parser.parse(stream, o, new Metadata(), context);
- } catch (Exception e) {
- e.printStackTrace();
- }
- });
+ threads[i] =
+ new Thread(
+ () -> {
+ try {
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ parser.parse(stream, o, new Metadata(), context);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ });
threads[i].start();
}
@@ -132,30 +131,33 @@
@Test
public void testPoolSizeReached() throws Exception {
- try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(),
- new ForkTestParser())) {
+ try (ForkParser parser =
+ new ForkParser(ForkParserTest.class.getClassLoader(), new ForkTestParser())) {
final Semaphore barrier = new Semaphore(0);
Thread[] threads = new Thread[parser.getPoolSize()];
PipedOutputStream[] pipes = new PipedOutputStream[threads.length];
final ParseContext context = new ParseContext();
for (int i = 0; i < threads.length; i++) {
- final PipedInputStream input = new PipedInputStream() {
- @Override
- public synchronized int read() throws IOException {
- barrier.release();
- return super.read();
- }
- };
+ final PipedInputStream input =
+ new PipedInputStream() {
+ @Override
+ public synchronized int read() throws IOException {
+ barrier.release();
+ return super.read();
+ }
+ };
pipes[i] = new PipedOutputStream(input);
- threads[i] = new Thread(() -> {
- try {
- ContentHandler o = new DefaultHandler();
- parser.parse(input, o, new Metadata(), context);
- } catch (Exception e) {
- e.printStackTrace();
- }
- });
+ threads[i] =
+ new Thread(
+ () -> {
+ try {
+ ContentHandler o = new DefaultHandler();
+ parser.parse(input, o, new Metadata(), context);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ });
threads[i].start();
}
@@ -163,15 +165,17 @@
barrier.acquire(parser.getPoolSize());
final ContentHandler o = new BodyContentHandler();
- Thread blocked = new Thread(() -> {
- try {
- barrier.release();
- InputStream stream = new ByteArrayInputStream(new byte[0]);
- parser.parse(stream, o, new Metadata(), context);
- } catch (Exception e) {
- e.printStackTrace();
- }
- });
+ Thread blocked =
+ new Thread(
+ () -> {
+ try {
+ barrier.release();
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ parser.parse(stream, o, new Metadata(), context);
+ } catch (Exception e) {
+ e.printStackTrace();
+ }
+ });
blocked.start();
// Wait until the last thread is started, and then some to
@@ -200,40 +204,48 @@
forkParser.setServerPulseMillis(500);
forkParser.setServerParseTimeoutMillis(5000);
forkParser.setServerWaitTimeoutMillis(60000);
- String sleepCommand = "<mock>\n" + " <write element=\"p\">Hello, World!</write>\n" +
- " <hang millis=\"11000\" heavy=\"false\" interruptible=\"false\" />\n" +
- "</mock>";
+ String sleepCommand =
+ "<mock>\n"
+ + " <write element=\"p\">Hello, World!</write>\n"
+ + " <hang millis=\"11000\" heavy=\"false\" interruptible=\"false\" />\n"
+ + "</mock>";
ContentHandler o = new BodyContentHandler(-1);
Metadata m = new Metadata();
ParseContext c = new ParseContext();
try {
- forkParser
- .parse(new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)),
- o, m, c);
+ forkParser.parse(
+ new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)),
+ o,
+ m,
+ c);
fail("should have thrown IOException");
} catch (TikaException e) {
- //failed to communicate with forked parser process"
+ // failed to communicate with forked parser process"
} finally {
forkParser.close();
}
- //test setting very short pulse (10 ms) and a parser that takes at least 1000 ms
+ // test setting very short pulse (10 ms) and a parser that takes at least 1000 ms
forkParser = new ForkParser(ForkParserTest.class.getClassLoader(), new MockParser());
forkParser.setServerPulseMillis(10);
forkParser.setServerParseTimeoutMillis(100);
- sleepCommand = "<mock>\n" + " <write element=\"p\">Hello, World!</write>\n" +
- " <hang millis=\"1000\" heavy=\"false\" interruptible=\"false\" />\n" +
- "</mock>";
+ sleepCommand =
+ "<mock>\n"
+ + " <write element=\"p\">Hello, World!</write>\n"
+ + " <hang millis=\"1000\" heavy=\"false\" interruptible=\"false\" />\n"
+ + "</mock>";
o = new BodyContentHandler(-1);
m = new Metadata();
c = new ParseContext();
try {
- forkParser
- .parse(new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)),
- o, m, c);
+ forkParser.parse(
+ new ByteArrayInputStream(sleepCommand.getBytes(StandardCharsets.UTF_8)),
+ o,
+ m,
+ c);
fail("Should have thrown exception");
} catch (IOException | TikaException e) {
- //"should have thrown IOException lost connection"
+ // "should have thrown IOException lost connection"
} finally {
forkParser.close();
}
@@ -241,8 +253,10 @@
@Test
public void testPackageCanBeAccessed() throws Exception {
- try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(),
- new ForkTestParser.ForkTestParserAccessingPackage())) {
+ try (ForkParser parser =
+ new ForkParser(
+ ForkParserTest.class.getClassLoader(),
+ new ForkTestParser.ForkTestParserAccessingPackage())) {
Metadata metadata = new Metadata();
ContentHandler output = new BodyContentHandler();
InputStream stream = new ByteArrayInputStream(new byte[0]);
@@ -257,9 +271,10 @@
public void testRecursiveParserWrapper() throws Exception {
Parser parser = new AutoDetectParser();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
- 20000));
+ RecursiveParserWrapperHandler handler =
+ new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000));
try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper);
InputStream is = getResourceAsStream("/test-documents/basic_embedded.xml")) {
Metadata metadata = new Metadata();
@@ -282,9 +297,10 @@
public void testRPWWithEmbeddedNPE() throws Exception {
Parser parser = new AutoDetectParser();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
- 20000));
+ RecursiveParserWrapperHandler handler =
+ new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000));
try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper);
InputStream is = getResourceAsStream("/test-documents/embedded_with_npe.xml")) {
Metadata metadata = new Metadata();
@@ -301,17 +317,18 @@
assertEquals("embeddedAuthor", m1.get(TikaCoreProperties.CREATOR));
assertContains("some_embedded_content", m1.get(TikaCoreProperties.TIKA_CONTENT));
assertEquals("/embed1.xml", m1.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
- assertContains("another null pointer exception",
- m1.get(TikaCoreProperties.EMBEDDED_EXCEPTION));
+ assertContains(
+ "another null pointer exception", m1.get(TikaCoreProperties.EMBEDDED_EXCEPTION));
}
@Test
public void testRPWWithMainDocNPE() throws Exception {
Parser parser = new AutoDetectParser();
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser);
- RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
- new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT,
- 20000));
+ RecursiveParserWrapperHandler handler =
+ new RecursiveParserWrapperHandler(
+ new BasicContentHandlerFactory(
+ BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 20000));
try (ForkParser fork = new ForkParser(ForkParserTest.class.getClassLoader(), wrapper);
InputStream is = getResourceAsStream("/test-documents/embedded_then_npe.xml")) {
Metadata metadata = new Metadata();
@@ -336,15 +353,15 @@
@Test
public void testToFileHandler() throws Exception {
- //test that a server-side write-to-file works without proxying back the
- //AbstractContentHandlerFactory
+ // test that a server-side write-to-file works without proxying back the
+ // AbstractContentHandlerFactory
Path target = Files.createTempFile(tempDir, "fork-to-file-handler-", ".txt");
try (InputStream is = getResourceAsStream("/test-documents/basic_embedded.xml")) {
RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser());
ToFileHandler toFileHandler =
new ToFileHandler(new SBContentHandlerFactory(), target.toFile());
- try (ForkParser forkParser = new ForkParser(ForkParserTest.class.getClassLoader(),
- wrapper)) {
+ try (ForkParser forkParser =
+ new ForkParser(ForkParserTest.class.getClassLoader(), wrapper)) {
Metadata m = new Metadata();
ParseContext context = new ParseContext();
forkParser.parse(is, toFileHandler, m, context);
@@ -355,16 +372,21 @@
try (Reader reader = Files.newBufferedReader(target, StandardCharsets.UTF_8)) {
contents = IOUtils.toString(reader);
}
- assertContainsCount(TikaCoreProperties.TIKA_PARSED_BY.getName() +
- " : org.apache.tika.parser.DefaultParser", contents, 2);
- assertContainsCount(TikaCoreProperties.TIKA_PARSED_BY.getName() +
- " : org.apache.tika.parser.mock.MockParser", contents, 2);
+ assertContainsCount(
+ TikaCoreProperties.TIKA_PARSED_BY.getName()
+ + " : org.apache.tika.parser.DefaultParser",
+ contents,
+ 2);
+ assertContainsCount(
+ TikaCoreProperties.TIKA_PARSED_BY.getName()
+ + " : org.apache.tika.parser.mock.MockParser",
+ contents,
+ 2);
assertContains("Nikolai Lobachevsky", contents);
assertContains("embeddedAuthor", contents);
assertContains("main_content", contents);
assertContains("some_embedded_content", contents);
assertContains("X-TIKA:embedded_resource_path : /embed1.xml", contents);
-
}
@Test
@@ -394,7 +416,6 @@
assertEquals("/embed1.xml", m1.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH));
}
-
@Test
public void testRPWWithNonSerializableContentHandler() throws Exception {
Parser parser = new AutoDetectParser();
@@ -439,20 +460,23 @@
public void testForkParserDoesntPreventShutdown() throws Exception {
ExecutorService service = Executors.newFixedThreadPool(1);
CountDownLatch cdl = new CountDownLatch(1);
- service.submit(() -> {
- try (ForkParser parser = new ForkParser(ForkParserTest.class.getClassLoader(),
- new ForkTestParser.ForkTestParserWaiting())) {
- Metadata metadata = new Metadata();
- ContentHandler output = new BodyContentHandler();
- InputStream stream = new ByteArrayInputStream(new byte[0]);
- ParseContext context = new ParseContext();
- cdl.countDown();
- parser.parse(stream, output, metadata, context);
- // Don't care about output not planning to get this far
- } catch (IOException | SAXException | TikaException e) {
- throw new RuntimeException(e);
- }
- });
+ service.submit(
+ () -> {
+ try (ForkParser parser =
+ new ForkParser(
+ ForkParserTest.class.getClassLoader(),
+ new ForkTestParser.ForkTestParserWaiting())) {
+ Metadata metadata = new Metadata();
+ ContentHandler output = new BodyContentHandler();
+ InputStream stream = new ByteArrayInputStream(new byte[0]);
+ ParseContext context = new ParseContext();
+ cdl.countDown();
+ parser.parse(stream, output, metadata, context);
+ // Don't care about output not planning to get this far
+ } catch (IOException | SAXException | TikaException e) {
+ throw new RuntimeException(e);
+ }
+ });
// Wait to make sure submitted runnable is actually running
boolean await = cdl.await(1, TimeUnit.SECONDS);
if (!await) {
@@ -464,14 +488,15 @@
service.shutdownNow();
service.awaitTermination(15, TimeUnit.SECONDS);
long secondsSinceShutdown = ChronoUnit.SECONDS.between(requestShutdown, Instant.now());
- assertTrue(secondsSinceShutdown < 5, "Should have shutdown the service in less than 5 seconds");
+ assertTrue(
+ secondsSinceShutdown < 5,
+ "Should have shutdown the service in less than 5 seconds");
}
-
- //use this to test that the wrapper handler is acted upon by the server but not proxied back
+ // use this to test that the wrapper handler is acted upon by the server but not proxied back
private static class ToFileHandler extends AbstractRecursiveParserWrapperHandler {
- //this needs to be a file because a File is serializable
+ // this needs to be a file because a File is serializable
private final File file;
private OutputStream os;
@@ -579,8 +604,8 @@
private static class LyingNonSerializableContentHandler extends DefaultHandler
implements Serializable {
- //StringWriter makes this class not actually Serializable
- //as is.
+ // StringWriter makes this class not actually Serializable
+ // as is.
StringWriter writer = new StringWriter();
@Override
@@ -594,9 +619,9 @@
}
}
- //use this to test that a handler that extends RecursiveParserWrapperHandler
- //does have both contenthandlers and metadata objects proxied back from the
- //server.
+ // use this to test that a handler that extends RecursiveParserWrapperHandler
+ // does have both contenthandlers and metadata objects proxied back from the
+ // server.
private static class BufferingHandler extends RecursiveParserWrapperHandler {
List<ContentHandler> contentHandlers = new ArrayList<>();
@@ -604,7 +629,6 @@
super(contentHandlerFactory);
}
-
@Override
public void endEmbeddedDocument(ContentHandler contentHandler, Metadata metadata)
throws SAXException {
@@ -627,6 +651,5 @@
public List<Metadata> getMetadataList() {
return metadataList;
}
-
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java
index 4756f00..7e76857 100644
--- a/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java
+++ b/tika-core/src/test/java/org/apache/tika/fork/ForkParserTikaBinTest.java
@@ -18,6 +18,7 @@
import static org.junit.jupiter.api.Assertions.assertEquals;
+import com.google.common.reflect.ClassPath;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
@@ -32,15 +33,7 @@
import java.util.function.Predicate;
import java.util.jar.JarEntry;
import java.util.jar.JarOutputStream;
-
-import com.google.common.reflect.ClassPath;
import org.apache.commons.io.IOUtils;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.TikaTest;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -48,13 +41,17 @@
import org.apache.tika.parser.AutoDetectParserFactory;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToXMLContentHandler;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
public class ForkParserTikaBinTest extends TikaTest {
private static final String JAR_FILE_NAME = "mock-tika-app.jar";
private static final Map<String, String> EMPTY_MAP = Collections.emptyMap();
- @TempDir
- private static Path JAR_DIR;
+ @TempDir private static Path JAR_DIR;
private static Path JAR_FILE;
@BeforeAll
@@ -65,29 +62,38 @@
ClassLoader loader = ForkServer.class.getClassLoader();
ClassPath classPath = ClassPath.from(loader);
addClasses(jarOs, classPath, ci -> ci.getPackageName().startsWith("org.slf4j"));
- addClasses(jarOs, classPath, ci -> ci.getPackageName().startsWith("org.apache.logging"));
- addClasses(jarOs, classPath,
+ addClasses(
+ jarOs, classPath, ci -> ci.getPackageName().startsWith("org.apache.logging"));
+ addClasses(
+ jarOs,
+ classPath,
ci -> ci.getPackageName().startsWith("org.apache.commons.io"));
- //exclude TypeDetectionBenchmark because it is not serializable
- //exclude UpperCasingContentHandler because we want to test that
- //we can serialize it from the parent process into the forked process
- addClasses(jarOs, classPath, ci -> ci.getPackageName().startsWith("org.apache.tika") &&
- (!ci.getName().contains("TypeDetectionBenchmark")) &&
- (!ci.getName().contains("UpperCasingContentHandler")));
+ // exclude TypeDetectionBenchmark because it is not serializable
+ // exclude UpperCasingContentHandler because we want to test that
+ // we can serialize it from the parent process into the forked process
+ addClasses(
+ jarOs,
+ classPath,
+ ci ->
+ ci.getPackageName().startsWith("org.apache.tika")
+ && (!ci.getName().contains("TypeDetectionBenchmark"))
+ && (!ci.getName().contains("UpperCasingContentHandler")));
- try (InputStream input = ForkParserTikaBinTest.class
- .getResourceAsStream("/org/apache/tika/config/TIKA-2653-vowel-parser-ae.xml")) {
+ try (InputStream input =
+ ForkParserTikaBinTest.class.getResourceAsStream(
+ "/org/apache/tika/config/TIKA-2653-vowel-parser-ae.xml")) {
jarOs.putNextEntry(
new JarEntry("org/apache/tika/parser/TIKA-2653-vowel-parser-ae.xml"));
IOUtils.copy(input, jarOs);
}
- try (InputStream input = ForkParserTikaBinTest.class
- .getResourceAsStream("/org/apache/tika/mime/tika-mimetypes.xml")) {
+ try (InputStream input =
+ ForkParserTikaBinTest.class.getResourceAsStream(
+ "/org/apache/tika/mime/tika-mimetypes.xml")) {
jarOs.putNextEntry(new JarEntry("org/apache/tika/mime/tika-mimetypes.xml"));
IOUtils.copy(input, jarOs);
}
- try (InputStream input = ForkParserTikaBinTest.class
- .getResourceAsStream("/custom-mimetypes.xml")) {
+ try (InputStream input =
+ ForkParserTikaBinTest.class.getResourceAsStream("/custom-mimetypes.xml")) {
jarOs.putNextEntry(new JarEntry("custom-mimetypes.xml"));
IOUtils.copy(input, jarOs);
}
@@ -98,15 +104,17 @@
}
Path tikaConfigVowelParser = JAR_DIR.resolve("TIKA_2653-iou.xml");
- try (InputStream is = ForkServer.class
- .getResourceAsStream("/org/apache/tika/config/TIKA-2653-vowel-parser-iou.xml");
+ try (InputStream is =
+ ForkServer.class.getResourceAsStream(
+ "/org/apache/tika/config/TIKA-2653-vowel-parser-iou.xml");
OutputStream os = Files.newOutputStream(tikaConfigVowelParser)) {
IOUtils.copy(is, os);
}
}
- private static void addClasses(JarOutputStream jarOs, ClassPath classPath,
- Predicate<ClassPath.ClassInfo> predicate) throws IOException {
+ private static void addClasses(
+ JarOutputStream jarOs, ClassPath classPath, Predicate<ClassPath.ClassInfo> predicate)
+ throws IOException {
for (ClassPath.ClassInfo classInfo : classPath.getAllClasses()) {
if (predicate.test(classInfo)) {
jarOs.putNextEntry(new JarEntry(classInfo.getResourceName()));
@@ -118,8 +126,9 @@
@Test
public void testExplicitParserFactory() throws Exception {
XMLResult xmlResult =
- getXML(new ParserFactoryFactory("org.apache.tika.parser.mock.MockParserFactory",
- EMPTY_MAP));
+ getXML(
+ new ParserFactoryFactory(
+ "org.apache.tika.parser.mock.MockParserFactory", EMPTY_MAP));
assertContains("hello world!", xmlResult.xml);
assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR));
}
@@ -127,8 +136,8 @@
@Test
public void testVowelParserAsDefault() throws Exception {
ParserFactoryFactory pff =
- new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory",
- EMPTY_MAP);
+ new ParserFactoryFactory(
+ "org.apache.tika.parser.AutoDetectParserFactory", EMPTY_MAP);
XMLResult xmlResult = getXML(pff);
assertContains("eooeuiooueoeeao", xmlResult.xml);
assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR));
@@ -138,19 +147,18 @@
public void testVowelParserInClassPath() throws Exception {
Map<String, String> args = new HashMap<>();
args.put(AutoDetectParserFactory.TIKA_CONFIG_PATH, "TIKA-2653-vowel-parser-ae.xml");
- ParserFactoryFactory pff = new ParserFactoryFactory(
- "org.apache.tika.parser.AutoDetectParserFactory",
- args);
+ ParserFactoryFactory pff =
+ new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory", args);
XMLResult xmlResult = getXML(pff);
assertContains("eeeeea", xmlResult.xml);
- assertEquals("Nikolai Lobachevsky",
- xmlResult.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR));
}
@Test
public void testVowelParserFromDirectory() throws Exception {
Map<String, String> args = new HashMap<>();
- args.put(AutoDetectParserFactory.TIKA_CONFIG_PATH,
+ args.put(
+ AutoDetectParserFactory.TIKA_CONFIG_PATH,
JAR_DIR.resolve("TIKA_2653-iou.xml").toAbsolutePath().toString());
ParserFactoryFactory pff =
new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory", args);
@@ -161,17 +169,16 @@
@Test
public void testPFFWithClassLoaderFromParentProcess() throws Exception {
- //The UpperCasingContentHandler is not sent to the bootstrap test jar file in @BeforeClass.
- //this tests that the content handler was loaded from the parent process.
+ // The UpperCasingContentHandler is not sent to the bootstrap test jar file in @BeforeClass.
+ // this tests that the content handler was loaded from the parent process.
ParserFactoryFactory pff =
- new ParserFactoryFactory("org.apache.tika.parser.AutoDetectParserFactory",
- EMPTY_MAP);
+ new ParserFactoryFactory(
+ "org.apache.tika.parser.AutoDetectParserFactory", EMPTY_MAP);
XMLResult xmlResult =
getXML(pff, this.getClass().getClassLoader(), new UpperCasingContentHandler());
assertContains("EOOEUIOOUEOEEAO", xmlResult.xml);
assertEquals("Nikolai Lobachevsky", xmlResult.metadata.get(TikaCoreProperties.CREATOR));
-
}
private XMLResult getXML(ParserFactoryFactory pff)
@@ -179,8 +186,8 @@
return getXML(pff, null, null);
}
- private XMLResult getXML(ParserFactoryFactory pff, ClassLoader classloader,
- ContentHandler contentHandler)
+ private XMLResult getXML(
+ ParserFactoryFactory pff, ClassLoader classloader, ContentHandler contentHandler)
throws TikaException, SAXException, IOException {
List<String> java = new ArrayList<>();
diff --git a/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java b/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java
index e9c6949..6ae7ff9 100644
--- a/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java
+++ b/tika-core/src/test/java/org/apache/tika/fork/ForkTestParser.java
@@ -22,10 +22,6 @@
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.fork.unusedpackage.ClassInUnusedPackage;
import org.apache.tika.metadata.Metadata;
@@ -33,20 +29,21 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
class ForkTestParser implements Parser {
- /**
- * Serial version UID
- */
+ /** Serial version UID */
private static final long serialVersionUID = -5492269783593452319L;
public Set<MediaType> getSupportedTypes(ParseContext context) {
return Collections.singleton(MediaType.TEXT_PLAIN);
}
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
stream.read();
metadata.set(Metadata.CONTENT_TYPE, "text/plain");
@@ -60,8 +57,9 @@
static class ForkTestParserAccessingPackage extends ForkTestParser {
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
assertNotNull(ClassInUnusedPackage.class.getPackage());
super.parse(stream, handler, metadata, context);
}
@@ -69,8 +67,9 @@
static class ForkTestParserWaiting extends ForkTestParser {
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
try {
Thread.sleep(10_000);
} catch (InterruptedException e) {
diff --git a/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java b/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java
index 3ca513f..e02a9e9 100644
--- a/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java
+++ b/tika-core/src/test/java/org/apache/tika/fork/UpperCasingContentHandler.java
@@ -17,7 +17,6 @@
package org.apache.tika.fork;
import java.util.Locale;
-
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
@@ -34,5 +33,4 @@
public String toString() {
return sb.toString();
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java b/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java
index 1de4c45..996bfa2 100644
--- a/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java
+++ b/tika-core/src/test/java/org/apache/tika/fork/unusedpackage/ClassInUnusedPackage.java
@@ -16,5 +16,4 @@
*/
package org.apache.tika.fork.unusedpackage;
-public class ClassInUnusedPackage {
-}
+public class ClassInUnusedPackage {}
diff --git a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
index 906870e..e607c08 100644
--- a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
@@ -21,7 +21,6 @@
import static org.junit.jupiter.api.Assertions.fail;
import java.io.ByteArrayInputStream;
-
import org.junit.jupiter.api.Test;
public class EndianUtilsTest {
@@ -29,65 +28,65 @@
public void testReadUE7() throws Exception {
byte[] data;
- data = new byte[]{0x08};
+ data = new byte[] {0x08};
assertEquals(8, EndianUtils.readUE7(new ByteArrayInputStream(data)));
- data = new byte[]{(byte) 0x84, 0x1e};
+ data = new byte[] {(byte) 0x84, 0x1e};
assertEquals(542, EndianUtils.readUE7(new ByteArrayInputStream(data)));
- data = new byte[]{(byte) 0xac, (byte) 0xbe, 0x17};
+ data = new byte[] {(byte) 0xac, (byte) 0xbe, 0x17};
assertEquals(728855, EndianUtils.readUE7(new ByteArrayInputStream(data)));
}
@Test
public void testReadUIntLE() throws Exception {
- byte[] data = new byte[]{(byte) 0x08, (byte) 0x00, (byte) 0x00, (byte) 0x00};
+ byte[] data = new byte[] {(byte) 0x08, (byte) 0x00, (byte) 0x00, (byte) 0x00};
assertEquals(8, EndianUtils.readUIntLE(new ByteArrayInputStream(data)));
- data = new byte[]{(byte) 0xF0, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF};
+ data = new byte[] {(byte) 0xF0, (byte) 0xFF, (byte) 0xFF, (byte) 0xFF};
assertEquals(4294967280L, EndianUtils.readUIntLE(new ByteArrayInputStream(data)));
- data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF};
+ data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF};
try {
EndianUtils.readUIntLE(new ByteArrayInputStream(data));
fail("Should have thrown exception");
} catch (EndianUtils.BufferUnderrunException e) {
- //swallow
+ // swallow
}
}
@Test
public void testReadUIntBE() throws Exception {
- byte[] data = new byte[]{(byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x08};
+ byte[] data = new byte[] {(byte) 0x00, (byte) 0x00, (byte) 0x00, (byte) 0x08};
assertEquals(8, EndianUtils.readUIntBE(new ByteArrayInputStream(data)));
- data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xF0};
+ data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF, (byte) 0xF0};
assertEquals(4294967280L, EndianUtils.readUIntBE(new ByteArrayInputStream(data)));
- data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF};
+ data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF};
try {
EndianUtils.readUIntLE(new ByteArrayInputStream(data));
fail("Should have thrown exception");
} catch (EndianUtils.BufferUnderrunException e) {
- //swallow
+ // swallow
}
}
@Test
public void testReadIntME() throws Exception {
- // Example from https://yamm.finance/wiki/Endianness.html#mwAiw
- byte[] data = new byte[]{(byte) 0x0b, (byte) 0x0a, (byte) 0x0d, (byte) 0x0c};
+ // Example from https://yamm.finance/wiki/Endianness.html#mwAiw
+ byte[] data = new byte[] {(byte) 0x0b, (byte) 0x0a, (byte) 0x0d, (byte) 0x0c};
assertEquals(0x0a0b0c0d, EndianUtils.readIntME(new ByteArrayInputStream(data)));
- data = new byte[]{(byte) 0xFE, (byte) 0xFF, (byte) 0xFC, (byte) 0xFD};
+ data = new byte[] {(byte) 0xFE, (byte) 0xFF, (byte) 0xFC, (byte) 0xFD};
assertEquals(0xfffefdfc, EndianUtils.readIntME(new ByteArrayInputStream(data)));
- data = new byte[]{(byte) 0xFF, (byte) 0xFF, (byte) 0xFF};
+ data = new byte[] {(byte) 0xFF, (byte) 0xFF, (byte) 0xFF};
try {
EndianUtils.readIntME(new ByteArrayInputStream(data));
fail("Should have thrown exception");
} catch (EndianUtils.BufferUnderrunException e) {
- //swallow
+ // swallow
}
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
index 0cc869a..64a65bb 100644
--- a/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
@@ -57,11 +57,12 @@
@Test
public void normalizeWithReservedChar() throws Exception {
final String[] TEST_NAMES = {"test?.txt", "?test.txt", "test.txt?", "?test?txt?"};
- final String[] EXPECTED_NAMES =
- {"test%3F.txt", "%3Ftest.txt", "test.txt%3F", "%3Ftest%3Ftxt%3F"};
+ final String[] EXPECTED_NAMES = {
+ "test%3F.txt", "%3Ftest.txt", "test.txt%3F", "%3Ftest%3Ftxt%3F"
+ };
for (int i = 0; i < TEST_NAMES.length; ++i) {
- //System.out.println("checking " + TEST_NAMES[i]);
+ // System.out.println("checking " + TEST_NAMES[i]);
assertEquals(EXPECTED_NAMES[i], FilenameUtils.normalize(TEST_NAMES[i]));
}
}
@@ -76,11 +77,16 @@
@Test
public void normalizeWithNotPrintableChars() throws Exception {
- final String TEST_NAME = new String(
- new char[]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, '.', 16, 17, 18,
- 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31});
- final String EXPECTED_NAME = "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F" + "." +
- "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F";
+ final String TEST_NAME =
+ new String(
+ new char[] {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, '.', 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+ });
+ final String EXPECTED_NAME =
+ "%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F"
+ + "."
+ + "%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F";
assertEquals(EXPECTED_NAME, FilenameUtils.normalize(TEST_NAME));
}
@@ -104,5 +110,4 @@
private void testFilenameEquality(String expected, String path) {
assertEquals(expected, FilenameUtils.getName(path));
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java b/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java
index b6237b3..9dfd267 100644
--- a/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/LookaheadInputStreamTest.java
@@ -21,12 +21,9 @@
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
-
import org.junit.jupiter.api.Test;
-/**
- * Test cases for the {@link LookaheadInputStream} class.
- */
+/** Test cases for the {@link LookaheadInputStream} class. */
public class LookaheadInputStreamTest {
@Test
@@ -46,7 +43,7 @@
@Test
public void testBasicLookahead() throws IOException {
- InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'});
+ InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'});
InputStream lookahead = new LookaheadInputStream(stream, 2);
assertEquals('a', lookahead.read());
assertEquals('b', lookahead.read());
@@ -60,7 +57,7 @@
@Test
public void testZeroLookahead() throws IOException {
- InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'});
+ InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'});
InputStream lookahead = new LookaheadInputStream(stream, 0);
assertEquals(-1, lookahead.read());
lookahead.close();
@@ -72,7 +69,7 @@
@Test
public void testMarkLookahead() throws IOException {
- InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'});
+ InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'});
InputStream lookahead = new LookaheadInputStream(stream, 2);
lookahead.mark(1);
assertEquals('a', lookahead.read());
@@ -93,7 +90,7 @@
@Test
public void testSkipLookahead() throws IOException {
- InputStream stream = new ByteArrayInputStream(new byte[]{'a', 'b', 'c'});
+ InputStream stream = new ByteArrayInputStream(new byte[] {'a', 'b', 'c'});
InputStream lookahead = new LookaheadInputStream(stream, 2);
assertEquals(1, lookahead.skip(1));
assertEquals('b', lookahead.read());
@@ -105,5 +102,4 @@
assertEquals('c', stream.read());
assertEquals(-1, stream.read());
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java b/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java
index cfe0c15..62a5e02 100644
--- a/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/TailStreamTest.java
@@ -24,26 +24,23 @@
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
-
import org.junit.jupiter.api.Test;
-/**
- * Test class for {@code TailStream}.
- */
+/** Test class for {@code TailStream}. */
public class TailStreamTest {
- /**
- * Constant for generating test text.
- */
- private static final String TEXT = "Lorem ipsum dolor sit amet, consetetur " +
- "sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut " +
- "labore et dolore magna aliquyam erat, sed diam voluptua. At vero" +
- " eos et accusam et justo duo dolores et ea rebum. Stet clita " +
- "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor " + "sit amet.";
+ /** Constant for generating test text. */
+ private static final String TEXT =
+ "Lorem ipsum dolor sit amet, consetetur "
+ + "sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut "
+ + "labore et dolore magna aliquyam erat, sed diam voluptua. At vero"
+ + " eos et accusam et justo duo dolores et ea rebum. Stet clita "
+ + "kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor "
+ + "sit amet.";
/**
* Generates a test text using the specified parameters.
*
- * @param from the start index of the text
+ * @param from the start index of the text
* @param length the length of the text
* @return the generated test text
*/
@@ -59,7 +56,7 @@
/**
* Generates a stream which contains a test text.
*
- * @param from the start index of the text
+ * @param from the start index of the text
* @param length the length of the generated stream
* @return the stream with the test text
*/
@@ -83,9 +80,7 @@
return bos.toByteArray();
}
- /**
- * Tests whether the tail buffer can be obtained before data was read.
- */
+ /** Tests whether the tail buffer can be obtained before data was read. */
@Test
public void testTailBeforeRead() throws IOException {
TailStream stream = new TailStream(generateStream(0, 100), 50);
@@ -93,9 +88,7 @@
stream.close();
}
- /**
- * Tests the content of the tail buffer if it is only partly filled.
- */
+ /** Tests the content of the tail buffer if it is only partly filled. */
@Test
public void testTailBufferPartlyRead() throws IOException {
final int count = 64;
@@ -105,21 +98,17 @@
stream.close();
}
- /**
- * Tests the content of the tail buffer if only single bytes were read.
- */
+ /** Tests the content of the tail buffer if only single bytes were read. */
@Test
public void testTailSingleByteReads() throws IOException {
final int count = 128;
TailStream stream = new TailStream(generateStream(0, 2 * count), count);
readStream(stream);
- assertEquals(generateText(count, count), new String(stream.getTail(), UTF_8),
- "Wrong buffer");
+ assertEquals(
+ generateText(count, count), new String(stream.getTail(), UTF_8), "Wrong buffer");
}
- /**
- * Tests the content of the tail buffer if larger chunks are read.
- */
+ /** Tests the content of the tail buffer if larger chunks are read. */
@Test
public void testTailChunkReads() throws IOException {
final int count = 16384;
@@ -132,14 +121,14 @@
while (read != -1) {
read = stream.read(buf);
}
- assertEquals(generateText(count - tailSize, tailSize),
- new String(stream.getTail(), UTF_8), "Wrong buffer");
+ assertEquals(
+ generateText(count - tailSize, tailSize),
+ new String(stream.getTail(), UTF_8),
+ "Wrong buffer");
stream.close();
}
- /**
- * Tests whether mark() and reset() work as expected.
- */
+ /** Tests whether mark() and reset() work as expected. */
@Test
public void testReadWithMarkAndReset() throws IOException {
final int tailSize = 64;
@@ -150,13 +139,13 @@
stream.read(buf);
stream.reset();
readStream(stream);
- assertEquals(generateText(tailSize, tailSize),
- new String(stream.getTail(), UTF_8), "Wrong buffer");
+ assertEquals(
+ generateText(tailSize, tailSize),
+ new String(stream.getTail(), UTF_8),
+ "Wrong buffer");
}
- /**
- * Tests whether a reset() operation without a mark is simply ignored.
- */
+ /** Tests whether a reset() operation without a mark is simply ignored. */
@Test
public void testResetWithoutMark() throws IOException {
final int tailSize = 75;
@@ -165,14 +154,14 @@
stream.reset();
byte[] buf = new byte[count];
stream.read(buf);
- assertEquals(generateText(count - tailSize, tailSize),
- new String(stream.getTail(), UTF_8), "Wrong buffer");
+ assertEquals(
+ generateText(count - tailSize, tailSize),
+ new String(stream.getTail(), UTF_8),
+ "Wrong buffer");
stream.close();
}
- /**
- * Tests whether skip() also fills the tail buffer.
- */
+ /** Tests whether skip() also fills the tail buffer. */
@Test
public void testSkip() throws IOException {
final int tailSize = 128;
@@ -180,27 +169,24 @@
final int skipCount = 512;
TailStream stream = new TailStream(generateStream(0, count), tailSize);
assertEquals(skipCount, stream.skip(skipCount), "Wrong skip result");
- assertEquals(generateText(skipCount - tailSize, tailSize),
- new String(stream.getTail(), UTF_8), "Wrong buffer");
+ assertEquals(
+ generateText(skipCount - tailSize, tailSize),
+ new String(stream.getTail(), UTF_8),
+ "Wrong buffer");
stream.close();
}
- /**
- * Tests a skip operation at the end of the stream.
- */
+ /** Tests a skip operation at the end of the stream. */
@Test
public void testSkipEOS() throws IOException {
final int count = 128;
TailStream stream = new TailStream(generateStream(0, count), 2 * count);
assertEquals(count, stream.skip(2 * count), "Wrong skip result");
- assertEquals(generateText(0, count), new String(stream.getTail(), UTF_8),
- "Wrong buffer");
+ assertEquals(generateText(0, count), new String(stream.getTail(), UTF_8), "Wrong buffer");
stream.close();
}
- /**
- * Tests skip() if read reaches the end of the stream and returns -1.
- */
+ /** Tests skip() if read reaches the end of the stream and returns -1. */
@Test
public void testSkipReadEnd() throws IOException {
final int count = 128;
diff --git a/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java b/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java
index fffb3f3..06784f5 100644
--- a/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/TemporaryResourcesTest.java
@@ -21,7 +21,6 @@
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
-
import org.junit.jupiter.api.Test;
public class TemporaryResourcesTest {
@@ -31,10 +30,11 @@
Path tempFile;
try (TemporaryResources tempResources = new TemporaryResources()) {
tempFile = tempResources.createTempFile();
- assertTrue(Files.exists(tempFile), "Temp file should exist while TempResources is used");
+ assertTrue(
+ Files.exists(tempFile), "Temp file should exist while TempResources is used");
}
- assertTrue(Files.notExists(tempFile),
+ assertTrue(
+ Files.notExists(tempFile),
"Temp file should not exist after TempResources is closed");
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
index 1f8943e..51e93a4 100644
--- a/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/TikaInputStreamTest.java
@@ -29,18 +29,15 @@
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
-
import org.apache.commons.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-
public class TikaInputStreamTest {
- @TempDir
- Path tempDir;
+ @TempDir Path tempDir;
@Test
public void testFileBased() throws IOException {
@@ -50,19 +47,23 @@
assertNull(stream.getOpenContainer());
assertNull(stream.getInputStreamFactory());
- assertEquals(path, TikaInputStream.get(stream).getPath(),
- "The file returned by the getFile() method should" +
- " be the file used to instantiate a TikaInputStream");
+ assertEquals(
+ path,
+ TikaInputStream.get(stream).getPath(),
+ "The file returned by the getFile() method should"
+ + " be the file used to instantiate a TikaInputStream");
- assertEquals("Hello, World!", readStream(stream),
- "The contents of the TikaInputStream should equal the" +
- " contents of the underlying file");
+ assertEquals(
+ "Hello, World!",
+ readStream(stream),
+ "The contents of the TikaInputStream should equal the"
+ + " contents of the underlying file");
stream.close();
- assertTrue(Files.exists(path),
- "The close() method must not remove the file used to" +
- " instantiate a TikaInputStream");
-
+ assertTrue(
+ Files.exists(path),
+ "The close() method must not remove the file used to"
+ + " instantiate a TikaInputStream");
}
@Test
@@ -79,29 +80,37 @@
assertNull(stream.getOpenContainer());
assertNull(stream.getInputStreamFactory());
- assertEquals("Hello, World!", readFile(file),
- "The contents of the file returned by the getFile method" +
- " should equal the contents of the TikaInputStream");
+ assertEquals(
+ "Hello, World!",
+ readFile(file),
+ "The contents of the file returned by the getFile method"
+ + " should equal the contents of the TikaInputStream");
- assertEquals("Hello, World!", readStream(stream),
- "The contents of the TikaInputStream should not get modified" +
- " by reading the file first");
+ assertEquals(
+ "Hello, World!",
+ readStream(stream),
+ "The contents of the TikaInputStream should not get modified"
+ + " by reading the file first");
stream.close();
- assertFalse(Files.exists(file),
+ assertFalse(
+ Files.exists(file),
"The close() method must remove the temporary file created by a TikaInputStream");
}
@Test
public void testInputStreamFactoryBased() throws IOException {
- TikaInputStream stream = TikaInputStream.get(() -> IOUtils.toInputStream("Hello, World!", UTF_8));
+ TikaInputStream stream =
+ TikaInputStream.get(() -> IOUtils.toInputStream("Hello, World!", UTF_8));
assertFalse(stream.hasFile());
assertNull(stream.getOpenContainer());
assertNotNull(stream.getInputStreamFactory());
- assertEquals("Hello, World!", readStream(stream),
- "The contents of the TikaInputStream should not get modified" +
- " by reading the file first");
+ assertEquals(
+ "Hello, World!",
+ readStream(stream),
+ "The contents of the TikaInputStream should not get modified"
+ + " by reading the file first");
stream.close();
}
@@ -125,8 +134,8 @@
Metadata metadata = new Metadata();
TikaInputStream.get(url, metadata).close();
assertEquals("test.txt", metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY));
- assertEquals(Long.toString(Files.size(Paths.get(url.toURI()))),
+ assertEquals(
+ Long.toString(Files.size(Paths.get(url.toURI()))),
metadata.get(Metadata.CONTENT_LENGTH));
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java b/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java
index a035574..c7a9ccf 100644
--- a/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java
+++ b/tika-core/src/test/java/org/apache/tika/language/detect/LanguageNamesTest.java
@@ -34,5 +34,4 @@
// TODO verify that "en-GB" == "en"???
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java b/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java
index 2ce1b8b..3ecf338 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/TestMetadata.java
@@ -35,24 +35,18 @@
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.TikaTest;
import org.apache.tika.utils.DateUtils;
+import org.junit.jupiter.api.Test;
-//Junit imports
+// Junit imports
-/**
- * JUnit based tests of class {@link org.apache.tika.metadata.Metadata}.
- */
+/** JUnit based tests of class {@link org.apache.tika.metadata.Metadata}. */
public class TestMetadata extends TikaTest {
private static final String CONTENTTYPE = "contenttype";
- /**
- * Test for the <code>add(String, String)</code> method.
- */
+ /** Test for the <code>add(String, String)</code> method. */
@Test
public void testAdd() {
String[] values = null;
@@ -87,13 +81,11 @@
meta.add(nonMultiValued, "value2");
fail("add should fail on the second call of a non-multi valued item");
} catch (PropertyTypeException e) {
- //swallow
+ // swallow
}
}
- /**
- * Test for the <code>set(String, String)</code> method.
- */
+ /** Test for the <code>set(String, String)</code> method. */
@Test
public void testSet() {
String[] values = null;
@@ -120,9 +112,7 @@
assertEquals("new value 2", values[1]);
}
- /**
- * Test for <code>setAll(Properties)</code> method.
- */
+ /** Test for <code>setAll(Properties)</code> method. */
@Test
public void testSetProperties() {
String[] values = null;
@@ -150,9 +140,7 @@
assertEquals("value2.1", values[0]);
}
- /**
- * Test for <code>get(String)</code> method.
- */
+ /** Test for <code>get(String)</code> method. */
@Test
public void testGet() {
Metadata meta = new Metadata();
@@ -163,9 +151,7 @@
assertEquals("value-1", meta.get("a-name"));
}
- /**
- * Test for <code>isMultiValued()</code> method.
- */
+ /** Test for <code>isMultiValued()</code> method. */
@Test
public void testIsMultiValued() {
Metadata meta = new Metadata();
@@ -176,9 +162,7 @@
assertTrue(meta.isMultiValued("key"));
}
- /**
- * Test for <code>names</code> method.
- */
+ /** Test for <code>names</code> method. */
@Test
public void testNames() {
String[] names = null;
@@ -195,9 +179,7 @@
assertEquals(2, names.length);
}
- /**
- * Test for <code>remove(String)</code> method.
- */
+ /** Test for <code>remove(String)</code> method. */
@Test
public void testRemove() {
Metadata meta = new Metadata();
@@ -219,9 +201,7 @@
assertNull(meta.get("name-two"));
}
- /**
- * Test for <code>equals(Object)</code> method.
- */
+ /** Test for <code>equals(Object)</code> method. */
@Test
public void testObject() {
Metadata meta1 = new Metadata();
@@ -247,10 +227,7 @@
assertFalse(meta1.equals(meta2));
}
- /**
- * Tests for getting and setting integer
- * based properties
- */
+ /** Tests for getting and setting integer based properties */
@Test
public void testGetSetInt() {
Metadata meta = new Metadata();
@@ -264,13 +241,13 @@
meta.set(Metadata.BITS_PER_SAMPLE, 1);
fail("Shouldn't be able to set a multi valued property as an int");
} catch (PropertyTypeException e) {
- //swallow
+ // swallow
}
try {
meta.set(TikaCoreProperties.CREATED, 1);
fail("Shouldn't be able to set a date property as an int");
} catch (PropertyTypeException e) {
- //swallow
+ // swallow
}
// Can set it and retrieve it
@@ -290,10 +267,7 @@
assertEquals(null, meta.getInt(TikaCoreProperties.CREATED));
}
- /**
- * Tests for getting and setting date
- * based properties
- */
+ /** Tests for getting and setting date based properties */
@Test
public void testGetSetDate() {
Metadata meta = new Metadata();
@@ -308,13 +282,13 @@
meta.set(Metadata.BITS_PER_SAMPLE, new Date(1000));
fail("Shouldn't be able to set a multi valued property as a date");
} catch (PropertyTypeException e) {
- //swallow
+ // swallow
}
try {
meta.set(Metadata.IMAGE_WIDTH, new Date(1000));
fail("Shouldn't be able to set an int property as an date");
} catch (PropertyTypeException e) {
- //swallow
+ // swallow
}
// Can set it and retrieve it
@@ -334,7 +308,7 @@
assertEquals(null, meta.getInt(TikaCoreProperties.CREATED));
// Our format doesn't include milliseconds
- // This means things get rounded
+ // This means things get rounded
meta.set(TikaCoreProperties.CREATED, new Date(1050));
assertEquals("1970-01-01T00:00:01Z", meta.get(TikaCoreProperties.CREATED));
assertEquals(1000, meta.getDate(TikaCoreProperties.CREATED).getTime());
@@ -367,8 +341,8 @@
}
/**
- * Some documents, like jpegs, might have date in unspecified time zone
- * which should be handled like strings but verified to have parseable ISO 8601 format
+ * Some documents, like jpegs, might have date in unspecified time zone which should be handled
+ * like strings but verified to have parseable ISO 8601 format
*/
@Test
public void testGetSetDateUnspecifiedTimezone() {
@@ -376,26 +350,34 @@
// Set explictly without a timezone
meta.set(TikaCoreProperties.CREATED, "1970-01-01T00:00:01");
- assertEquals("1970-01-01T00:00:01", meta.get(TikaCoreProperties.CREATED),
+ assertEquals(
+ "1970-01-01T00:00:01",
+ meta.get(TikaCoreProperties.CREATED),
"should return string without time zone specifier because zone is not known");
// Now ask DateUtils to format for us without one
meta.set(TikaCoreProperties.CREATED, DateUtils.formatDateUnknownTimezone(new Date(1000)));
- assertEquals("1970-01-01T00:00:01", meta.get(TikaCoreProperties.CREATED),
+ assertEquals(
+ "1970-01-01T00:00:01",
+ meta.get(TikaCoreProperties.CREATED),
"should return string without time zone specifier because zone is not known");
}
/**
- * Defines a composite property, then checks that when set as the
- * composite the value can be retrieved with the property or the aliases
+ * Defines a composite property, then checks that when set as the composite the value can be
+ * retrieved with the property or the aliases
*/
@SuppressWarnings("deprecation")
@Test
public void testCompositeProperty() {
Metadata meta = new Metadata();
- Property compositeProperty = Property.composite(DublinCore.DESCRIPTION,
- new Property[]{TikaCoreProperties.DESCRIPTION,
- Property.internalText("testDescriptionAlt")});
+ Property compositeProperty =
+ Property.composite(
+ DublinCore.DESCRIPTION,
+ new Property[] {
+ TikaCoreProperties.DESCRIPTION,
+ Property.internalText("testDescriptionAlt")
+ });
String message = "composite description";
meta.set(compositeProperty, message);
@@ -424,7 +406,6 @@
finished++;
}
}
-
}
@Test
@@ -506,9 +487,8 @@
df.setTimeZone(TimeZone.getTimeZone("UTC"));
m.set(TikaCoreProperties.CREATED, df.format(now));
assertTrue(
- Math.abs(now.getTime() - m.getDate(TikaCoreProperties.CREATED).getTime()) <
- 2000);
-
+ Math.abs(now.getTime() - m.getDate(TikaCoreProperties.CREATED).getTime())
+ < 2000);
}
return 1;
}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
index ac64734..d55bd72 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java
@@ -17,13 +17,10 @@
package org.apache.tika.metadata.filter;
import java.util.Locale;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
-/**
- * Mock Filter for testing uppercasing of all values
- */
+/** Mock Filter for testing uppercasing of all values */
public class MockUpperCaseFilter extends MetadataFilter {
@Override
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
index 0b071d0..9aec84c 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/filter/TestMetadataFilter.java
@@ -24,14 +24,12 @@
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.config.AbstractTikaConfigTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
+import org.junit.jupiter.api.Test;
public class TestMetadataFilter extends AbstractTikaConfigTest {
@@ -111,7 +109,7 @@
@Test
public void testConfigIncludeAndUCFilter() throws Exception {
TikaConfig config = getConfig("TIKA-3137-include-uc.xml");
- String[] expectedTitles = new String[]{"TITLE1", "TITLE2", "TITLE3"};
+ String[] expectedTitles = new String[] {"TITLE1", "TITLE2", "TITLE3"};
Metadata metadata = new Metadata();
metadata.add("title", "title1");
metadata.add("title", "title2");
@@ -141,7 +139,6 @@
filter.filter(metadata);
assertEquals(2, metadata.size());
assertEquals("author", metadata.get("author"));
-
}
@Test
@@ -182,8 +179,8 @@
@Test
public void testDateNormalizingFilter() throws Exception {
- //test that a Date lacking a timezone, if interpreted as Los Angeles, for example,
- //yields a UTC string that is properly +7 hours.
+ // test that a Date lacking a timezone, if interpreted as Los Angeles, for example,
+ // yields a UTC string that is properly +7 hours.
Metadata m = new Metadata();
m.set(TikaCoreProperties.CREATED, "2021-07-23T01:02:24");
DateNormalizingMetadataFilter filter = new DateNormalizingMetadataFilter();
@@ -243,5 +240,4 @@
assertEquals(1, metadata.getValues(Metadata.CONTENT_TYPE).length);
assertEquals("text/html", metadata.get(Metadata.CONTENT_TYPE));
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
index 7b7e871..f5f64ab 100644
--- a/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
+++ b/tika-core/src/test/java/org/apache/tika/metadata/writefilter/StandardWriteFilterTest.java
@@ -23,9 +23,6 @@
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Set;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.config.TikaConfigTest;
@@ -38,10 +35,10 @@
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.AutoDetectParserConfig;
import org.apache.tika.parser.ParseContext;
+import org.junit.jupiter.api.Test;
public class StandardWriteFilterTest extends TikaTest {
-
@Test
public void testMetadataFactoryConfig() throws Exception {
TikaConfig tikaConfig =
@@ -50,8 +47,7 @@
MetadataWriteFilterFactory factory = config.getMetadataWriteFilterFactory();
assertEquals(350, ((StandardWriteFilterFactory) factory).getMaxTotalEstimatedBytes());
AutoDetectParser parser = new AutoDetectParser(tikaConfig);
- String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
- "<mock>";
+ String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<mock>";
for (int i = 0; i < 20; i++) {
mock += "<metadata action=\"add\" name=\"dc:creator\">01234567890123456789</metadata>";
}
@@ -59,8 +55,12 @@
mock += "</mock>";
Metadata metadata = new Metadata();
List<Metadata> metadataList =
- getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
- parser, metadata, new ParseContext(), true);
+ getRecursiveMetadata(
+ new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+ parser,
+ metadata,
+ new ParseContext(),
+ true);
assertEquals(1, metadataList.size());
metadata = metadataList.get(0);
@@ -81,8 +81,7 @@
assertEquals(999, ((StandardWriteFilterFactory) factory).getMaxKeySize());
assertEquals(10001, ((StandardWriteFilterFactory) factory).getMaxFieldSize());
AutoDetectParser parser = new AutoDetectParser(tikaConfig);
- String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" +
- "<mock>";
+ String mock = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<mock>";
mock += "<metadata action=\"add\" name=\"dc:subject\">this is not a title</metadata>";
mock += "<metadata action=\"add\" name=\"dc:title\">this is a title</metadata>";
for (int i = 0; i < 20; i++) {
@@ -94,19 +93,23 @@
metadata.add("dc:creator", "abcdefghijabcdefghij");
metadata.add("not-allowed", "not-allowed");
List<Metadata> metadataList =
- getRecursiveMetadata(new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
- parser, metadata, new ParseContext(), true);
+ getRecursiveMetadata(
+ new ByteArrayInputStream(mock.getBytes(StandardCharsets.UTF_8)),
+ parser,
+ metadata,
+ new ParseContext(),
+ true);
assertEquals(1, metadataList.size());
metadata = metadataList.get(0);
- //test that this was removed during the filter existing stage
+ // test that this was removed during the filter existing stage
assertNull(metadata.get("not-allowed"));
- //test that this was not allowed because it isn't in the "include" list
+ // test that this was not allowed because it isn't in the "include" list
assertNull(metadata.get("dc:subject"));
String[] creators = metadata.getValues("dc:creator");
assertEquals("abcdefghijabcdefghij", creators[0]);
- //this gets more than the other test because this is filtering out some fields
+ // this gets more than the other test because this is filtering out some fields
assertEquals(3, creators.length);
assertEquals("012345678901234", creators[2]);
assertContainsCount(" hello ", metadata.get(TikaCoreProperties.TIKA_CONTENT), 30);
@@ -115,16 +118,15 @@
@Test
public void testKeySizeFilter() throws Exception {
- Metadata metadata = filter(10, 1000, 10000, 100,
- null, true);
- //test that must add keys are not truncated
+ Metadata metadata = filter(10, 1000, 10000, 100, null, true);
+ // test that must add keys are not truncated
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser1");
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser2");
metadata.add(TikaCoreProperties.TIKA_PARSED_BY, "some-long-parser3");
assertEquals(3, metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY).length);
metadata.add(OfficeOpenXMLExtended.DOC_SECURITY_STRING, "some doc-security-string");
- //truncated to 10 bytes in UTF-16 = 5 characters
+ // truncated to 10 bytes in UTF-16 = 5 characters
assertEquals("some doc-security-string", metadata.getValues("exten")[0]);
assertTruncated(metadata);
@@ -135,16 +137,14 @@
@Test
public void testAfterMaxHit() throws Exception {
- String k = "dc:creator";//20 bytes
- //key is > maxTotalBytes, so the value isn't even added
- Metadata metadata = filter(100, 10000, 10,
- 100, null, false);
+ String k = "dc:creator"; // 20 bytes
+ // key is > maxTotalBytes, so the value isn't even added
+ Metadata metadata = filter(100, 10000, 10, 100, null, false);
metadata.set(k, "ab");
assertEquals(1, metadata.names().length);
assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
- metadata = filter(100, 10000, 50, 100,
- null, false);
+ metadata = filter(100, 10000, 50, 100, null, false);
for (int i = 0; i < 10; i++) {
metadata.set(k, "abcde");
}
@@ -153,10 +153,10 @@
assertEquals("abcde", metadata.getValues(k)[0]);
assertNull(metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
- metadata.add(k, "abcde");//40
- metadata.add(k, "abc");//46
- metadata.add(k, "abcde");//only the first character is taken from this
- metadata.add(k, "abcde");//this shouldn't even be countenanced
+ metadata.add(k, "abcde"); // 40
+ metadata.add(k, "abc"); // 46
+ metadata.add(k, "abcde"); // only the first character is taken from this
+ metadata.add(k, "abcde"); // this shouldn't even be countenanced
assertEquals(2, metadata.names().length);
assertEquals(4, metadata.getValues(k).length);
@@ -166,8 +166,8 @@
assertEquals("a", metadata.getValues(k)[3]);
assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
- //this will force a reset of the total max bytes because
- //this is a set, not an add. This should get truncated at 15 chars = 30 bytes
+ // this will force a reset of the total max bytes because
+ // this is a set, not an add. This should get truncated at 15 chars = 30 bytes
metadata.set(k, "abcdefghijklmnopqrstuvwx");
assertEquals(2, metadata.names().length);
assertEquals(1, metadata.getValues(k).length);
@@ -177,14 +177,14 @@
@Test
public void testMinSizeForAlwaysInclude() throws Exception {
- //test that mimes don't get truncated
+ // test that mimes don't get truncated
Metadata metadata = filter(100, 10, 10000, 100, null, true);
String mime = getLongestMime().toString();
metadata.set(Metadata.CONTENT_TYPE, mime);
assertEquals(mime, metadata.get(Metadata.CONTENT_TYPE));
- //test that other fields are truncated
+ // test that other fields are truncated
metadata.set("dc:title", "abcdefghij");
assertEquals("abcde", metadata.get("dc:title"));
assertTruncated(metadata);
@@ -202,11 +202,22 @@
private void assertTruncated(Metadata metadata) {
assertEquals("true", metadata.get(TikaCoreProperties.TRUNCATED_METADATA));
}
- private Metadata filter(int maxKeySize, int maxFieldSize, int maxTotalBytes,
- int maxValuesPerField,
- Set<String> includeFields, boolean includeEmpty) {
- MetadataWriteFilter filter = new StandardWriteFilter(maxKeySize, maxFieldSize,
- maxTotalBytes, maxValuesPerField, includeFields, includeEmpty);
+
+ private Metadata filter(
+ int maxKeySize,
+ int maxFieldSize,
+ int maxTotalBytes,
+ int maxValuesPerField,
+ Set<String> includeFields,
+ boolean includeEmpty) {
+ MetadataWriteFilter filter =
+ new StandardWriteFilter(
+ maxKeySize,
+ maxFieldSize,
+ maxTotalBytes,
+ maxValuesPerField,
+ includeFields,
+ includeEmpty);
Metadata metadata = new Metadata();
metadata.setMetadataWriteFilter(filter);
return metadata;
@@ -226,5 +237,4 @@
}
return longest;
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
index 6c57740..8858031 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/CustomReaderTest.java
@@ -24,12 +24,10 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
-
import org.junit.jupiter.api.Test;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
-
public class CustomReaderTest {
@Test
@@ -54,8 +52,8 @@
MimeType another = mimeTypes.forName(key);
assertEquals("kittens", reader.values.get(key));
assertEquals(1, reader.ignorePatterns.size());
- assertEquals(another.toString() + ">>*" + hello.getExtension(),
- reader.ignorePatterns.get(0));
+ assertEquals(
+ another.toString() + ">>*" + hello.getExtension(), reader.ignorePatterns.get(0));
assertTrue(another.isInterpreted(), "Server-side script type not detected");
}
@@ -67,7 +65,6 @@
super(types);
}
-
@Override
public void startElement(String uri, String localName, String qName, Attributes attributes)
throws SAXException {
@@ -89,8 +86,13 @@
}
@Override
- protected void handleGlobError(MimeType type, String pattern, MimeTypeException ex,
- String qName, Attributes attributes) throws SAXException {
+ protected void handleGlobError(
+ MimeType type,
+ String pattern,
+ MimeTypeException ex,
+ String qName,
+ Attributes attributes)
+ throws SAXException {
ignorePatterns.add(type.toString() + ">>" + pattern);
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java b/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java
index 64a2beb..5ac9b47 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MediaTypeTest.java
@@ -23,14 +23,14 @@
import java.util.HashMap;
import java.util.Map;
-
import org.junit.jupiter.api.Test;
public class MediaTypeTest {
@Test
public void testBasics() {
- assertEquals("application/octet-stream",
+ assertEquals(
+ "application/octet-stream",
new MediaType("application", "octet-stream").toString());
assertEquals("text/plain", new MediaType("text", "plain").toString());
@@ -39,11 +39,12 @@
assertEquals("text/plain", new MediaType("text", "plain", parameters).toString());
parameters.put("charset", "UTF-8");
- assertEquals("text/plain; charset=UTF-8",
- new MediaType("text", "plain", parameters).toString());
+ assertEquals(
+ "text/plain; charset=UTF-8", new MediaType("text", "plain", parameters).toString());
parameters.put("x-eol-style", "crlf");
- assertEquals("text/plain; charset=UTF-8; x-eol-style=crlf",
+ assertEquals(
+ "text/plain; charset=UTF-8; x-eol-style=crlf",
new MediaType("text", "plain", parameters).toString());
}
@@ -56,11 +57,12 @@
assertEquals("text/plain", new MediaType("text", "PLAIN", parameters).toString());
parameters.put("CHARSET", "UTF-8");
- assertEquals("text/plain; charset=UTF-8",
- new MediaType("TEXT", "plain", parameters).toString());
+ assertEquals(
+ "text/plain; charset=UTF-8", new MediaType("TEXT", "plain", parameters).toString());
parameters.put("X-Eol-Style", "crlf");
- assertEquals("text/plain; charset=UTF-8; x-eol-style=crlf",
+ assertEquals(
+ "text/plain; charset=UTF-8; x-eol-style=crlf",
new MediaType("TeXt", "PlAiN", parameters).toString());
}
@@ -73,11 +75,13 @@
assertEquals("text/plain", new MediaType("text\r\n", " \tplain", parameters).toString());
parameters.put(" charset", "UTF-8");
- assertEquals("text/plain; charset=UTF-8",
+ assertEquals(
+ "text/plain; charset=UTF-8",
new MediaType("\n\ntext", "plain \r", parameters).toString());
parameters.put("\r\n\tx-eol-style \t", "crlf");
- assertEquals("text/plain; charset=UTF-8; x-eol-style=crlf",
+ assertEquals(
+ "text/plain; charset=UTF-8; x-eol-style=crlf",
new MediaType(" text", "\tplain ", parameters).toString());
}
@@ -87,8 +91,9 @@
parameters.put("a", " value with spaces ");
parameters.put("b", "text/plain");
parameters.put("c", "()<>@,;:\\\"/[]?=");
- assertEquals("text/plain; a=\" value with spaces \"; b=\"text\\/plain\"" +
- "; c=\"\\(\\)\\<\\>\\@\\,\\;\\:\\\\\\\"\\/\\[\\]\\?\\=\"",
+ assertEquals(
+ "text/plain; a=\" value with spaces \"; b=\"text\\/plain\""
+ + "; c=\"\\(\\)\\<\\>\\@\\,\\;\\:\\\\\\\"\\/\\[\\]\\?\\=\"",
new MediaType("text", "plain", parameters).toString());
}
@@ -121,9 +126,7 @@
assertTrue(gotCharset && gotFoo && gotFoo2);
}
- /**
- * Per http://tools.ietf.org/html/rfc2045#section-5.1, charset can be in quotes
- */
+ /** Per http://tools.ietf.org/html/rfc2045#section-5.1, charset can be in quotes */
@Test
public void testParseWithParamsAndQuotedCharset() {
// Typical case, with a quoted charset
@@ -171,19 +174,20 @@
assertEquals(0, type.getParameters().keySet().size());
}
- /**
- * TIKA-349
- */
+ /** TIKA-349 */
@Test
public void testOddParameters() {
- assertEquals("text/html; charset=UTF-8",
+ assertEquals(
+ "text/html; charset=UTF-8",
MediaType.parse("text/html;; charset=UTF-8").toString());
- assertEquals("text/html; charset=UTF-8",
+ assertEquals(
+ "text/html; charset=UTF-8",
MediaType.parse("text/html;; charset=UTF-8").toString());
- assertEquals("text/html; charset=UTF-8",
+ assertEquals(
+ "text/html; charset=UTF-8",
MediaType.parse("text/html;; charset=\"UTF-8\"").toString());
- assertEquals("text/html; charset=UTF-8",
+ assertEquals(
+ "text/html; charset=UTF-8",
MediaType.parse("text/html;; charset=\"UTF-8").toString());
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
index 84820ac..84d3064 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeDetectionTest.java
@@ -27,15 +27,13 @@
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
-
import org.apache.commons.io.ByteOrderMark;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.jupiter.api.BeforeAll;
+import org.junit.jupiter.api.Test;
public class MimeDetectionTest {
@@ -65,7 +63,9 @@
testFile("application/xml", "test-utf16be.xml");
testFile("application/xml", "test-long-comment.xml");
testFile("application/xslt+xml", "stylesheet.xsl");
- testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
+ testUrl(
+ "application/rdf+xml",
+ "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
"test-difficult-rdf1.xml");
testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml");
// add evil test from TIKA-327
@@ -79,7 +79,7 @@
// test HTML detection of malformed file, previously identified as image/cgm (TIKA-1170)
testFile("text/html", "test-malformed-header.html.bin");
- //test GCMD Directory Interchange Format (.dif) TIKA-1561
+ // test GCMD Directory Interchange Format (.dif) TIKA-1561
testFile("application/dif+xml", "brwNIMS_2014.dif");
// truncated xml should still be detected as xml, See TIKA-3596
@@ -103,59 +103,80 @@
@Test
public void testByteOrderMark() throws Exception {
- assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES
- .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata()));
- assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES
- .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata()));
- assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES
- .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN,
+ MIME_TYPES.detect(
+ new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN,
+ MIME_TYPES.detect(
+ new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN,
+ MIME_TYPES.detect(
+ new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata()));
}
@Test
public void testRFC822WithBOM() throws Exception {
- String header = "From: blah <blah@blah.com>\r\n" + "Received: Friday, January 24, 2020 3:24 PM\r\n" +
- "To: someone@somewhere.com\r\n" + "Cc: someone-else@other.com\r\n" +
- "Subject: Received\r\n";
+ String header =
+ "From: blah <blah@blah.com>\r\n"
+ + "Received: Friday, January 24, 2020 3:24 PM\r\n"
+ + "To: someone@somewhere.com\r\n"
+ + "Cc: someone-else@other.com\r\n"
+ + "Subject: Received\r\n";
MediaType rfc822 = MediaType.parse("message/rfc822");
- assertEquals(rfc822, MIME_TYPES.detect(UnsynchronizedByteArrayInputStream
- .builder()
- .setByteArray(header.getBytes(UTF_8))
- .get(), new Metadata()));
+ assertEquals(
+ rfc822,
+ MIME_TYPES.detect(
+ UnsynchronizedByteArrayInputStream.builder()
+ .setByteArray(header.getBytes(UTF_8))
+ .get(),
+ new Metadata()));
int utfLength = ByteOrderMark.UTF_8.length();
byte[] bytes = new byte[header.getBytes(UTF_8).length + utfLength];
System.arraycopy(ByteOrderMark.UTF_8.getBytes(), 0, bytes, 0, utfLength);
System.arraycopy(header.getBytes(UTF_8), 0, bytes, 3, header.getBytes(UTF_8).length);
- assertEquals(rfc822, MIME_TYPES.detect(UnsynchronizedByteArrayInputStream
- .builder()
- .setByteArray(bytes)
- .get(), new Metadata()));
+ assertEquals(
+ rfc822,
+ MIME_TYPES.detect(
+ UnsynchronizedByteArrayInputStream.builder().setByteArray(bytes).get(),
+ new Metadata()));
}
@Test
public void testSuperTypes() {
- assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
- MediaType.parse("text/something")));
+ assertTrue(
+ REGISTRY.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"),
+ MediaType.parse("text/something")));
- assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
- MediaType.TEXT_PLAIN));
+ assertTrue(
+ REGISTRY.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"), MediaType.TEXT_PLAIN));
- assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
- MediaType.OCTET_STREAM));
+ assertTrue(
+ REGISTRY.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"), MediaType.OCTET_STREAM));
- assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("text/something"),
- MediaType.TEXT_PLAIN));
+ assertTrue(
+ REGISTRY.isSpecializationOf(
+ MediaType.parse("text/something"), MediaType.TEXT_PLAIN));
- assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("application/something+xml"),
- MediaType.APPLICATION_XML));
+ assertTrue(
+ REGISTRY.isSpecializationOf(
+ MediaType.parse("application/something+xml"), MediaType.APPLICATION_XML));
- assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("application/something+zip"),
- MediaType.APPLICATION_ZIP));
+ assertTrue(
+ REGISTRY.isSpecializationOf(
+ MediaType.parse("application/something+zip"), MediaType.APPLICATION_ZIP));
assertTrue(REGISTRY.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN));
- assertTrue(REGISTRY.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"),
- MediaType.APPLICATION_ZIP));
+ assertTrue(
+ REGISTRY.isSpecializationOf(
+ MediaType.parse("application/vnd.apple.iwork"), MediaType.APPLICATION_ZIP));
}
@SuppressWarnings("unused")
@@ -168,8 +189,7 @@
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, url);
String mime = this.MIME_TYPES.detect(null, metadata).toString();
- assertEquals(expected, mime,
- url + " is not properly detected using only resource name");
+ assertEquals(expected, mime, url + " is not properly detected using only resource name");
}
private void testUrl(String expected, String url, String file) throws IOException {
@@ -193,13 +213,14 @@
try {
Metadata metadata = new Metadata();
String mime = this.MIME_TYPES.detect(in, metadata).toString();
- assertEquals(expected, mime,
- urlOrFileName + " is not properly detected: detected.");
+ assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected.");
- //Add resource name and test again
+ // Add resource name and test again
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName);
mime = this.MIME_TYPES.detect(in, metadata).toString();
- assertEquals(expected, mime,
+ assertEquals(
+ expected,
+ mime,
urlOrFileName + " is not properly detected after adding resource name.");
} finally {
in.close();
@@ -213,37 +234,40 @@
*/
@Test
public void testEmptyDocument() throws IOException {
- assertEquals(MediaType.OCTET_STREAM,
+ assertEquals(
+ MediaType.OCTET_STREAM,
MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), new Metadata()));
Metadata namehint = new Metadata();
namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt");
- assertEquals(MediaType.TEXT_PLAIN,
+ assertEquals(
+ MediaType.TEXT_PLAIN,
MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), namehint));
Metadata typehint = new Metadata();
typehint.set(Metadata.CONTENT_TYPE, "text/plain");
- assertEquals(MediaType.TEXT_PLAIN,
+ assertEquals(
+ MediaType.TEXT_PLAIN,
MIME_TYPES.detect(new ByteArrayInputStream(new byte[0]), typehint));
-
}
/**
- * Test for things like javascript files whose content is enclosed in XML
- * comment delimiters, but that aren't actually XML.
+ * Test for things like javascript files whose content is enclosed in XML comment delimiters,
+ * but that aren't actually XML.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
*/
@Test
public void testNotXML() throws IOException {
- assertEquals(MediaType.TEXT_PLAIN, MIME_TYPES
- .detect(new ByteArrayInputStream("<!-- test -->".getBytes(UTF_8)), new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN,
+ MIME_TYPES.detect(
+ new ByteArrayInputStream("<!-- test -->".getBytes(UTF_8)), new Metadata()));
}
/**
- * Tests that when we repeatedly test the detection of a document
- * that can be detected with Mime Magic, that we consistently
- * detect it correctly. See TIKA-391 for more details.
+ * Tests that when we repeatedly test the detection of a document that can be detected with Mime
+ * Magic, that we consistently detect it correctly. See TIKA-391 for more details.
*/
@Test
public void testMimeMagicStability() throws IOException {
@@ -253,10 +277,9 @@
}
/**
- * Tests that when two magic matches both apply, and both
- * have the same priority, we use the name to pick the
- * right one based on the glob, or the first one we
- * come across if not. See TIKA-1292 for more details.
+ * Tests that when two magic matches both apply, and both have the same priority, we use the
+ * name to pick the right one based on the glob, or the first one we come across if not. See
+ * TIKA-1292 for more details.
*/
@Test
public void testMimeMagicClashSamePriority() throws IOException {
@@ -280,9 +303,7 @@
assertEquals(helloXType, MIME_TYPES.detect(new ByteArrayInputStream(helloWorld), metadata));
}
- /**
- * Test for TIKA-3771.
- */
+ /** Test for TIKA-3771. */
@Test
public void testPNGWithSomeEmlHeaders() throws IOException {
testFile("image/png", "test-pngNotEml.bin");
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
index 0d904f6..92f542a 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
@@ -32,26 +32,20 @@
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;
-
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-
/**
- * These tests try to ensure that the MimeTypesReader
- * has correctly processed the mime-types.xml file.
- * To do this, it tests that various aspects of the
- * mime-types.xml file have ended up correctly as
- * globs, matches, magics etc.
- * <p>
- * If you make updates to mime-types.xml, then the
- * checks in this test may no longer hold true.
- * As such, if tests here start failing after your
- * changes, please review the test details, and
+ * These tests try to ensure that the MimeTypesReader has correctly processed the mime-types.xml
+ * file. To do this, it tests that various aspects of the mime-types.xml file have ended up
+ * correctly as globs, matches, magics etc.
+ *
+ * <p>If you make updates to mime-types.xml, then the checks in this test may no longer hold true.
+ * As such, if tests here start failing after your changes, please review the test details, and
* update it to match the new state of the file!
*/
public class MimeTypesReaderTest {
@@ -68,19 +62,21 @@
return mimeTypes
.detect(new ByteArrayInputStream(text.getBytes(StandardCharsets.UTF_8)), metadata)
.toString();
-
}
@SuppressWarnings("unchecked")
@BeforeEach
- public void setUp() throws NoSuchFieldException, SecurityException, IllegalArgumentException,
- IllegalAccessException {
+ public void setUp()
+ throws NoSuchFieldException,
+ SecurityException,
+ IllegalArgumentException,
+ IllegalAccessException {
this.mimeTypes = TikaConfig.getDefaultConfig().getMimeRepository();
Field magicsField = mimeTypes.getClass().getDeclaredField("magics");
magicsField.setAccessible(true);
magics = (List<Magic>) magicsField.get(mimeTypes);
- //ensure reset of custom mimes path
+ // ensure reset of custom mimes path
customMimeTypes = System.getProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP);
}
@@ -100,9 +96,12 @@
// Check on the type
MimeType html = mimeTypes.forName("text/html");
assertTrue(html.hasMagic());
- assertTrue(html.getMagics().size() >= minMatches,
- "There should be at least " + minMatches + " HTML matches, found " +
- html.getMagics().size());
+ assertTrue(
+ html.getMagics().size() >= minMatches,
+ "There should be at least "
+ + minMatches
+ + " HTML matches, found "
+ + html.getMagics().size());
// Check on the overall magics
List<Magic> htmlMagics = new ArrayList<>();
@@ -112,8 +111,12 @@
}
}
- assertTrue(htmlMagics.size() >= minMatches,
- "There should be at least " + minMatches + " HTML matches, found " + htmlMagics.size());
+ assertTrue(
+ htmlMagics.size() >= minMatches,
+ "There should be at least "
+ + minMatches
+ + " HTML matches, found "
+ + htmlMagics.size());
}
@Test
@@ -123,9 +126,12 @@
// Check on the type
MimeType excel = mimeTypes.forName("application/vnd.ms-excel");
assertTrue(excel.hasMagic());
- assertTrue(excel.getMagics().size() >= minMatches,
- "There should be at least " + minMatches + " Excel matches, found " +
- excel.getMagics().size());
+ assertTrue(
+ excel.getMagics().size() >= minMatches,
+ "There should be at least "
+ + minMatches
+ + " Excel matches, found "
+ + excel.getMagics().size());
// Check on the overall magics
List<Magic> excelMagics = new ArrayList<>();
@@ -135,9 +141,12 @@
}
}
- assertTrue(excel.getMagics().size() >= minMatches,
- "There should be at least " + minMatches + " Excel matches, found " +
- excelMagics.size());
+ assertTrue(
+ excel.getMagics().size() >= minMatches,
+ "There should be at least "
+ + minMatches
+ + " Excel matches, found "
+ + excelMagics.size());
}
/**
@@ -160,8 +169,8 @@
MimeType mime = this.mimeTypes.forName("image/bmp");
assertEquals("BMP", mime.getAcronym());
assertEquals("com.microsoft.bmp", mime.getUniformTypeIdentifier());
- assertEquals("http://en.wikipedia.org/wiki/BMP_file_format",
- mime.getLinks().get(0).toString());
+ assertEquals(
+ "http://en.wikipedia.org/wiki/BMP_file_format", mime.getLinks().get(0).toString());
mime = this.mimeTypes.forName("application/xml");
assertEquals("XML", mime.getAcronym());
@@ -200,10 +209,7 @@
assertEquals("application/x-berkeley-db", mtAltP.toString());
}
- /**
- * TIKA-746 Ensures that the custom mimetype maps were also
- * loaded and used
- */
+ /** TIKA-746 Ensures that the custom mimetype maps were also loaded and used */
@Test
public void testCustomMimeTypes() {
// Check that it knows about our three special ones
@@ -259,12 +265,11 @@
}
}
- /**
- * TIKA-2460 Test loading of custom-mimetypes.xml from sys prop.
- */
+ /** TIKA-2460 Test loading of custom-mimetypes.xml from sys prop. */
@Test
public void testExternalMimeTypes() throws Exception {
- System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
+ System.setProperty(
+ MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
"src/test/resources/org/apache/tika/mime/external-mimetypes.xml");
MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
Metadata m = new Metadata();
@@ -290,17 +295,16 @@
@Test
public void testGetAliasForJavaScript() throws Exception {
MimeType mt = this.mimeTypes.forName("text/javascript");
- Set<String> aliases = mimeTypes.getMediaTypeRegistry()
- .getAliases(mt.getType())
- .stream()
- .map(MediaType::toString)
- .collect(Collectors.toSet());
+ Set<String> aliases =
+ mimeTypes.getMediaTypeRegistry().getAliases(mt.getType()).stream()
+ .map(MediaType::toString)
+ .collect(Collectors.toSet());
assertEquals(Set.of("application/javascript", "application/x-javascript"), aliases);
}
@Test
public void testGetRegisteredMimesWithParameters() throws Exception {
- //TIKA-1692
+ // TIKA-1692
// Media Type always keeps details / parameters
String name = "application/xml; charset=UTF-8";
@@ -324,15 +328,17 @@
@Test
public void testMultiThreaded() throws Exception {
MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes();
- Executors.newFixedThreadPool(1).execute(() -> {
- try {
- for (int i = 0; i < 500 && !stop; i++) {
- mimeTypes.forName("abc" + i + "/abc");
- }
- } catch (MimeTypeException e) {
- e.printStackTrace();
- }
- });
+ Executors.newFixedThreadPool(1)
+ .execute(
+ () -> {
+ try {
+ for (int i = 0; i < 500 && !stop; i++) {
+ mimeTypes.forName("abc" + i + "/abc");
+ }
+ } catch (MimeTypeException e) {
+ e.printStackTrace();
+ }
+ });
for (int i = 0; i < 500 & !stop; i++) {
mimeTypes.getMediaTypeRegistry().getAliases(MediaType.APPLICATION_ZIP);
@@ -341,63 +347,75 @@
@Test
public void testMinShouldMatch() throws Exception {
- System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
+ System.setProperty(
+ MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
"src/test/resources/org/apache/tika/mime/custom-mimetypes-minShouldMatch.xml");
MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(new CustomClassLoader());
- //matches one
- assertEquals("hello/world-min-file",
- getTypeAsString(mimeTypes, "Hello World!", new Metadata()));
+ // matches one
+ assertEquals(
+ "hello/world-min-file", getTypeAsString(mimeTypes, "Hello World!", new Metadata()));
- //matches two
- assertEquals("hello/world-min-file",
- getTypeAsString(mimeTypes, "Hello Welt!", new Metadata()));
+ // matches two
+ assertEquals(
+ "hello/world-min-file", getTypeAsString(mimeTypes, "Hello Welt!", new Metadata()));
- //matches two
- assertEquals("hello/world-min-file",
- getTypeAsString(mimeTypes, "Hallo Welt!", new Metadata()));
+ // matches two
+ assertEquals(
+ "hello/world-min-file", getTypeAsString(mimeTypes, "Hallo Welt!", new Metadata()));
- //missing !
+ // missing !
assertEquals("text/plain", getTypeAsString(mimeTypes, "Hello World", new Metadata()));
- //Hello requires world, welt or hallo; monde requires bonjour le
+ // Hello requires world, welt or hallo; monde requires bonjour le
assertEquals("text/plain", getTypeAsString(mimeTypes, "Hello Monde", new Metadata()));
- //this matcher is treated as "or" with minshouldmatch clause
- assertEquals("hello/world-min-file",
+ // this matcher is treated as "or" with minshouldmatch clause
+ assertEquals(
+ "hello/world-min-file",
getTypeAsString(mimeTypes, "Bonjour le Monde!", new Metadata()));
-
}
@Test
public void testBadMinShouldMatch1() {
- System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
+ System.setProperty(
+ MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
"src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch1.xml");
- assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
+ assertThrows(
+ IllegalArgumentException.class,
+ () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
}
@Test
public void testBadMinShouldMatch2() {
- System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
+ System.setProperty(
+ MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
"src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch2.xml");
- assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
+ assertThrows(
+ IllegalArgumentException.class,
+ () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
}
@Test
public void testBadMinShouldMatch3() {
- System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
+ System.setProperty(
+ MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
"src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch3.xml");
- assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
+ assertThrows(
+ IllegalArgumentException.class,
+ () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
}
@Test
public void testBadMinShouldMatch4() {
- System.setProperty(MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
+ System.setProperty(
+ MimeTypesFactory.CUSTOM_MIMES_SYS_PROP,
"src/test/resources/org/apache/tika/mime/custom-mimetypes-badMinShouldMatch4.xml");
- assertThrows(IllegalArgumentException.class, () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
+ assertThrows(
+ IllegalArgumentException.class,
+ () -> MimeTypes.getDefaultMimeTypes(new CustomClassLoader()));
}
- private static class CustomClassLoader extends ClassLoader {
- }
+ private static class CustomClassLoader extends ClassLoader {}
}
diff --git a/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java b/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java
index 25721b1..6ddc6db 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/PatternsTest.java
@@ -21,7 +21,6 @@
import static org.junit.jupiter.api.Assertions.fail;
import java.util.List;
-
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
@@ -39,9 +38,7 @@
text = types.forName("text/plain");
}
- /**
- * Test add()
- */
+ /** Test add() */
@Test
public void testAdd() throws MimeTypeException {
try {
@@ -64,9 +61,7 @@
}
}
- /**
- * Test matches()
- */
+ /** Test matches() */
@Test
public void testMatches() {
try {
@@ -96,5 +91,4 @@
assertTrue(extensions.contains(".jpg"));
assertTrue(extensions.contains(".jpeg"));
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
index c9d0073..5220f81 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTest.java
@@ -28,12 +28,10 @@
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
-
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
public class ProbabilisticMimeDetectionTest {
@@ -63,7 +61,9 @@
testFile("application/xml", "test-utf16be.xml");
testFile("application/xml", "test-long-comment.xml");
testFile("application/xslt+xml", "stylesheet.xsl");
- testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
+ testUrl(
+ "application/rdf+xml",
+ "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
"test-difficult-rdf1.xml");
testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml");
// add evil test from TIKA-327
@@ -81,39 +81,53 @@
@Test
public void testByteOrderMark() throws Exception {
- assertEquals(MediaType.TEXT_PLAIN, proDetector
- .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata()));
- assertEquals(MediaType.TEXT_PLAIN, proDetector
- .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN,
+ proDetector.detect(
+ new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN,
+ proDetector.detect(
+ new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata()));
- assertEquals(MediaType.TEXT_PLAIN, proDetector
- .detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN,
+ proDetector.detect(
+ new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata()));
}
@Test
public void testSuperTypes() {
- assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
- MediaType.parse("text/something")));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"),
+ MediaType.parse("text/something")));
- assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
- MediaType.TEXT_PLAIN));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"), MediaType.TEXT_PLAIN));
- assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
- MediaType.OCTET_STREAM));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"), MediaType.OCTET_STREAM));
- assertTrue(registry.isSpecializationOf(MediaType.parse("text/something"),
- MediaType.TEXT_PLAIN));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("text/something"), MediaType.TEXT_PLAIN));
- assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+xml"),
- MediaType.APPLICATION_XML));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("application/something+xml"), MediaType.APPLICATION_XML));
- assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+zip"),
- MediaType.APPLICATION_ZIP));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("application/something+zip"), MediaType.APPLICATION_ZIP));
assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN));
- assertTrue(registry.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"),
- MediaType.APPLICATION_ZIP));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("application/vnd.apple.iwork"), MediaType.APPLICATION_ZIP));
}
@SuppressWarnings("unused")
@@ -143,13 +157,14 @@
try {
Metadata metadata = new Metadata();
String mime = this.proDetector.detect(in, metadata).toString();
- assertEquals(expected, mime,
- urlOrFileName + " is not properly detected: detected.");
+ assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected.");
// Add resource name and test again
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName);
mime = this.proDetector.detect(in, metadata).toString();
- assertEquals(expected, mime,
+ assertEquals(
+ expected,
+ mime,
urlOrFileName + " is not properly detected after adding resource name.");
} finally {
in.close();
@@ -159,43 +174,44 @@
/**
* Test for type detection of empty documents.
*
- * @see <a
- * href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
*/
@Test
public void testEmptyDocument() throws IOException {
- assertEquals(MediaType.OCTET_STREAM,
+ assertEquals(
+ MediaType.OCTET_STREAM,
proDetector.detect(new ByteArrayInputStream(new byte[0]), new Metadata()));
Metadata namehint = new Metadata();
namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt");
- assertEquals(MediaType.TEXT_PLAIN,
+ assertEquals(
+ MediaType.TEXT_PLAIN,
proDetector.detect(new ByteArrayInputStream(new byte[0]), namehint));
Metadata typehint = new Metadata();
typehint.set(Metadata.CONTENT_TYPE, "text/plain");
- assertEquals(MediaType.TEXT_PLAIN,
+ assertEquals(
+ MediaType.TEXT_PLAIN,
proDetector.detect(new ByteArrayInputStream(new byte[0]), typehint));
-
}
/**
- * Test for things like javascript files whose content is enclosed in XML
- * comment delimiters, but that aren't actually XML.
+ * Test for things like javascript files whose content is enclosed in XML comment delimiters,
+ * but that aren't actually XML.
*
- * @see <a
- * href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
*/
@Test
public void testNotXML() throws IOException {
- assertEquals(MediaType.TEXT_PLAIN, proDetector
- .detect(new ByteArrayInputStream("<!-- test -->".getBytes(UTF_8)), new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN,
+ proDetector.detect(
+ new ByteArrayInputStream("<!-- test -->".getBytes(UTF_8)), new Metadata()));
}
/**
- * Tests that when we repeatedly test the detection of a document that can
- * be detected with Mime Magic, that we consistently detect it correctly.
- * See TIKA-391 for more details.
+ * Tests that when we repeatedly test the detection of a document that can be detected with Mime
+ * Magic, that we consistently detect it correctly. See TIKA-391 for more details.
*/
@Test
public void testMimeMagicStability() throws IOException {
@@ -205,9 +221,9 @@
}
/**
- * Tests that when two magic matches both apply, and both have the same
- * priority, we use the name to pick the right one based on the glob, or the
- * first one we come across if not. See TIKA-1292 for more details.
+ * Tests that when two magic matches both apply, and both have the same priority, we use the
+ * name to pick the right one based on the glob, or the first one we come across if not. See
+ * TIKA-1292 for more details.
*/
@Test
public void testMimeMagicClashSamePriority() throws IOException {
@@ -223,24 +239,29 @@
metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.x-hello-world");
- assertEquals(helloXType,
- proDetector.detect(new ByteArrayInputStream(helloWorld), metadata));
+ assertEquals(
+ helloXType, proDetector.detect(new ByteArrayInputStream(helloWorld), metadata));
// Without, goes for the one that sorts last
metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testingTESTINGtesting");
- assertEquals(helloXType,
- proDetector.detect(new ByteArrayInputStream(helloWorld), metadata));
+ assertEquals(
+ helloXType, proDetector.detect(new ByteArrayInputStream(helloWorld), metadata));
}
@Test
public void testTIKA2237() throws IOException {
Metadata metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, MediaType.text("javascript").toString());
- InputStream input = new ByteArrayInputStream(
- ("function() {};\n" + "try {\n" + " window.location = 'index.html';\n" +
- "} catch (e) {\n" + " console.log(e);\n" + "}")
- .getBytes(StandardCharsets.UTF_8));
+ InputStream input =
+ new ByteArrayInputStream(
+ ("function() {};\n"
+ + "try {\n"
+ + " window.location = 'index.html';\n"
+ + "} catch (e) {\n"
+ + " console.log(e);\n"
+ + "}")
+ .getBytes(StandardCharsets.UTF_8));
MediaType detect = new ProbabilisticMimeDetectionSelector().detect(input, metadata);
assertEquals(MediaType.text("javascript"), detect);
}
diff --git a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
index 69ef03a..4f28fd0 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/ProbabilisticMimeDetectionTestWithTika.java
@@ -27,16 +27,14 @@
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
-
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.Tika;
import org.apache.tika.config.ServiceLoader;
import org.apache.tika.detect.DefaultProbDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.ProbabilisticMimeDetectionSelector.Builder;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
public class ProbabilisticMimeDetectionTestWithTika {
@@ -58,9 +56,12 @@
* instantiate the object.
*/
Builder builder = new ProbabilisticMimeDetectionSelector.Builder();
- proSelector = new ProbabilisticMimeDetectionSelector(types,
- builder.priorMagicFileType(0.5f).priorExtensionFileType(0.5f)
- .priorMetaFileType(0.5f));
+ proSelector =
+ new ProbabilisticMimeDetectionSelector(
+ types,
+ builder.priorMagicFileType(0.5f)
+ .priorExtensionFileType(0.5f)
+ .priorMetaFileType(0.5f));
DefaultProbDetector detector = new DefaultProbDetector(proSelector, loader);
// Use a default Tika, except for our different detector
@@ -80,7 +81,9 @@
testFile("application/xml", "test-utf16be.xml");
testFile("application/xml", "test-long-comment.xml");
testFile("application/xslt+xml", "stylesheet.xsl");
- testUrl("application/rdf+xml", "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
+ testUrl(
+ "application/rdf+xml",
+ "http://www.ai.sri.com/daml/services/owl-s/1.2/Process.owl",
"test-difficult-rdf1.xml");
testUrl("application/rdf+xml", "http://www.w3.org/2002/07/owl#", "test-difficult-rdf2.xml");
// add evil test from TIKA-327
@@ -98,43 +101,54 @@
@Test
public void testByteOrderMark() throws Exception {
- assertEquals(MediaType.TEXT_PLAIN.toString(),
- tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)),
- new Metadata()));
- assertEquals(MediaType.TEXT_PLAIN.toString(),
- tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)),
- new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN.toString(),
+ tika.detect(
+ new ByteArrayInputStream("\ufefftest".getBytes(UTF_16LE)), new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN.toString(),
+ tika.detect(
+ new ByteArrayInputStream("\ufefftest".getBytes(UTF_16BE)), new Metadata()));
- assertEquals(MediaType.TEXT_PLAIN.toString(),
- tika.detect(new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)),
- new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN.toString(),
+ tika.detect(
+ new ByteArrayInputStream("\ufefftest".getBytes(UTF_8)), new Metadata()));
}
@Test
public void testSuperTypes() {
- assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
- MediaType.parse("text/something")));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"),
+ MediaType.parse("text/something")));
- assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
- MediaType.TEXT_PLAIN));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"), MediaType.TEXT_PLAIN));
- assertTrue(registry.isSpecializationOf(MediaType.parse("text/something; charset=UTF-8"),
- MediaType.OCTET_STREAM));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("text/something; charset=UTF-8"), MediaType.OCTET_STREAM));
- assertTrue(registry.isSpecializationOf(MediaType.parse("text/something"),
- MediaType.TEXT_PLAIN));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("text/something"), MediaType.TEXT_PLAIN));
- assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+xml"),
- MediaType.APPLICATION_XML));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("application/something+xml"), MediaType.APPLICATION_XML));
- assertTrue(registry.isSpecializationOf(MediaType.parse("application/something+zip"),
- MediaType.APPLICATION_ZIP));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("application/something+zip"), MediaType.APPLICATION_ZIP));
assertTrue(registry.isSpecializationOf(MediaType.APPLICATION_XML, MediaType.TEXT_PLAIN));
- assertTrue(registry.isSpecializationOf(MediaType.parse("application/vnd.apple.iwork"),
- MediaType.APPLICATION_ZIP));
+ assertTrue(
+ registry.isSpecializationOf(
+ MediaType.parse("application/vnd.apple.iwork"), MediaType.APPLICATION_ZIP));
}
@SuppressWarnings("unused")
@@ -165,14 +179,15 @@
Metadata metadata = new Metadata();
// String mime = this.proDetector.detect(in, metadata).toString();
String mime = tika.detect(in, metadata);
- assertEquals(expected, mime,
- urlOrFileName + " is not properly detected: detected.");
+ assertEquals(expected, mime, urlOrFileName + " is not properly detected: detected.");
// Add resource name and test again
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, urlOrFileName);
// mime = this.proDetector.detect(in, metadata).toString();
mime = tika.detect(in, metadata);
- assertEquals(expected, mime,
+ assertEquals(
+ expected,
+ mime,
urlOrFileName + " is not properly detected after adding resource name.");
} finally {
in.close();
@@ -182,44 +197,44 @@
/**
* Test for type detection of empty documents.
*
- * @see <a
- * href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-483">TIKA-483</a>
*/
@Test
public void testEmptyDocument() throws IOException {
- assertEquals(MediaType.OCTET_STREAM.toString(),
+ assertEquals(
+ MediaType.OCTET_STREAM.toString(),
tika.detect(new ByteArrayInputStream(new byte[0]), new Metadata()));
Metadata namehint = new Metadata();
namehint.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.txt");
- assertEquals(MediaType.TEXT_PLAIN.toString(),
+ assertEquals(
+ MediaType.TEXT_PLAIN.toString(),
tika.detect(new ByteArrayInputStream(new byte[0]), namehint));
Metadata typehint = new Metadata();
typehint.set(Metadata.CONTENT_TYPE, "text/plain");
- assertEquals(MediaType.TEXT_PLAIN.toString(),
+ assertEquals(
+ MediaType.TEXT_PLAIN.toString(),
tika.detect(new ByteArrayInputStream(new byte[0]), typehint));
-
}
/**
- * Test for things like javascript files whose content is enclosed in XML
- * comment delimiters, but that aren't actually XML.
+ * Test for things like javascript files whose content is enclosed in XML comment delimiters,
+ * but that aren't actually XML.
*
- * @see <a
- * href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
+ * @see <a href="https://issues.apache.org/jira/browse/TIKA-426">TIKA-426</a>
*/
@Test
public void testNotXML() throws IOException {
- assertEquals(MediaType.TEXT_PLAIN.toString(),
- tika.detect(new ByteArrayInputStream("<!-- test -->".getBytes(UTF_8)),
- new Metadata()));
+ assertEquals(
+ MediaType.TEXT_PLAIN.toString(),
+ tika.detect(
+ new ByteArrayInputStream("<!-- test -->".getBytes(UTF_8)), new Metadata()));
}
/**
- * Tests that when we repeatedly test the detection of a document that can
- * be detected with Mime Magic, that we consistently detect it correctly.
- * See TIKA-391 for more details.
+ * Tests that when we repeatedly test the detection of a document that can be detected with Mime
+ * Magic, that we consistently detect it correctly. See TIKA-391 for more details.
*/
@Test
public void testMimeMagicStability() throws IOException {
@@ -229,9 +244,9 @@
}
/**
- * Tests that when two magic matches both apply, and both have the same
- * priority, we use the name to pick the right one based on the glob, or the
- * first one we come across if not. See TIKA-1292 for more details.
+ * Tests that when two magic matches both apply, and both have the same priority, we use the
+ * name to pick the right one based on the glob, or the first one we come across if not. See
+ * TIKA-1292 for more details.
*/
@Test
public void testMimeMagicClashSamePriority() throws IOException {
@@ -243,18 +258,18 @@
// With a filename, picks the right one
metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.hello.world");
- assertEquals(helloType.toString(),
- tika.detect(new ByteArrayInputStream(helloWorld), metadata));
+ assertEquals(
+ helloType.toString(), tika.detect(new ByteArrayInputStream(helloWorld), metadata));
metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.x-hello-world");
- assertEquals(helloXType.toString(),
- tika.detect(new ByteArrayInputStream(helloWorld), metadata));
+ assertEquals(
+ helloXType.toString(), tika.detect(new ByteArrayInputStream(helloWorld), metadata));
// Without, goes for the one that sorts last
metadata = new Metadata();
metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "testingTESTINGtesting");
- assertEquals(helloXType.toString(),
- tika.detect(new ByteArrayInputStream(helloWorld), metadata));
+ assertEquals(
+ helloXType.toString(), tika.detect(new ByteArrayInputStream(helloWorld), metadata));
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java
index 7340e06..ac395c2 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/RFC822DetectionTest.java
@@ -20,12 +20,10 @@
import java.io.IOException;
import java.nio.charset.StandardCharsets;
-
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
+import org.junit.jupiter.api.Test;
public class RFC822DetectionTest {
@@ -33,47 +31,59 @@
@Test
public void testBasic() throws Exception {
- for (String txt : new String[]{
- "Date: blah\nSent: someone\r\nthis is a test",
- "date: blah\nSent: someone\r\nthis is a test",
- "date: blah\nDelivered-To: someone\r\nthis is a test"
- }) {
+ for (String txt :
+ new String[] {
+ "Date: blah\nSent: someone\r\nthis is a test",
+ "date: blah\nSent: someone\r\nthis is a test",
+ "date: blah\nDelivered-To: someone\r\nthis is a test"
+ }) {
assertMime("message/rfc822", txt);
}
- for (String txt : new String[]{
- //test missing colon
- "Date blah\nSent: someone\r\nthis is a test",
- //test precursor junk
- "some precursor junk Date: blah\nSent: someone\r\nthis is a test",
- "some precursor junk\nDate: blah\nSent: someone\r\nthis is a test",
- "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test",
- //confirm that date is case-insensitive, but delivered-to is case-sensitive
- "date: blah\ndelivered-To: someone\r\nthis is a test",
- //test that a file that starts only with "Subject:" and no other header is
- //detected as text/plain
- "Subject: this is a subject\nand there's some other text",
- "To: someone\nand there's some other text",
- "To: someone or other"
- }) {
+ for (String txt :
+ new String[] {
+ // test missing colon
+ "Date blah\nSent: someone\r\nthis is a test",
+ // test precursor junk
+ "some precursor junk Date: blah\nSent: someone\r\nthis is a test",
+ "some precursor junk\nDate: blah\nSent: someone\r\nthis is a test",
+ "some precursor junk:\nDate: blah\nSent: someone\r\nthis is a test",
+ // confirm that date is case-insensitive, but delivered-to is case-sensitive
+ "date: blah\ndelivered-To: someone\r\nthis is a test",
+ // test that a file that starts only with "Subject:" and no other header is
+ // detected as text/plain
+ "Subject: this is a subject\nand there's some other text",
+ "To: someone\nand there's some other text",
+ "To: someone or other"
+ }) {
assertMime("text/plain", txt);
}
- //TIKA-4153, specifically
- String txt = "Some text here 1.\n" + "Some text here 2.\n" + "Some text here 3.\n" +
- "Original Message-----\n" + "From: some_mail@abc.com\n" +
- "Sent: Thursday, October 31, 2019 9:52 AM\n" +
- "To: Some person, (The XYZ group)\n" +
- "Subject: RE: Mr. Random person phone call: MESSAGE\n" + "Hi,\n" +
- "I am available now to receive the call.\n" + "Some text here 4.\n" +
- "Some text here 5.\n" + "Some text here 6.";
+ // TIKA-4153, specifically
+ String txt =
+ "Some text here 1.\n"
+ + "Some text here 2.\n"
+ + "Some text here 3.\n"
+ + "Original Message-----\n"
+ + "From: some_mail@abc.com\n"
+ + "Sent: Thursday, October 31, 2019 9:52 AM\n"
+ + "To: Some person, (The XYZ group)\n"
+ + "Subject: RE: Mr. Random person phone call: MESSAGE\n"
+ + "Hi,\n"
+ + "I am available now to receive the call.\n"
+ + "Some text here 4.\n"
+ + "Some text here 5.\n"
+ + "Some text here 6.";
assertMime("text/plain", txt);
}
private void assertMime(String expected, String txt) throws IOException {
MediaType mediaType =
- MIME_TYPES.detect(UnsynchronizedByteArrayInputStream.builder()
- .setByteArray(txt.getBytes(StandardCharsets.UTF_8)).get(), new Metadata());
+ MIME_TYPES.detect(
+ UnsynchronizedByteArrayInputStream.builder()
+ .setByteArray(txt.getBytes(StandardCharsets.UTF_8))
+ .get(),
+ new Metadata());
assertEquals(expected, mediaType.toString(), txt);
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 62b061d..026f6fd 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -20,9 +20,6 @@
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.InputStream;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.EmbeddedBytesSelector;
import org.apache.tika.extractor.RUnpackExtractor;
@@ -30,14 +27,15 @@
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.utils.StringUtils;
+import org.junit.jupiter.api.Test;
public class AutoDetectParserConfigTest {
@Test
public void testEmbeddedBytesSelector() throws Exception {
TikaConfig config;
- try (InputStream is = TikaConfig.class.getResourceAsStream(
- "TIKA-4207-embedded-bytes-config.xml")) {
+ try (InputStream is =
+ TikaConfig.class.getResourceAsStream("TIKA-4207-embedded-bytes-config.xml")) {
config = new TikaConfig(is);
}
AutoDetectParserConfig c = config.getAutoDetectParserConfig();
@@ -56,7 +54,6 @@
assertFalse(selector.select(getMetadata("application/pdf", "MACRO")));
assertFalse(selector.select(getMetadata("application/docx", "")));
-
}
private Metadata getMetadata(String mime, String embeddedResourceType) {
diff --git a/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java
index 5519dce..e99756c 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/CompositeParserTest.java
@@ -26,36 +26,37 @@
import java.util.List;
import java.util.Map;
import java.util.Set;
-
-import org.junit.jupiter.api.Test;
-import org.xml.sax.ContentHandler;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.sax.BodyContentHandler;
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
public class CompositeParserTest {
@Test
@SuppressWarnings("serial")
public void testFindDuplicateParsers() {
- Parser a = new EmptyParser() {
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.singleton(MediaType.TEXT_PLAIN);
- }
- };
- Parser b = new EmptyParser() {
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.singleton(MediaType.TEXT_PLAIN);
- }
- };
- Parser c = new EmptyParser() {
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.singleton(MediaType.OCTET_STREAM);
- }
- };
+ Parser a =
+ new EmptyParser() {
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(MediaType.TEXT_PLAIN);
+ }
+ };
+ Parser b =
+ new EmptyParser() {
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(MediaType.TEXT_PLAIN);
+ }
+ };
+ Parser c =
+ new EmptyParser() {
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(MediaType.OCTET_STREAM);
+ }
+ };
CompositeParser composite =
new CompositeParser(MediaTypeRegistry.getDefaultRegistry(), a, b, c);
@@ -86,24 +87,26 @@
bmpCanonicalMetadata.put("BMP", "True");
bmpCanonicalMetadata.put("Canonical", "True");
Parser bmpCanonicalParser =
- new DummyParser(new HashSet<>(Collections.singletonList(bmpCanonical)),
- bmpCanonicalMetadata, null);
+ new DummyParser(
+ new HashSet<>(Collections.singletonList(bmpCanonical)),
+ bmpCanonicalMetadata,
+ null);
MediaType bmpAlias = MediaType.image("x-ms-bmp");
Map<String, String> bmpAliasMetadata = new HashMap<>();
bmpAliasMetadata.put("BMP", "True");
bmpAliasMetadata.put("Alias", "True");
Parser bmpAliasParser =
- new DummyParser(new HashSet<>(Collections.singletonList(bmpAlias)), bmpAliasMetadata,
- null);
+ new DummyParser(
+ new HashSet<>(Collections.singletonList(bmpAlias)), bmpAliasMetadata, null);
TikaConfig config = TikaConfig.getDefaultConfig();
CompositeParser canonical =
new CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser);
CompositeParser alias = new CompositeParser(config.getMediaTypeRegistry(), bmpAliasParser);
CompositeParser both =
- new CompositeParser(config.getMediaTypeRegistry(), bmpCanonicalParser,
- bmpAliasParser);
+ new CompositeParser(
+ config.getMediaTypeRegistry(), bmpCanonicalParser, bmpAliasParser);
ContentHandler handler = new BodyContentHandler();
Metadata metadata;
@@ -111,12 +114,11 @@
// Canonical and Canonical
metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
- canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata,
- new ParseContext());
+ canonical.parse(
+ new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
assertEquals("True", metadata.get("BMP"));
assertEquals("True", metadata.get("Canonical"));
-
// Alias and Alias
metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
@@ -124,16 +126,14 @@
assertEquals("True", metadata.get("BMP"));
assertEquals("True", metadata.get("Alias"));
-
// Alias type and Canonical parser
metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, bmpAlias.toString());
- canonical.parse(new ByteArrayInputStream(new byte[0]), handler, metadata,
- new ParseContext());
+ canonical.parse(
+ new ByteArrayInputStream(new byte[0]), handler, metadata, new ParseContext());
assertEquals("True", metadata.get("BMP"));
assertEquals("True", metadata.get("Canonical"));
-
// Canonical type and Alias parser
metadata = new Metadata();
metadata.add(Metadata.CONTENT_TYPE, bmpCanonical.toString());
@@ -141,7 +141,6 @@
assertEquals("True", metadata.get("BMP"));
assertEquals("True", metadata.get("Alias"));
-
// And when both are there, will go for the last one
// to be registered (which is the alias one)
metadata = new Metadata();
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java
index 7b329fa..384c060 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyInitializableParser.java
@@ -22,10 +22,6 @@
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -34,11 +30,10 @@
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
-/**
- * This tests that initialize() is called after adding the parameters
- * configured via TikaConfig
- */
+/** This tests that initialize() is called after adding the parameters configured via TikaConfig */
public class DummyInitializableParser implements Parser, Initializable {
public static String SUM_FIELD = "SUM";
@@ -48,10 +43,8 @@
MIMES.add(MediaType.TEXT_PLAIN);
}
- @Field
- private short shortA = -2;
- @Field
- private short shortB = -3;
+ @Field private short shortA = -2;
+ @Field private short shortB = -3;
private int sum = 0;
@Override
@@ -60,8 +53,9 @@
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
metadata.set(SUM_FIELD, Integer.toString(sum));
}
@@ -75,10 +69,10 @@
@Override
public void checkInitialization(InitializableProblemHandler handler)
throws TikaConfigException {
- //completely arbitrary
+ // completely arbitrary
if (sum > 1000) {
- handler.handleInitializableProblem("DummyInitializableParser",
- "sum cannot be > 1000: " + sum);
+ handler.handleInitializableProblem(
+ "DummyInitializableParser", "sum cannot be > 1000: " + sum);
}
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
index 5483474..da75932 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParameterizedParser.java
@@ -26,14 +26,12 @@
import java.net.URL;
import java.util.HashSet;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* A test Parsers to test {@link Field}
@@ -54,36 +52,26 @@
@Field(name = "testparam")
private String testParam = "init_string";
- @Field
- private short xshort = -2;
+ @Field private short xshort = -2;
- @Field
- private int xint = -3;
+ @Field private int xint = -3;
- @Field
- private long xlong = -4;
+ @Field private long xlong = -4;
@Field(name = "xbigint")
private BigInteger xbigInt;
- @Field
- private float xfloat = -5.0f;
+ @Field private float xfloat = -5.0f;
- @Field
- private double xdouble = -6.0d;
+ @Field private double xdouble = -6.0d;
- @Field
- private boolean xbool = true;
+ @Field private boolean xbool = true;
- @Field
- private URL xurl;
+ @Field private URL xurl;
- @Field
- private URI xuri;
+ @Field private URI xuri;
- @Field
- private String missing = "default";
-
+ @Field private String missing = "default";
private final String inner = "inner";
private File xfile;
@@ -113,8 +101,9 @@
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
metadata.add("testparam", testParam);
metadata.add("xshort", xshort + "");
diff --git a/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java b/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
index 9b1ffcc..240265f 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/DummyParser.java
@@ -21,19 +21,17 @@
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
-
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
* A Dummy Parser for use with unit tests.
- * <p>
- * See also {@link org.apache.tika.parser.mock.MockParser}.
+ *
+ * <p>See also {@link org.apache.tika.parser.mock.MockParser}.
*/
public class DummyParser implements Parser {
private final Set<MediaType> types;
@@ -50,8 +48,9 @@
return types;
}
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
for (Entry<String, String> m : this.metadata.entrySet()) {
metadata.add(m.getKey(), m.getValue());
}
@@ -63,5 +62,4 @@
}
xhtml.endDocument();
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java
index 9571ab2..c42bbee 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/InitializableParserTest.java
@@ -20,13 +20,11 @@
import java.net.URL;
import java.nio.charset.StandardCharsets;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.junit.jupiter.api.Test;
public class InitializableParserTest {
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
index 7ad198f..9550fd3 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ParameterizedParserTest.java
@@ -24,44 +24,42 @@
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
-
-import org.junit.jupiter.api.Test;
-import org.xml.sax.SAXException;
-
import org.apache.tika.Tika;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaConfigException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.junit.jupiter.api.Test;
+import org.xml.sax.SAXException;
public class ParameterizedParserTest {
- private static final Map<String, String> expcted = new HashMap<String, String>() {
- {
- put("testparam", "testparamval");
- put("xshort", "1000");
- put("xint", "999999999");
- put("xlong", "9999999999999");
- put("xbigint", "99999999999999999999999999999999999999999999999");
- put("xfloat", "10.2");
- put("xbool", "true");
- put("xdouble", "4.6");
- put("xurl", "http://apache.org");
- put("xfile", "somefile");
- put("xuri", "tika://customuri?param=value");
+ private static final Map<String, String> expcted =
+ new HashMap<String, String>() {
+ {
+ put("testparam", "testparamval");
+ put("xshort", "1000");
+ put("xint", "999999999");
+ put("xlong", "9999999999999");
+ put("xbigint", "99999999999999999999999999999999999999999999999");
+ put("xfloat", "10.2");
+ put("xbool", "true");
+ put("xdouble", "4.6");
+ put("xurl", "http://apache.org");
+ put("xfile", "somefile");
+ put("xuri", "tika://customuri?param=value");
- put("inner", "inner");
- put("missing", "default");
- }
- };
-
+ put("inner", "inner");
+ put("missing", "default");
+ }
+ };
@Test
public void testConfigurableParserTypes() throws Exception {
Metadata md = getMetadata("TIKA-1986-parameterized.xml");
for (Map.Entry<String, String> entry : expcted.entrySet()) {
- assertEquals(entry.getValue(),
- md.get(entry.getKey()), "mismatch for " + entry.getKey());
+ assertEquals(
+ entry.getValue(), md.get(entry.getKey()), "mismatch for " + entry.getKey());
}
}
@@ -69,16 +67,15 @@
public void testConfigurableParserTypesDecorated() throws Exception {
Metadata md = getMetadata("TIKA-1986-parameterized-decorated.xml");
for (Map.Entry<String, String> entry : expcted.entrySet()) {
- assertEquals(entry.getValue(),
- md.get(entry.getKey()), "mismatch for " + entry.getKey());
+ assertEquals(
+ entry.getValue(), md.get(entry.getKey()), "mismatch for " + entry.getKey());
}
}
-
@Test
public void testSomeParams() throws Exception {
- //test that a parameterized parser can read a config file
- //with only some changes to the initial values
+ // test that a parameterized parser can read a config file
+ // with only some changes to the initial values
Metadata md = getMetadata("TIKA-1986-some-parameters.xml");
assertEquals("-6.0", md.get("xdouble"));
assertEquals("testparamval", md.get("testparam"));
@@ -87,20 +84,24 @@
@Test
public void testBadValue() throws Exception {
- assertThrows(TikaConfigException.class, () -> {
- getMetadata("TIKA-1986-bad-values.xml");
- });
+ assertThrows(
+ TikaConfigException.class,
+ () -> {
+ getMetadata("TIKA-1986-bad-values.xml");
+ });
}
@Test
public void testBadType() throws Exception {
- assertThrows(TikaConfigException.class, () -> {
- getMetadata("TIKA-1986-bad-types.xml");
- });
+ assertThrows(
+ TikaConfigException.class,
+ () -> {
+ getMetadata("TIKA-1986-bad-types.xml");
+ });
}
- //TODO later -- add a test for a parser that isn't configurable
- //but that has params in the config file
+ // TODO later -- add a test for a parser that isn't configurable
+ // but that has params in the config file
private Metadata getMetadata(String name) throws TikaException, IOException, SAXException {
URL url = this.getClass().getResource("/org/apache/tika/config/" + name);
diff --git a/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java b/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java
index 141c058..30ed57b 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/ParserDecoratorTest.java
@@ -25,12 +25,10 @@
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.sax.BodyContentHandler;
+import org.junit.jupiter.api.Test;
public class ParserDecoratorTest {
@@ -46,7 +44,6 @@
Set<MediaType> types;
ParseContext context = new ParseContext();
-
// With a parser of no types, get the decorated type
p = ParserDecorator.withTypes(EmptyParser.INSTANCE, onlyTxt);
types = p.getSupportedTypes(context);
@@ -54,34 +51,28 @@
assertTrue(types.contains(MediaType.TEXT_PLAIN), types.toString());
// With a parser with other types, still just the decorated type
- p = ParserDecorator
- .withTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt);
+ p = ParserDecorator.withTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt);
types = p.getSupportedTypes(context);
assertEquals(1, types.size());
assertTrue(types.contains(MediaType.TEXT_PLAIN), types.toString());
-
// Exclude will remove if there
p = ParserDecorator.withoutTypes(EmptyParser.INSTANCE, onlyTxt);
types = p.getSupportedTypes(context);
assertEquals(0, types.size());
- p = ParserDecorator
- .withoutTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt);
+ p = ParserDecorator.withoutTypes(new DummyParser(onlyOct, new HashMap<>(), ""), onlyTxt);
types = p.getSupportedTypes(context);
assertEquals(1, types.size());
assertTrue(types.contains(MediaType.OCTET_STREAM), types.toString());
- p = ParserDecorator
- .withoutTypes(new DummyParser(both, new HashMap<>(), ""), onlyTxt);
+ p = ParserDecorator.withoutTypes(new DummyParser(both, new HashMap<>(), ""), onlyTxt);
types = p.getSupportedTypes(context);
assertEquals(1, types.size());
assertTrue(types.contains(MediaType.OCTET_STREAM), types.toString());
}
- /**
- * Testing one proposed implementation for TIKA-1509
- */
+ /** Testing one proposed implementation for TIKA-1509 */
@Test
public void withFallback() throws Exception {
Set<MediaType> onlyOct = Collections.singleton(MediaType.OCTET_STREAM);
@@ -97,8 +88,8 @@
EmptyParser pNothing = new EmptyParser();
// Create a combination which will fail first
- @SuppressWarnings("deprecation") Parser p =
- ParserDecorator.withFallbacks(Arrays.asList(pFail, pWork), octAndText);
+ @SuppressWarnings("deprecation")
+ Parser p = ParserDecorator.withFallbacks(Arrays.asList(pFail, pWork), octAndText);
// Will claim to support the types given, not those on the child parsers
Set<MediaType> types = p.getSupportedTypes(context);
@@ -109,15 +100,14 @@
// Parsing will make it to the second one
metadata = new Metadata();
handler = new BodyContentHandler();
- p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
+ p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
-
// With a parser that will work with no output, will get nothing
p = ParserDecorator.withFallbacks(Arrays.asList(pNothing, pWork), octAndText);
metadata = new Metadata();
handler = new BodyContentHandler();
- p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
+ p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context);
assertEquals("", handler.toString());
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java
index 13c5ead..c65a202 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/RegexCaptureParserTest.java
@@ -22,31 +22,29 @@
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
-
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
-
public class RegexCaptureParserTest {
@Test
public void testBasic() throws Exception {
Metadata m = new Metadata();
ContentHandler contentHandler = new DefaultHandler();
- String output = "Something\n" +
- "Title: the quick brown fox\n" +
- "Author: jumped over\n" +
- "Created: 10/20/2024";
+ String output =
+ "Something\n"
+ + "Title: the quick brown fox\n"
+ + "Author: jumped over\n"
+ + "Created: 10/20/2024";
RegexCaptureParser parser = new RegexCaptureParser();
Map<String, String> regexes = new HashMap<>();
regexes.put("title", "^Title: ([^\r\n]+)");
parser.setCaptureMap(regexes);
- try (InputStream stream =
- TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) {
+ try (InputStream stream = TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) {
parser.parse(stream, contentHandler, m, new ParseContext());
}
assertEquals("the quick brown fox", m.get("title"));
diff --git a/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
index d4c3899..eb74cd3 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java
@@ -22,11 +22,6 @@
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
-
-import org.junit.jupiter.api.Test;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.helpers.DefaultHandler;
-
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.io.TikaInputStream;
@@ -36,14 +31,17 @@
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RegexCaptureParser;
+import org.junit.jupiter.api.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.helpers.DefaultHandler;
public class ExternalParserTest extends TikaTest {
@Test
public void testConfigRegexCaptureParser() throws Exception {
- assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{
- "file", "--version"
- }));
+ assumeTrue(
+ org.apache.tika.parser.external.ExternalParser.check(
+ new String[] {"file", "--version"}));
try (InputStream is = TikaConfig.class.getResourceAsStream("TIKA-3557.xml")) {
TikaConfig config = new TikaConfig(is);
@@ -56,12 +54,13 @@
Metadata m = new Metadata();
ContentHandler contentHandler = new DefaultHandler();
- String output = "Something\n" +
- "Title: the quick brown fox\n" +
- "Author: jumped over\n" +
- "Created: 10/20/2024";
+ String output =
+ "Something\n"
+ + "Title: the quick brown fox\n"
+ + "Author: jumped over\n"
+ + "Created: 10/20/2024";
try (InputStream stream =
- TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) {
+ TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) {
outputParser.parse(stream, contentHandler, m, new ParseContext());
}
assertEquals("the quick brown fox", m.get("title"));
@@ -70,8 +69,11 @@
@Test
public void testConfigBasic() throws Exception {
- assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{"file", "--version"}));
- try (InputStream is = TikaConfig.class.getResourceAsStream("TIKA-3557-no-output-parser.xml")) {
+ assumeTrue(
+ org.apache.tika.parser.external.ExternalParser.check(
+ new String[] {"file", "--version"}));
+ try (InputStream is =
+ TikaConfig.class.getResourceAsStream("TIKA-3557-no-output-parser.xml")) {
TikaConfig config = new TikaConfig(is);
CompositeParser p = (CompositeParser) config.getParser();
assertEquals(1, p.getAllComponentParsers().size());
@@ -84,13 +86,14 @@
@Test
public void testExifTool() throws Exception {
- assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{"exiftool",
- "-ver"}));
+ assumeTrue(
+ org.apache.tika.parser.external.ExternalParser.check(
+ new String[] {"exiftool", "-ver"}));
try (InputStream is =
- TikaConfig.class.getResourceAsStream("TIKA-3557-exiftool-example.xml")) {
+ TikaConfig.class.getResourceAsStream("TIKA-3557-exiftool-example.xml")) {
TikaConfig config = new TikaConfig(is);
Parser p = new AutoDetectParser(config);
- //this was the smallest pdf we had
+ // this was the smallest pdf we had
List<Metadata> metadataList = getRecursiveMetadata("testOverlappingText.pdf", p);
assertEquals(1, metadataList.size());
Metadata m = metadataList.get(0);
diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
index de464bc..22011c0 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParser.java
@@ -16,9 +16,13 @@
*/
package org.apache.tika.parser.mock;
-
import static java.nio.charset.StandardCharsets.UTF_8;
+import com.martensigwart.fakeload.FakeLoad;
+import com.martensigwart.fakeload.FakeLoadBuilder;
+import com.martensigwart.fakeload.FakeLoadExecutor;
+import com.martensigwart.fakeload.FakeLoadExecutors;
+import com.martensigwart.fakeload.MemoryUnit;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -39,20 +43,7 @@
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import javax.xml.parsers.DocumentBuilder;
-
-import com.martensigwart.fakeload.FakeLoad;
-import com.martensigwart.fakeload.FakeLoadBuilder;
-import com.martensigwart.fakeload.FakeLoadExecutor;
-import com.martensigwart.fakeload.FakeLoadExecutors;
-import com.martensigwart.fakeload.MemoryUnit;
import org.apache.commons.io.input.CloseShieldInputStream;
-import org.w3c.dom.Document;
-import org.w3c.dom.NamedNodeMap;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
@@ -64,22 +55,25 @@
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.w3c.dom.Document;
+import org.w3c.dom.NamedNodeMap;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
/**
- * This class enables mocking of parser behavior for use in testing
- * wrappers and drivers of parsers.
- * <p>
- * See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation
- * of all the options for this MockParser.
- * <p>
- * Tests for this class are in tika-parsers.
- * <p>
- * See also {@link org.apache.tika.parser.DummyParser} for another option.
+ * This class enables mocking of parser behavior for use in testing wrappers and drivers of parsers.
+ *
+ * <p>See resources/test-documents/mock/example.xml in tika-parsers/test for the documentation of
+ * all the options for this MockParser.
+ *
+ * <p>Tests for this class are in tika-parsers.
+ *
+ * <p>See also {@link org.apache.tika.parser.DummyParser} for another option.
*/
-
public class MockParser implements Parser {
-
private static final long serialVersionUID = 1L;
private static final PrintStream ORIG_STDERR;
private static final PrintStream ORIG_STDOUT;
@@ -113,8 +107,9 @@
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
if (Thread.currentThread().isInterrupted()) {
throw new TikaException("interrupted", new InterruptedException());
}
@@ -123,7 +118,7 @@
DocumentBuilder docBuilder = context.getDocumentBuilder();
doc = docBuilder.parse(new CloseShieldInputStream(stream));
} catch (SAXException e) {
- //to distinguish between SAX on read vs SAX while writing
+ // to distinguish between SAX on read vs SAX while writing
throw new IOException(e);
}
Node root = doc.getDocumentElement();
@@ -136,8 +131,8 @@
xhtml.endDocument();
}
- private void executeAction(Node action, Metadata metadata, ParseContext context,
- XHTMLContentHandler xhtml)
+ private void executeAction(
+ Node action, Metadata metadata, ParseContext context, XHTMLContentHandler xhtml)
throws SAXException, IOException, TikaException {
if (action.getNodeType() != 1) {
@@ -182,21 +177,23 @@
}
private void fakeload(Node action) {
- //https://github.com/msigwart/fakeload
- //with this version of fakeload, you should only need one thread to hit
- //the cpu targets; on Linux with Java 8 at least, two or more threads did
- //not increase the overall CPU over a single thread
+ // https://github.com/msigwart/fakeload
+ // with this version of fakeload, you should only need one thread to hit
+ // the cpu targets; on Linux with Java 8 at least, two or more threads did
+ // not increase the overall CPU over a single thread
int numThreads = 1;
NamedNodeMap attrs = action.getAttributes();
if (attrs == null) {
- throw new IllegalArgumentException("Must specify details...no attributes for " +
- "fakeload?!");
+ throw new IllegalArgumentException(
+ "Must specify details...no attributes for " + "fakeload?!");
}
- if (attrs.getNamedItem("millis") == null || attrs.getNamedItem("cpu") == null ||
- attrs.getNamedItem("mb") == null) {
- throw new IllegalArgumentException("must specify 'millis' (time to process), " +
- "'cpu' (% cpu as an integer, e.g. 50% would be '50'), " +
- "and 'mb' (megabytes as an integer)");
+ if (attrs.getNamedItem("millis") == null
+ || attrs.getNamedItem("cpu") == null
+ || attrs.getNamedItem("mb") == null) {
+ throw new IllegalArgumentException(
+ "must specify 'millis' (time to process), "
+ + "'cpu' (% cpu as an integer, e.g. 50% would be '50'), "
+ + "and 'mb' (megabytes as an integer)");
}
Node n = attrs.getNamedItem("numThreads");
if (n != null) {
@@ -211,13 +208,18 @@
new ExecutorCompletionService<>(executorService);
for (int i = 0; i < numThreads; i++) {
- executorCompletionService.submit(() -> {
- FakeLoad fakeload =
- new FakeLoadBuilder().lasting(millis, TimeUnit.MILLISECONDS)
- .withCpu(cpu).withMemory(mb, MemoryUnit.MB).build();
- FakeLoadExecutor executor = FakeLoadExecutors.newDefaultExecutor();
- executor.execute(fakeload);
- }, 1);
+ executorCompletionService.submit(
+ () -> {
+ FakeLoad fakeload =
+ new FakeLoadBuilder()
+ .lasting(millis, TimeUnit.MILLISECONDS)
+ .withCpu(cpu)
+ .withMemory(mb, MemoryUnit.MB)
+ .build();
+ FakeLoadExecutor executor = FakeLoadExecutors.newDefaultExecutor();
+ executor.execute(fakeload);
+ },
+ 1);
int finished = 0;
try {
@@ -233,9 +235,7 @@
} finally {
executorService.shutdownNow();
}
-
}
-
}
private void throwIllegalChars() throws IOException {
@@ -259,7 +259,8 @@
}
String embeddedText = action.getTextContent();
- EmbeddedDocumentExtractor extractor = EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
+ EmbeddedDocumentExtractor extractor =
+ EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
Metadata m = new Metadata();
m.set(TikaCoreProperties.RESOURCE_NAME_KEY, fileName);
@@ -363,7 +364,7 @@
private void metadata(Node action, Metadata metadata) {
NamedNodeMap attrs = action.getAttributes();
- //throws npe unless there is a name
+ // throws npe unless there is a name
String name = attrs.getNamedItem("name").getNodeValue();
String value = action.getTextContent();
Node actionType = attrs.getNamedItem("action");
@@ -398,7 +399,6 @@
}
}
-
private void throwIt(String className, String msg)
throws IOException, SAXException, TikaException {
Throwable t = null;
@@ -428,7 +428,7 @@
} else if (t instanceof RuntimeException) {
throw (RuntimeException) t;
} else {
- //wrap the throwable in a RuntimeException
+ // wrap the throwable in a RuntimeException
throw new RuntimeException(t);
}
}
@@ -443,11 +443,11 @@
}
private void hangHeavy(long maxMillis, long pulseCheckMillis, boolean interruptible) {
- //do some heavy computation and occasionally check for
- //whether time has exceeded maxMillis (see TIKA-1132 for inspiration)
- //or whether the thread was interrupted.
- //By creating a new Date in the inner loop, we're also intentionally
- //triggering the gc most likely.
+ // do some heavy computation and occasionally check for
+ // whether time has exceeded maxMillis (see TIKA-1132 for inspiration)
+ // or whether the thread was interrupted.
+ // By creating a new Date in the inner loop, we're also intentionally
+ // triggering the gc most likely.
long start = new Date().getTime();
long lastChecked = start;
while (true) {
@@ -489,5 +489,4 @@
}
}
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
index c771694..28ea72d 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserFactory.java
@@ -18,12 +18,10 @@
import java.io.IOException;
import java.util.Map;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserFactory;
-
public class MockParserFactory extends ParserFactory {
public MockParserFactory(Map<String, String> args) {
diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
index 1902b08..661b7da 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/mock/MockParserTest.java
@@ -17,25 +17,22 @@
package org.apache.tika.parser.mock;
import java.util.List;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
+import org.junit.jupiter.api.Test;
public class MockParserTest extends TikaTest {
@Test
public void testFakeload() throws Exception {
- //just make sure there aren't any exceptions
+ // just make sure there aren't any exceptions
getRecursiveMetadata("mock_fakeload.xml");
}
@Test
public void testTimes() throws Exception {
List<Metadata> metadataList = getRecursiveMetadata("mock_times.xml");
- assertContainsCount("hello",
- metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT), 30);
+ assertContainsCount("hello", metadataList.get(0).get(TikaCoreProperties.TIKA_CONTENT), 30);
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java b/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java
index 61a0473..ca25b53 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/mock/VowelParser.java
@@ -17,26 +17,20 @@
package org.apache.tika.parser.mock;
-
import java.util.regex.Matcher;
import java.util.regex.Pattern;
-
+import org.apache.tika.config.Field;
+import org.apache.tika.sax.XHTMLContentHandler;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
-import org.apache.tika.config.Field;
-import org.apache.tika.sax.XHTMLContentHandler;
-
-/**
- * only parses vowels as specified in "vowel" field.
- */
+/** only parses vowels as specified in "vowel" field. */
public class VowelParser extends MockParser {
private static final long serialVersionUID = 1L;
- @Field
- private String vowel = "aeiou";
+ @Field private String vowel = "aeiou";
protected void write(Node action, XHTMLContentHandler xhtml) throws SAXException {
NamedNodeMap attrs = action.getAttributes();
@@ -55,5 +49,4 @@
xhtml.characters(sb.toString());
xhtml.endElement(elementType);
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
index 9462d0c..62ea492 100644
--- a/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/parser/multiple/MultipleParserTest.java
@@ -27,9 +27,6 @@
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -41,11 +38,12 @@
import org.apache.tika.parser.multiple.AbstractMultipleParser.MetadataPolicy;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.utils.ParserUtils;
+import org.junit.jupiter.api.Test;
public class MultipleParserTest {
/**
- * Tests how {@link AbstractMultipleParser} works out which
- * mime types to offer, based on the types of the parsers
+ * Tests how {@link AbstractMultipleParser} works out which mime types to offer, based on the
+ * types of the parsers
*/
@Test
public void testMimeTypeSupported() {
@@ -57,9 +55,7 @@
// TODO One with a subtype
}
- /**
- * Test {@link FallbackParser}
- */
+ /** Test {@link FallbackParser} */
@Test
public void testFallback() throws Exception {
ParseContext context = new ParseContext();
@@ -73,30 +69,27 @@
// Some parsers
ErrorParser pFail = new ErrorParser();
- DummyParser pContent =
- new DummyParser(onlyOct, new HashMap<>(), "Fell back!");
+ DummyParser pContent = new DummyParser(onlyOct, new HashMap<>(), "Fell back!");
EmptyParser pNothing = new EmptyParser();
-
// With only one parser defined, works as normal
p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pContent);
metadata = new Metadata();
handler = new BodyContentHandler();
- p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
+ p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
assertEquals(1, usedParsers.length);
assertEquals(DummyParser.class.getName(), usedParsers[0]);
-
// With a failing parser, will go to the working one
p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent);
metadata = new Metadata();
handler = new BodyContentHandler();
- p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
+ p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
@@ -109,13 +102,12 @@
assertNotNull(metadata.get(ParserUtils.EMBEDDED_PARSER));
assertEquals(ErrorParser.class.getName(), metadata.get(ParserUtils.EMBEDDED_PARSER));
-
// Won't go past a working parser to a second one, stops after one works
p = new FallbackParser(null, MetadataPolicy.DISCARD_ALL, pFail, pContent, pNothing);
metadata = new Metadata();
handler = new BodyContentHandler();
- p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
+ p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context);
assertEquals("Fell back!", handler.toString());
usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
@@ -123,13 +115,10 @@
assertEquals(ErrorParser.class.getName(), usedParsers[0]);
assertEquals(DummyParser.class.getName(), usedParsers[1]);
-
// TODO Check merge policies - First vs Discard
}
- /**
- * Test for {@link SupplementingParser}
- */
+ /** Test for {@link SupplementingParser} */
@Test
public void testSupplemental() throws Exception {
ParseContext context = new ParseContext();
@@ -155,22 +144,20 @@
DummyParser pContent2 = new DummyParser(onlyOct, m2, "Fell back 2!");
EmptyParser pNothing = new EmptyParser();
-
// Supplemental doesn't support DISCARD
try {
new SupplementingParser(null, MetadataPolicy.DISCARD_ALL);
fail("Discard shouldn't be supported");
} catch (IllegalArgumentException e) {
- //swallow
+ // swallow
}
-
// With only one parser defined, works as normal
p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pContent1);
metadata = new Metadata();
handler = new BodyContentHandler();
- p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
+ p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context);
assertEquals("Fell back 1!", handler.toString());
assertEquals("Test1", metadata.get("T1"));
@@ -180,15 +167,15 @@
assertEquals(1, usedParsers.length);
assertEquals(DummyParser.class.getName(), usedParsers[0]);
-
// Check the First, Last and All policies:
// First Wins
- p = new SupplementingParser(null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2,
- pNothing);
+ p =
+ new SupplementingParser(
+ null, MetadataPolicy.FIRST_WINS, pFail, pContent1, pContent2, pNothing);
metadata = new Metadata();
handler = new BodyContentHandler();
- p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
+ p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context);
assertEquals("Fell back 1!Fell back 2!", handler.toString());
assertEquals("Test1", metadata.get("T1"));
@@ -201,14 +188,14 @@
assertEquals(DummyParser.class.getName(), usedParsers[1]);
assertEquals(EmptyParser.class.getName(), usedParsers[2]);
-
// Last Wins
- p = new SupplementingParser(null, MetadataPolicy.LAST_WINS, pFail, pContent1, pContent2,
- pNothing);
+ p =
+ new SupplementingParser(
+ null, MetadataPolicy.LAST_WINS, pFail, pContent1, pContent2, pNothing);
metadata = new Metadata();
handler = new BodyContentHandler();
- p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
+ p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context);
assertEquals("Fell back 1!Fell back 2!", handler.toString());
assertEquals("Test1", metadata.get("T1"));
@@ -221,14 +208,14 @@
assertEquals(DummyParser.class.getName(), usedParsers[1]);
assertEquals(EmptyParser.class.getName(), usedParsers[2]);
-
// Merge
- p = new SupplementingParser(null, MetadataPolicy.KEEP_ALL, pFail, pContent1, pContent2,
- pNothing);
+ p =
+ new SupplementingParser(
+ null, MetadataPolicy.KEEP_ALL, pFail, pContent1, pContent2, pNothing);
metadata = new Metadata();
handler = new BodyContentHandler();
- p.parse(new ByteArrayInputStream(new byte[]{0, 1, 2, 3, 4}), handler, metadata, context);
+ p.parse(new ByteArrayInputStream(new byte[] {0, 1, 2, 3, 4}), handler, metadata, context);
assertEquals("Fell back 1!Fell back 2!", handler.toString());
assertEquals("Test1", metadata.get("T1"));
@@ -243,11 +230,9 @@
assertEquals(DummyParser.class.getName(), usedParsers[1]);
assertEquals(EmptyParser.class.getName(), usedParsers[2]);
-
// Check the error details always come through, no matter the policy
// TODO
-
// Check that each parser gets its own ContentHandler if a factory was given
// TODO
}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
index 4aca520..efa1ac4 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/PipesServerTest.java
@@ -22,14 +22,10 @@
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
-
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream;
import org.apache.commons.io.output.UnsynchronizedByteArrayOutputStream;
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
-
import org.apache.tika.TikaTest;
import org.apache.tika.extractor.BasicEmbeddedDocumentBytesHandler;
import org.apache.tika.metadata.Metadata;
@@ -38,12 +34,14 @@
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.pipes.fetcher.Fetcher;
import org.apache.tika.pipes.fetcher.FetcherManager;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
public class PipesServerTest extends TikaTest {
/**
- * This test is useful for stepping through the debugger on PipesServer
- * without having to attach the debugger to the forked process.
+ * This test is useful for stepping through the debugger on PipesServer without having to attach
+ * the debugger to the forked process.
*
* @param tmp
* @throws Exception
@@ -51,29 +49,38 @@
@Test
public void testBasic(@TempDir Path tmp) throws Exception {
Path tikaConfig = tmp.resolve("tika-config.xml");
- String xml = IOUtils.toString(
- PipesServerTest.class.getResourceAsStream("TIKA-3941.xml"), StandardCharsets.UTF_8);
+ String xml =
+ IOUtils.toString(
+ PipesServerTest.class.getResourceAsStream("TIKA-3941.xml"),
+ StandardCharsets.UTF_8);
xml = xml.replace("BASE_PATH", tmp.toAbsolutePath().toString());
Files.write(tikaConfig, xml.getBytes(StandardCharsets.UTF_8));
- Files.copy(PipesServerTest.class.getResourceAsStream("/test-documents/mock_times.xml"),
+ Files.copy(
+ PipesServerTest.class.getResourceAsStream("/test-documents/mock_times.xml"),
tmp.resolve("mock.xml"));
- PipesServer pipesServer = new PipesServer(tikaConfig,
- new UnsynchronizedByteArrayInputStream(new byte[0]),
- new PrintStream(UnsynchronizedByteArrayOutputStream.builder().get(), true,
- StandardCharsets.UTF_8.name()),
- -1, 30000, 30000);
+ PipesServer pipesServer =
+ new PipesServer(
+ tikaConfig,
+ new UnsynchronizedByteArrayInputStream(new byte[0]),
+ new PrintStream(
+ UnsynchronizedByteArrayOutputStream.builder().get(),
+ true,
+ StandardCharsets.UTF_8.name()),
+ -1,
+ 30000,
+ 30000);
pipesServer.initializeResources();
- FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id",
- new FetchKey("fs", "mock.xml"),
- new EmitKey("", ""));
+ FetchEmitTuple fetchEmitTuple =
+ new FetchEmitTuple("id", new FetchKey("fs", "mock.xml"), new EmitKey("", ""));
Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher();
- PipesServer.MetadataListAndEmbeddedBytes
- parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
- assertEquals("5f3b924303e960ce35d7f705e91d3018dd110a9c3cef0546a91fe013d6dad6fd",
+ PipesServer.MetadataListAndEmbeddedBytes parseData =
+ pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
+ assertEquals(
+ "5f3b924303e960ce35d7f705e91d3018dd110a9c3cef0546a91fe013d6dad6fd",
parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256"));
}
@@ -85,51 +92,65 @@
Files.createDirectories(tmp);
Path tikaConfig = tmp.resolve("tika-config.xml");
- String xml = IOUtils.toString(
- PipesServerTest.class.getResourceAsStream("TIKA-4207.xml"),
- StandardCharsets.UTF_8);
+ String xml =
+ IOUtils.toString(
+ PipesServerTest.class.getResourceAsStream("TIKA-4207.xml"),
+ StandardCharsets.UTF_8);
xml = xml.replace("BASE_PATH", tmp.toAbsolutePath().toString());
Files.write(tikaConfig, xml.getBytes(StandardCharsets.UTF_8));
- Files.copy(PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"),
+ Files.copy(
+ PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"),
tmp.resolve("mock.xml"));
- PipesServer pipesServer = new PipesServer(tikaConfig,
- new UnsynchronizedByteArrayInputStream(new byte[0]),
- new PrintStream(UnsynchronizedByteArrayOutputStream.builder().get(), true,
- StandardCharsets.UTF_8.name()),
- -1, 30000, 30000);
+ PipesServer pipesServer =
+ new PipesServer(
+ tikaConfig,
+ new UnsynchronizedByteArrayInputStream(new byte[0]),
+ new PrintStream(
+ UnsynchronizedByteArrayOutputStream.builder().get(),
+ true,
+ StandardCharsets.UTF_8.name()),
+ -1,
+ 30000,
+ 30000);
pipesServer.initializeResources();
EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig =
new EmbeddedDocumentBytesConfig(true);
embeddedDocumentBytesConfig.setIncludeOriginal(true);
- FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id",
- new FetchKey("fs", "mock.xml"),
- new EmitKey("", ""), new Metadata(),
- HandlerConfig.DEFAULT_HANDLER_CONFIG, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT,
- embeddedDocumentBytesConfig);
+ FetchEmitTuple fetchEmitTuple =
+ new FetchEmitTuple(
+ "id",
+ new FetchKey("fs", "mock.xml"),
+ new EmitKey("", ""),
+ new Metadata(),
+ HandlerConfig.DEFAULT_HANDLER_CONFIG,
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT,
+ embeddedDocumentBytesConfig);
Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher();
- PipesServer.MetadataListAndEmbeddedBytes
- parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
+ PipesServer.MetadataListAndEmbeddedBytes parseData =
+ pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
assertEquals(2, parseData.metadataList.size());
byte[] bytes0 =
IOUtils.toByteArray(
- ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
- .getDocument(0));
+ ((BasicEmbeddedDocumentBytesHandler)
+ parseData.getEmbeddedDocumentBytesHandler())
+ .getDocument(0));
byte[] bytes1 =
IOUtils.toByteArray(
- ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+ ((BasicEmbeddedDocumentBytesHandler)
+ parseData.getEmbeddedDocumentBytesHandler())
.getDocument(1));
- assertContains("is to trigger mock on the embedded",
- new String(bytes0, StandardCharsets.UTF_8));
+ assertContains(
+ "is to trigger mock on the embedded", new String(bytes0, StandardCharsets.UTF_8));
- assertContains("embeddedAuthor</metadata>",
- new String(bytes1, StandardCharsets.UTF_8));
- assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a",
+ assertContains("embeddedAuthor</metadata>", new String(bytes1, StandardCharsets.UTF_8));
+ assertEquals(
+ "fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a",
parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256"));
}
@@ -141,50 +162,65 @@
Files.createDirectories(tmp);
Path tikaConfig = tmp.resolve("tika-config.xml");
- String xml = IOUtils.toString(
- PipesServerTest.class.getResourceAsStream("TIKA-4207-limit-bytes.xml"),
- StandardCharsets.UTF_8);
+ String xml =
+ IOUtils.toString(
+ PipesServerTest.class.getResourceAsStream("TIKA-4207-limit-bytes.xml"),
+ StandardCharsets.UTF_8);
xml = xml.replace("BASE_PATH", tmp.toAbsolutePath().toString());
Files.write(tikaConfig, xml.getBytes(StandardCharsets.UTF_8));
- Files.copy(PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"),
+ Files.copy(
+ PipesServerTest.class.getResourceAsStream("/test-documents/basic_embedded.xml"),
tmp.resolve("mock.xml"));
- PipesServer pipesServer = new PipesServer(tikaConfig,
- new UnsynchronizedByteArrayInputStream(new byte[0]),
- new PrintStream(UnsynchronizedByteArrayOutputStream.builder().get(), true,
- StandardCharsets.UTF_8.name()),
- -1, 30000, 30000);
+ PipesServer pipesServer =
+ new PipesServer(
+ tikaConfig,
+ new UnsynchronizedByteArrayInputStream(new byte[0]),
+ new PrintStream(
+ UnsynchronizedByteArrayOutputStream.builder().get(),
+ true,
+ StandardCharsets.UTF_8.name()),
+ -1,
+ 30000,
+ 30000);
pipesServer.initializeResources();
EmbeddedDocumentBytesConfig embeddedDocumentBytesConfig =
new EmbeddedDocumentBytesConfig(true);
embeddedDocumentBytesConfig.setIncludeOriginal(true);
- FetchEmitTuple fetchEmitTuple = new FetchEmitTuple("id",
- new FetchKey("fs", "mock.xml"),
- new EmitKey("", ""), new Metadata(),
- HandlerConfig.DEFAULT_HANDLER_CONFIG, FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT,
- embeddedDocumentBytesConfig);
+ FetchEmitTuple fetchEmitTuple =
+ new FetchEmitTuple(
+ "id",
+ new FetchKey("fs", "mock.xml"),
+ new EmitKey("", ""),
+ new Metadata(),
+ HandlerConfig.DEFAULT_HANDLER_CONFIG,
+ FetchEmitTuple.ON_PARSE_EXCEPTION.EMIT,
+ embeddedDocumentBytesConfig);
Fetcher fetcher = FetcherManager.load(tikaConfig).getFetcher();
- PipesServer.MetadataListAndEmbeddedBytes
- parseData = pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
+ PipesServer.MetadataListAndEmbeddedBytes parseData =
+ pipesServer.parseFromTuple(fetchEmitTuple, fetcher);
assertEquals(2, parseData.metadataList.size());
byte[] bytes0 =
IOUtils.toByteArray(
- ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+ ((BasicEmbeddedDocumentBytesHandler)
+ parseData.getEmbeddedDocumentBytesHandler())
.getDocument(0));
byte[] bytes1 =
IOUtils.toByteArray(
- ((BasicEmbeddedDocumentBytesHandler)parseData.getEmbeddedDocumentBytesHandler())
+ ((BasicEmbeddedDocumentBytesHandler)
+ parseData.getEmbeddedDocumentBytesHandler())
.getDocument(1));
- assertContains("is to trigger mock on the embedded",
- new String(bytes0, StandardCharsets.UTF_8));
+ assertContains(
+ "is to trigger mock on the embedded", new String(bytes0, StandardCharsets.UTF_8));
assertEquals(10, bytes1.length);
- assertEquals("fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a",
+ assertEquals(
+ "fdaa937c96d1ed010b8d307ccddf9d11c3b48db732a8771eaafe99d59e076d0a",
parseData.metadataList.get(0).get("X-TIKA:digest:SHA-256"));
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java b/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java
index 4522a2e..af67e78 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/async/AsyncChaosMonkeyTest.java
@@ -26,10 +26,6 @@
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
-
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.io.TempDir;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.pipes.FetchEmitTuple;
@@ -39,40 +35,49 @@
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.pipes.pipesiterator.PipesIterator;
import org.apache.tika.utils.ProcessUtils;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
public class AsyncChaosMonkeyTest {
- private final String OOM = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<mock>" +
- "<throw class=\"java.lang.OutOfMemoryError\">oom message</throw>\n</mock>";
- private final String OK = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<mock>" +
- "<metadata action=\"add\" name=\"dc:creator\">Nikolai Lobachevsky</metadata>" +
- "<write element=\"p\">main_content</write>" +
- "</mock>";
+ private final String OOM =
+ "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ + "<mock>"
+ + "<throw class=\"java.lang.OutOfMemoryError\">oom message</throw>\n</mock>";
+ private final String OK =
+ "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ + "<mock>"
+ + "<metadata action=\"add\" name=\"dc:creator\">Nikolai Lobachevsky</metadata>"
+ + "<write element=\"p\">main_content</write>"
+ + "</mock>";
- private final String TIMEOUT = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<mock>" +
- "<metadata action=\"add\" name=\"dc:creator\">Nikolai Lobachevsky</metadata>" +
- "<write element=\"p\">main_content</write>" +
- "<fakeload millis=\"60000\" cpu=\"1\" mb=\"10\"/>" + "</mock>";
+ private final String TIMEOUT =
+ "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ + "<mock>"
+ + "<metadata action=\"add\" name=\"dc:creator\">Nikolai Lobachevsky</metadata>"
+ + "<write element=\"p\">main_content</write>"
+ + "<fakeload millis=\"60000\" cpu=\"1\" mb=\"10\"/>"
+ + "</mock>";
- private final String SYSTEM_EXIT = "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<mock>" +
- "<metadata action=\"add\" name=\"dc:creator\">Nikolai Lobachevsky</metadata>" +
- "<write element=\"p\">main_content</write>" +
- "<system_exit/>" + "</mock>";
+ private final String SYSTEM_EXIT =
+ "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ + "<mock>"
+ + "<metadata action=\"add\" name=\"dc:creator\">Nikolai Lobachevsky</metadata>"
+ + "<write element=\"p\">main_content</write>"
+ + "<system_exit/>"
+ + "</mock>";
private final int totalFiles = 100;
- @TempDir
- private Path inputDir;
+ @TempDir private Path inputDir;
- @TempDir
- private Path configDir;
+ @TempDir private Path configDir;
private int ok = 0;
private int oom = 0;
private int timeouts = 0;
private int crash = 0;
-
public Path setUp(boolean emitIntermediateResults) throws SQLException, IOException {
ok = 0;
oom = 0;
@@ -80,28 +85,36 @@
crash = 0;
Path tikaConfigPath = Files.createTempFile(configDir, "tika-config-", ".xml");
String xml =
- "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<properties>" + " <emitters>" +
- " <emitter class=\"org.apache.tika.pipes.async.MockEmitter\">\n" +
- " <name>mock</name>\n" + " </emitter>" +
- " </emitters>" + " <fetchers>" +
- " <fetcher class=\"org.apache.tika.pipes.fetcher.fs.FileSystemFetcher\">" +
- " <name>mock</name>\n" + " <basePath>" +
- ProcessUtils.escapeCommandLine(inputDir.toAbsolutePath().toString()) +
- "</basePath>\n" + " </fetcher>" + " </fetchers>" +
- " <autoDetectParserConfig>\n" +
- " <digesterFactory\n" +
- " class=\"org.apache.tika.pipes.async.MockDigesterFactory\"/>\n" +
- "</autoDetectParserConfig>" +
- "<async><pipesReporter class=\"org.apache.tika.pipes.async.MockReporter\"/>" +
- "<emitIntermediateResults>" + emitIntermediateResults +
- "</emitIntermediateResults>" +
- "<tikaConfig>" +
- ProcessUtils.escapeCommandLine(tikaConfigPath.toAbsolutePath().toString()) +
- "</tikaConfig><forkedJvmArgs><arg>-Xmx512m</arg" +
- "></forkedJvmArgs><maxForEmitBatchBytes>1000000</maxForEmitBatchBytes>" +
- "<timeoutMillis>5000</timeoutMillis>" +
- "<numClients>4</numClients></async>" +
- "</properties>";
+ "<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ + "<properties>"
+ + " <emitters>"
+ + " <emitter class=\"org.apache.tika.pipes.async.MockEmitter\">\n"
+ + " <name>mock</name>\n"
+ + " </emitter>"
+ + " </emitters>"
+ + " <fetchers>"
+ + " <fetcher class=\"org.apache.tika.pipes.fetcher.fs.FileSystemFetcher\">"
+ + " <name>mock</name>\n"
+ + " <basePath>"
+ + ProcessUtils.escapeCommandLine(inputDir.toAbsolutePath().toString())
+ + "</basePath>\n"
+ + " </fetcher>"
+ + " </fetchers>"
+ + " <autoDetectParserConfig>\n"
+ + " <digesterFactory\n"
+ + " class=\"org.apache.tika.pipes.async.MockDigesterFactory\"/>\n"
+ + "</autoDetectParserConfig>"
+ + "<async><pipesReporter class=\"org.apache.tika.pipes.async.MockReporter\"/>"
+ + "<emitIntermediateResults>"
+ + emitIntermediateResults
+ + "</emitIntermediateResults>"
+ + "<tikaConfig>"
+ + ProcessUtils.escapeCommandLine(tikaConfigPath.toAbsolutePath().toString())
+ + "</tikaConfig><forkedJvmArgs><arg>-Xmx512m</arg"
+ + "></forkedJvmArgs><maxForEmitBatchBytes>1000000</maxForEmitBatchBytes>"
+ + "<timeoutMillis>5000</timeoutMillis>"
+ + "<numClients>4</numClients></async>"
+ + "</properties>";
Files.write(tikaConfigPath, xml.getBytes(StandardCharsets.UTF_8));
Random r = new Random();
for (int i = 0; i < totalFiles; i++) {
@@ -110,7 +123,8 @@
Files.write(inputDir.resolve(i + ".xml"), OOM.getBytes(StandardCharsets.UTF_8));
oom++;
} else if (f < 0.10) {
- Files.write(inputDir.resolve(i + ".xml"), SYSTEM_EXIT.getBytes(StandardCharsets.UTF_8));
+ Files.write(
+ inputDir.resolve(i + ".xml"), SYSTEM_EXIT.getBytes(StandardCharsets.UTF_8));
crash++;
} else if (f < 0.13) {
Files.write(inputDir.resolve(i + ".xml"), TIMEOUT.getBytes(StandardCharsets.UTF_8));
@@ -125,32 +139,35 @@
return tikaConfigPath;
}
-/*
- private void writeLarge(Path resolve) throws IOException {
- try (BufferedWriter writer = Files.newBufferedWriter(resolve, StandardCharsets.UTF_8)) {
- writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
- writer.write("<mock>");
- for (int i = 0; i < 10000000; i++) {
- writer.write("<write element=\"p\">hello hello hello hello hello</write>");
+ /*
+ private void writeLarge(Path resolve) throws IOException {
+ try (BufferedWriter writer = Files.newBufferedWriter(resolve, StandardCharsets.UTF_8)) {
+ writer.write("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
+ writer.write("<mock>");
+ for (int i = 0; i < 10000000; i++) {
+ writer.write("<write element=\"p\">hello hello hello hello hello</write>");
+ }
+ writer.write("</mock>");
}
- writer.write("</mock>");
}
- }
-*/
+ */
@Test
public void testBasic() throws Exception {
AsyncProcessor processor = new AsyncProcessor(setUp(false));
for (int i = 0; i < totalFiles; i++) {
- FetchEmitTuple t = new FetchEmitTuple("myId-" + i,
- new FetchKey("mock", i + ".xml"),
- new EmitKey("mock", "emit-" + i), new Metadata());
+ FetchEmitTuple t =
+ new FetchEmitTuple(
+ "myId-" + i,
+ new FetchKey("mock", i + ".xml"),
+ new EmitKey("mock", "emit-" + i),
+ new Metadata());
processor.offer(t, 1000);
}
for (int i = 0; i < 10; i++) {
processor.offer(PipesIterator.COMPLETED_SEMAPHORE, 1000);
}
- //TODO clean this up
+ // TODO clean this up
while (processor.checkActive()) {
Thread.sleep(100);
}
@@ -162,7 +179,8 @@
assertEquals(ok, emitKeys.size());
assertEquals(100, MockReporter.RESULTS.size());
for (PipesResult r : MockReporter.RESULTS) {
- assertEquals("application/mock+xml",
+ assertEquals(
+ "application/mock+xml",
r.getEmitData().getMetadataList().get(0).get(Metadata.CONTENT_TYPE));
}
}
@@ -171,14 +189,18 @@
public void testEmitIntermediate() throws Exception {
AsyncProcessor processor = new AsyncProcessor(setUp(true));
for (int i = 0; i < totalFiles; i++) {
- FetchEmitTuple t = new FetchEmitTuple("myId-" + i, new FetchKey("mock", i + ".xml"),
- new EmitKey("mock", "emit-" + i), new Metadata());
+ FetchEmitTuple t =
+ new FetchEmitTuple(
+ "myId-" + i,
+ new FetchKey("mock", i + ".xml"),
+ new EmitKey("mock", "emit-" + i),
+ new Metadata());
processor.offer(t, 1000);
}
for (int i = 0; i < 10; i++) {
processor.offer(PipesIterator.COMPLETED_SEMAPHORE, 1000);
}
- //TODO clean this up
+ // TODO clean this up
while (processor.checkActive()) {
Thread.sleep(100);
}
@@ -187,10 +209,10 @@
int observedOOM = 0;
for (EmitData d : MockEmitter.EMIT_DATA) {
emitKeys.add(d.getEmitKey().getEmitKey());
- assertEquals(64,
- d.getMetadataList().get(0).get("X-TIKA:digest:SHA-256").trim().length());
- assertEquals("application/mock+xml",
- d.getMetadataList().get(0).get(Metadata.CONTENT_TYPE));
+ assertEquals(
+ 64, d.getMetadataList().get(0).get("X-TIKA:digest:SHA-256").trim().length());
+ assertEquals(
+ "application/mock+xml", d.getMetadataList().get(0).get(Metadata.CONTENT_TYPE));
String val = d.getMetadataList().get(0).get(TikaCoreProperties.PIPES_RESULT);
if ("OOM".equals(val)) {
observedOOM++;
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java b/tika-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java
index 2374c14..33f6a98 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/async/MockEmitter.java
@@ -21,7 +21,6 @@
import java.util.Collections;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
-
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.emitter.AbstractEmitter;
import org.apache.tika.pipes.emitter.EmitData;
@@ -32,8 +31,7 @@
static ArrayBlockingQueue<EmitData> EMIT_DATA = new ArrayBlockingQueue<>(10000);
- public MockEmitter() {
- }
+ public MockEmitter() {}
public static List<EmitData> getData() {
return new ArrayList<>(EMIT_DATA);
@@ -43,8 +41,8 @@
public void emit(String emitKey, List<Metadata> metadataList)
throws IOException, TikaEmitterException {
emit(
- Collections.singletonList(new EmitData(new EmitKey(getName(), emitKey),
- metadataList)));
+ Collections.singletonList(
+ new EmitData(new EmitKey(getName(), emitKey), metadataList)));
}
@Override
@@ -54,5 +52,4 @@
EMIT_DATA.offer(d);
}
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java b/tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java
index 10af275..d234ac5 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/async/MockFetcher.java
@@ -20,16 +20,19 @@
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
-
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.pipes.fetcher.Fetcher;
public class MockFetcher implements Fetcher {
- private static final byte[] BYTES = ("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>" + "<mock>" +
- "<metadata action=\"add\" name=\"dc:creator\">Nikolai Lobachevsky</metadata>" +
- "<write element=\"p\">main_content</write>" + "</mock>").getBytes(StandardCharsets.UTF_8);
+ private static final byte[] BYTES =
+ ("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"
+ + "<mock>"
+ + "<metadata action=\"add\" name=\"dc:creator\">Nikolai Lobachevsky</metadata>"
+ + "<write element=\"p\">main_content</write>"
+ + "</mock>")
+ .getBytes(StandardCharsets.UTF_8);
@Override
public String getName() {
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java b/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java
index 6e8308c..0c05648 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporter.java
@@ -17,7 +17,6 @@
package org.apache.tika.pipes.async;
import java.util.concurrent.ArrayBlockingQueue;
-
import org.apache.tika.config.Field;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.PipesReporter;
@@ -35,14 +34,10 @@
}
@Override
- public void error(Throwable t) {
-
- }
+ public void error(Throwable t) {}
@Override
- public void error(String msg) {
-
- }
+ public void error(String msg) {}
@Field
public void setEndpoint(String endpoint) {
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java b/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java
index 9bfcd55..cbd4c0a 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/async/MockReporterTest.java
@@ -22,11 +22,9 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.pipes.CompositePipesReporter;
import org.apache.tika.pipes.PipesReporter;
+import org.junit.jupiter.api.Test;
public class MockReporterTest {
@@ -36,7 +34,7 @@
AsyncConfig asyncConfig = AsyncConfig.load(configPath);
PipesReporter reporter = asyncConfig.getPipesReporter();
assertTrue(reporter instanceof MockReporter);
- assertEquals("somethingOrOther", ((MockReporter)reporter).getEndpoint());
+ assertEquals("somethingOrOther", ((MockReporter) reporter).getEndpoint());
}
@Test
@@ -45,8 +43,8 @@
AsyncConfig asyncConfig = AsyncConfig.load(configPath);
PipesReporter reporter = asyncConfig.getPipesReporter();
assertTrue(reporter instanceof CompositePipesReporter);
- List<PipesReporter> reporters = ((CompositePipesReporter)reporter).getPipesReporters();
- assertEquals("somethingOrOther1", ((MockReporter)reporters.get(0)).getEndpoint());
- assertEquals("somethingOrOther2", ((MockReporter)reporters.get(1)).getEndpoint());
+ List<PipesReporter> reporters = ((CompositePipesReporter) reporter).getPipesReporters();
+ assertEquals("somethingOrOther1", ((MockReporter) reporters.get(0)).getEndpoint());
+ assertEquals("somethingOrOther2", ((MockReporter) reporters.get(1)).getEndpoint());
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java b/tika-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java
index 036a959..f37f0ee 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/emitter/MockEmitter.java
@@ -19,7 +19,6 @@
import java.io.IOException;
import java.util.List;
import java.util.Map;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -29,13 +28,10 @@
public class MockEmitter extends AbstractEmitter implements Initializable {
- @Field
- private boolean throwOnCheck = false;
+ @Field private boolean throwOnCheck = false;
@Override
- public void initialize(Map<String, Param> params) throws TikaConfigException {
-
- }
+ public void initialize(Map<String, Param> params) throws TikaConfigException {}
public void setThrowOnCheck(boolean throwOnCheck) {
this.throwOnCheck = throwOnCheck;
@@ -48,12 +44,9 @@
if (throwOnCheck) {
throw new TikaConfigException("throw on check");
}
-
}
@Override
public void emit(String emitKey, List<Metadata> metadataList)
- throws IOException, TikaEmitterException {
-
- }
+ throws IOException, TikaEmitterException {}
}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java
index 0604327..78621dd 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/MockFetcher.java
@@ -21,7 +21,6 @@
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Map;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.InitializableProblemHandler;
@@ -34,12 +33,9 @@
private Map<String, Param> params;
- @Field
- private String byteString = null;
+ @Field private String byteString = null;
- @Field
- private boolean throwOnCheck = false;
-
+ @Field private boolean throwOnCheck = false;
public void setThrowOnCheck(boolean throwOnCheck) {
this.throwOnCheck = throwOnCheck;
@@ -62,10 +58,10 @@
}
}
-
@Override
public InputStream fetch(String fetchKey, Metadata metadata) throws TikaException, IOException {
- return byteString == null ? new ByteArrayInputStream(new byte[0]) :
- new ByteArrayInputStream(byteString.getBytes(StandardCharsets.UTF_8));
+ return byteString == null
+ ? new ByteArrayInputStream(new byte[0])
+ : new ByteArrayInputStream(byteString.getBytes(StandardCharsets.UTF_8));
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java
index 7e29ac2..f74fc54 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/fetcher/fs/FileSystemFetcherTest.java
@@ -23,11 +23,8 @@
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.nio.file.Paths;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.config.InitializableProblemHandler;
-
+import org.junit.jupiter.api.Test;
public class FileSystemFetcherTest {
@@ -48,10 +45,12 @@
@Test
public void testNullByte() throws Exception {
FileSystemFetcher f = new FileSystemFetcher();
- assertThrows(InvalidPathException.class, () -> {
- f.setBasePath("bad\u0000path");
- f.setName("fs");
- f.checkInitialization(InitializableProblemHandler.IGNORE);
- });
+ assertThrows(
+ InvalidPathException.class,
+ () -> {
+ f.setBasePath("bad\u0000path");
+ f.setName("fs");
+ f.checkInitialization(InitializableProblemHandler.IGNORE);
+ });
}
}
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java
index 7b37ec5..303f508 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/FileSystemPipesIteratorTest.java
@@ -28,13 +28,10 @@
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
-
-import org.junit.jupiter.api.Test;
-import org.junit.jupiter.api.Timeout;
-
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.pipesiterator.fs.FileSystemPipesIterator;
-
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.Timeout;
public class FileSystemPipesIteratorTest {
@@ -45,14 +42,12 @@
result = walk.filter(Files::isRegularFile).collect(Collectors.toList());
}
return result;
-
}
@Test
@Timeout(30000)
public void testBasic() throws Exception {
- URL url =
- FileSystemPipesIteratorTest.class.getResource("/test-documents");
+ URL url = FileSystemPipesIteratorTest.class.getResource("/test-documents");
Path root = Paths.get(url.toURI());
List<Path> files = listFiles(root);
Set<String> truthSet = new HashSet<>();
diff --git a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java
index fec827d..3c24b9d 100644
--- a/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java
+++ b/tika-core/src/test/java/org/apache/tika/pipes/pipesiterator/filelist/FileListPipesIteratorTest.java
@@ -23,11 +23,9 @@
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.config.InitializableProblemHandler;
import org.apache.tika.pipes.FetchEmitTuple;
+import org.junit.jupiter.api.Test;
public class FileListPipesIteratorTest {
diff --git a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
index 111d2ea..86ffcc5 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/BasicContentHandlerFactoryTest.java
@@ -26,7 +26,12 @@
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.util.Set;
-
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.WriteLimitReachedException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
import org.junit.jupiter.api.Test;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
@@ -34,22 +39,13 @@
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.exception.WriteLimitReachedException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-
-/**
- * Test cases for the {@link org.apache.tika.sax.BodyContentHandler} class.
- */
+/** Test cases for the {@link org.apache.tika.sax.BodyContentHandler} class. */
public class BasicContentHandlerFactoryTest {
- //default max char len (at least in WriteOutContentHandler is 100k)
+ // default max char len (at least in WriteOutContentHandler is 100k)
private static final int OVER_DEFAULT = 120000;
- //copied from TikaTest in tika-parsers package
+ // copied from TikaTest in tika-parsers package
public static void assertNotContains(String needle, String haystack) {
assertFalse(haystack.contains(needle), needle + " found in:\n" + haystack);
}
@@ -76,13 +72,14 @@
.getNewContentHandler();
assertTrue(handler instanceof DefaultHandler);
p.parse(null, handler, null, null);
- //unfortunatley, the DefaultHandler does not return "",
+ // unfortunatley, the DefaultHandler does not return "",
assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString());
- //tests that no write limit exception is thrown
+ // tests that no write limit exception is thrown
p = new MockParser(100);
- handler = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5)
- .getNewContentHandler();
+ handler =
+ new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, 5)
+ .getNewContentHandler();
assertTrue(handler instanceof DefaultHandler);
p.parse(null, handler, null, null);
assertContains("org.xml.sax.helpers.DefaultHandler", handler.toString());
@@ -102,7 +99,7 @@
assertNotContains("<body", extracted);
assertNotContains("<html", extracted);
assertTrue(extracted.length() > 110000);
- //now test write limit
+ // now test write limit
p = new MockParser(10);
handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
assertTrue(handler instanceof WriteOutContentHandler);
@@ -111,7 +108,7 @@
assertContains("This ", extracted);
assertNotContains("aaaa", extracted);
- //now test outputstream call
+ // now test outputstream call
p = new MockParser(OVER_DEFAULT);
ByteArrayOutputStream os = new ByteArrayOutputStream();
handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8);
@@ -128,8 +125,8 @@
handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8);
assertTrue(handler instanceof WriteOutContentHandler);
assertWriteLimitReached(p, (WriteOutContentHandler) handler);
- //When writing to an OutputStream and a write limit is reached,
- //currently, nothing is written.
+ // When writing to an OutputStream and a write limit is reached,
+ // currently, nothing is written.
assertEquals(0, os.toByteArray().length);
}
@@ -146,7 +143,7 @@
assertContains("aaaaaaaaaa", extracted);
assertTrue(extracted.length() > 110000);
- //now test write limit
+ // now test write limit
p = new MockParser(10);
handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
assertTrue(handler instanceof WriteOutContentHandler);
@@ -155,7 +152,7 @@
assertContains("This ", extracted);
assertNotContains("aaaa", extracted);
- //now test outputstream call
+ // now test outputstream call
p = new MockParser(OVER_DEFAULT);
ByteArrayOutputStream os = new ByteArrayOutputStream();
handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8);
@@ -167,7 +164,6 @@
assertContains("<html", os.toByteArray());
assertTrue(os.toByteArray().length > 110000);
-
p = new MockParser(10);
os = new ByteArrayOutputStream();
handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8);
@@ -189,7 +185,7 @@
assertContains("aaaaaaaaaa", extracted);
assertTrue(handler.toString().length() > 110000);
- //now test write limit
+ // now test write limit
p = new MockParser(10);
handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
assertTrue(handler instanceof WriteOutContentHandler);
@@ -198,7 +194,7 @@
assertContains("This ", extracted);
assertNotContains("aaaa", extracted);
- //now test outputstream call
+ // now test outputstream call
p = new MockParser(OVER_DEFAULT);
ByteArrayOutputStream os = new ByteArrayOutputStream();
handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8);
@@ -211,7 +207,6 @@
assertContains("<html", os.toByteArray());
assertTrue(os.toByteArray().length > 110000);
-
p = new MockParser(10);
os = new ByteArrayOutputStream();
handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler(os, UTF_8);
@@ -234,7 +229,7 @@
assertContains("aaaaaaaaaa", extracted);
assertTrue(extracted.length() > 110000);
- //now test write limit
+ // now test write limit
p = new MockParser(10);
handler = new BasicContentHandlerFactory(type, 5).getNewContentHandler();
assertTrue(handler instanceof BodyContentHandler);
@@ -243,7 +238,7 @@
assertNotContains("This ", extracted);
assertContains("aaaa", extracted);
- //now test outputstream call
+ // now test outputstream call
p = new MockParser(OVER_DEFAULT);
ByteArrayOutputStream os = new ByteArrayOutputStream();
handler = new BasicContentHandlerFactory(type, -1).getNewContentHandler(os, UTF_8);
@@ -277,13 +272,13 @@
assertTrue(wlr, "WriteLimitReached");
}
- //TODO: is there a better way than to repeat this with diff signature?
+ // TODO: is there a better way than to repeat this with diff signature?
private void assertWriteLimitReached(Parser p, BodyContentHandler handler) throws Exception {
boolean wlr = false;
try {
p.parse(null, handler, null, null);
} catch (SAXException e) {
- if (! WriteLimitReachedException.isWriteLimitReached(e)) {
+ if (!WriteLimitReachedException.isWriteLimitReached(e)) {
throw e;
}
@@ -292,8 +287,8 @@
assertTrue(wlr, "WriteLimitReached");
}
- //Simple mockparser that writes a title
- //and charsToWrite number of 'a'
+ // Simple mockparser that writes a title
+ // and charsToWrite number of 'a'
private static class MockParser implements Parser {
private final String XHTML = "http://www.w3.org/1999/xhtml";
private final Attributes EMPTY_ATTRIBUTES = new AttributesImpl();
@@ -311,8 +306,9 @@
}
@Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
- ParseContext context) throws IOException, SAXException, TikaException {
+ public void parse(
+ InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
handler.startDocument();
handler.startPrefixMapping("", XHTML);
handler.startElement(XHTML, "html", "html", EMPTY_ATTRIBUTES);
diff --git a/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
index 19bf853..1937ea1 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/BodyContentHandlerTest.java
@@ -23,24 +23,20 @@
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.mock.MockParser;
+import org.junit.jupiter.api.Test;
-/**
- * Test cases for the {@link BodyContentHandler} class.
- */
+/** Test cases for the {@link BodyContentHandler} class. */
public class BodyContentHandlerTest extends TikaTest {
/**
- * Test that the conversion to an {@link OutputStream} doesn't leave
- * characters unflushed in an internal buffer.
+ * Test that the conversion to an {@link OutputStream} doesn't leave characters unflushed in an
+ * internal buffer.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-179">TIKA-179</a>
*/
@@ -49,8 +45,8 @@
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
XHTMLContentHandler xhtml =
- new XHTMLContentHandler(new BodyContentHandler(
- new OutputStreamWriter(buffer, UTF_8)),
+ new XHTMLContentHandler(
+ new BodyContentHandler(new OutputStreamWriter(buffer, UTF_8)),
new Metadata());
xhtml.startDocument();
xhtml.element("p", "Test text");
@@ -61,7 +57,7 @@
@Test
public void testLimit() throws Exception {
- //TIKA-2668 - java 11-ea
+ // TIKA-2668 - java 11-ea
Parser p = new MockParser();
WriteOutContentHandler handler = new WriteOutContentHandler(15);
Metadata metadata = new Metadata();
diff --git a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java
index 8864314..5c252a6 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/CustomErrorHandlerTest.java
@@ -23,19 +23,17 @@
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import javax.xml.parsers.ParserConfigurationException;
-
import org.apache.commons.io.output.ByteArrayOutputStream;
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.utils.XMLReaderUtils;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
-import org.apache.tika.TikaTest;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.utils.XMLReaderUtils;
-
/**
* Test that validates a custom {@link ContentHandlerDecorator} can handle errors during XML parsing
*
@@ -46,12 +44,13 @@
private static String DEFAULT_SAX_PARSER_FACTORY;
private static String SAX_PARSER_FACTORY_KEY = "javax.xml.parsers.SAXParserFactory";
+
@BeforeAll
public static void setUp() throws TikaException {
DEFAULT_SAX_PARSER_FACTORY = System.getProperty(SAX_PARSER_FACTORY_KEY);
- System.setProperty(SAX_PARSER_FACTORY_KEY,
- "org.apache.tika.sax.ErrorResistantSAXParserFactory");
- //forces re-initialization
+ System.setProperty(
+ SAX_PARSER_FACTORY_KEY, "org.apache.tika.sax.ErrorResistantSAXParserFactory");
+ // forces re-initialization
XMLReaderUtils.setPoolSize(10);
}
@@ -62,9 +61,10 @@
} else {
System.setProperty(SAX_PARSER_FACTORY_KEY, DEFAULT_SAX_PARSER_FACTORY);
}
- //forces re-initialization
+ // forces re-initialization
XMLReaderUtils.setPoolSize(10);
}
+
private void extractXml(InputStream blobStream, OutputStream textStream)
throws IOException, SAXException, TikaException, ParserConfigurationException {
@@ -92,7 +92,8 @@
try {
String content = extractTestData("undeclared_entity.xml");
assertContains("START", content);
- //This assertion passes only if custom error handler is called to handle fatal exception
+ // This assertion passes only if custom error handler is called to handle fatal
+ // exception
assertContains("END", content);
} catch (SAXException e) {
fail("Exception returned from parser and not handled in error handler " + e);
diff --git a/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java
index 3ad2978..a8388f9 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/LinkContentHandlerTest.java
@@ -21,9 +21,7 @@
import org.junit.jupiter.api.Test;
import org.xml.sax.helpers.AttributesImpl;
-/**
- * Test cases for the {@link LinkContentHandler} class.
- */
+/** Test cases for the {@link LinkContentHandler} class. */
public class LinkContentHandlerTest {
/**
@@ -34,9 +32,9 @@
LinkContentHandler linkContentHandler = new LinkContentHandler(true);
linkContentHandler.startElement(XHTMLContentHandler.XHTML, "a", "", new AttributesImpl());
- char[] anchorText =
- {'\n', 'N', 'o', ' ', 'w', 'h', 'i', 't', 'e', '\n', '\t', '\t', 's', 'p', 'a', 'c',
- 'e'};
+ char[] anchorText = {
+ '\n', 'N', 'o', ' ', 'w', 'h', 'i', 't', 'e', '\n', '\t', '\t', 's', 'p', 'a', 'c', 'e'
+ };
linkContentHandler.characters(anchorText, 1, anchorText.length - 1);
linkContentHandler.endElement(XHTMLContentHandler.XHTML, "a", "");
@@ -72,7 +70,8 @@
linkContentHandler.startElement(XHTMLContentHandler.XHTML, "link", "", atts);
linkContentHandler.endElement(XHTMLContentHandler.XHTML, "link", "");
- assertEquals("http://tika.apache.org/stylesheet.css",
+ assertEquals(
+ "http://tika.apache.org/stylesheet.css",
linkContentHandler.getLinks().get(0).getUri());
assertEquals("stylesheet", linkContentHandler.getLinks().get(0).getRel());
}
@@ -90,7 +89,8 @@
linkContentHandler.startElement(XHTMLContentHandler.XHTML, "iframe", "", atts);
linkContentHandler.endElement(XHTMLContentHandler.XHTML, "iframe", "");
- assertEquals("http://tika.apache.org/iframe.html",
+ assertEquals(
+ "http://tika.apache.org/iframe.html",
linkContentHandler.getLinks().get(0).getUri());
}
@@ -107,8 +107,8 @@
linkContentHandler.startElement(XHTMLContentHandler.XHTML, "script", "", atts);
linkContentHandler.endElement(XHTMLContentHandler.XHTML, "script", "");
- assertEquals("http://tika.apache.org/script.js",
- linkContentHandler.getLinks().get(0).getUri());
+ assertEquals(
+ "http://tika.apache.org/script.js", linkContentHandler.getLinks().get(0).getUri());
}
/**
diff --git a/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java b/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java
index d903a4d..acd3399 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/NonValidatingContentHandler.java
@@ -18,13 +18,11 @@
import java.io.IOException;
import java.io.InputStream;
-
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
-
public class NonValidatingContentHandler extends ContentHandlerDecorator {
class ClosedInputStream extends InputStream {
@@ -37,7 +35,6 @@
public int read() {
return -1;
}
-
}
public NonValidatingContentHandler(ContentHandler handler) {
@@ -46,17 +43,17 @@
@Override
public void warning(SAXParseException e) throws SAXException {
- //NO-OP
+ // NO-OP
}
@Override
public void error(SAXParseException e) throws SAXException {
- //NO-OP
+ // NO-OP
}
@Override
public void fatalError(SAXParseException e) throws SAXException {
- //NO-OP
+ // NO-OP
}
@Override
@@ -64,5 +61,4 @@
throws SAXException, IOException {
return new InputSource(new ClosedInputStream());
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java
index 6c7e945..dca1220 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/OfflineContentHandlerTest.java
@@ -22,15 +22,12 @@
import java.net.ConnectException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
-
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.xml.sax.InputSource;
import org.xml.sax.helpers.DefaultHandler;
-/**
- * Unit tests for the {@link OfflineContentHandler} class.
- */
+/** Unit tests for the {@link OfflineContentHandler} class. */
public class OfflineContentHandlerTest {
private SAXParser parser;
@@ -56,14 +53,13 @@
@Test
public void testExternalEntity() throws Exception {
String xml =
- "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
- " ]><foo>&bar;</foo>";
+ "<!DOCTYPE foo ["
+ + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">"
+ + " ]><foo>&bar;</foo>";
try {
parser.parse(new InputSource(new StringReader(xml)), offline);
} catch (ConnectException e) {
fail("Parser tried to access the external DTD:" + e);
}
}
-
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java
index 47918a9..d85aab9 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/RichTextContentHandlerTest.java
@@ -21,26 +21,22 @@
import java.io.ByteArrayOutputStream;
import java.io.OutputStreamWriter;
-
+import org.apache.tika.metadata.Metadata;
import org.junit.jupiter.api.Test;
import org.xml.sax.helpers.AttributesImpl;
-import org.apache.tika.metadata.Metadata;
-
-/**
- * Test cases for the {@link RichTextContentHandler} class.
- */
+/** Test cases for the {@link RichTextContentHandler} class. */
public class RichTextContentHandlerTest {
- /**
- * Test to check img tags are detected and rich text version used.
- */
+ /** Test to check img tags are detected and rich text version used. */
@Test
public void aTagTest() throws Exception {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- XHTMLContentHandler xhtml = new XHTMLContentHandler(new RichTextContentHandler(
- new OutputStreamWriter(buffer, UTF_8)), new Metadata());
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(
+ new RichTextContentHandler(new OutputStreamWriter(buffer, UTF_8)),
+ new Metadata());
xhtml.startDocument();
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "", "name", "", "value");
@@ -50,15 +46,15 @@
assertEquals("\n\n\n\n[bookmark: value]", buffer.toString(UTF_8.name()));
}
- /**
- * Test to check a tags are detected and rich text version used.
- */
+ /** Test to check a tags are detected and rich text version used. */
@Test
public void imgTagTest() throws Exception {
ByteArrayOutputStream buffer = new ByteArrayOutputStream();
- XHTMLContentHandler xhtml = new XHTMLContentHandler(new RichTextContentHandler(
- new OutputStreamWriter(buffer, UTF_8)), new Metadata());
+ XHTMLContentHandler xhtml =
+ new XHTMLContentHandler(
+ new RichTextContentHandler(new OutputStreamWriter(buffer, UTF_8)),
+ new Metadata());
xhtml.startDocument();
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "", "alt", "", "value");
@@ -67,5 +63,4 @@
assertEquals("\n\n\n\n[image: value]", buffer.toString(UTF_8.name()));
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java
index 80d1bfd..a5bd65a 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/SafeContentHandlerTest.java
@@ -23,9 +23,7 @@
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
-/**
- * Unit tests for the {@link SafeContentHandler} class.
- */
+/** Unit tests for the {@link SafeContentHandler} class. */
public class SafeContentHandlerTest {
private ContentHandler output;
@@ -78,5 +76,4 @@
safe.ignorableWhitespace("\udb00\ubfff".toCharArray(), 0, 2);
assertEquals("\ufffd\ubfff", output.toString());
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java
index 421e6c2..f6ad542 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java
@@ -19,20 +19,16 @@
import static org.junit.jupiter.api.Assertions.fail;
import java.io.IOException;
-
import org.apache.commons.io.input.NullInputStream;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.xml.sax.helpers.DefaultHandler;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.TikaInputStream;
-
-/**
- * Tests for the {@link SecureContentHandler} class.
- */
+/** Tests for the {@link SecureContentHandler} class. */
public class SecureContentHandlerTest {
private static final int MANY_BYTES = 2000000;
@@ -50,7 +46,7 @@
@Test
public void testZeroCharactersPerByte() throws IOException {
try {
- char[] ch = new char[]{'x'};
+ char[] ch = new char[] {'x'};
for (int i = 0; i < MANY_BYTES; i++) {
stream.read();
}
@@ -160,5 +156,4 @@
}
}
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java b/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java
index 361b88d..7ae52bb 100755
--- a/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/SerializerTest.java
@@ -40,16 +40,16 @@
@Test
public void testToXMLContentHandler() throws Exception {
assertStartDocument("", new ToXMLContentHandler());
- assertStartDocument("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n",
- new ToXMLContentHandler("UTF-8"));
+ assertStartDocument(
+ "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n", new ToXMLContentHandler("UTF-8"));
assertCharacters("content", new ToXMLContentHandler());
assertCharacterEscaping("<&\">", new ToXMLContentHandler());
assertIgnorableWhitespace(" \t\r\n", new ToXMLContentHandler());
assertEmptyElement("<br />", new ToXMLContentHandler());
- assertEmptyElementWithAttributes("<meta name=\"foo\" value=\"bar\" />",
- new ToXMLContentHandler());
- assertEmptyElementWithAttributeEscaping("<p class=\"<&">\" />",
- new ToXMLContentHandler());
+ assertEmptyElementWithAttributes(
+ "<meta name=\"foo\" value=\"bar\" />", new ToXMLContentHandler());
+ assertEmptyElementWithAttributeEscaping(
+ "<p class=\"<&">\" />", new ToXMLContentHandler());
assertElement("<p>content</p>", new ToXMLContentHandler());
assertElementWithAttributes("<p class=\"test\">content</p>", new ToXMLContentHandler());
}
@@ -61,10 +61,10 @@
assertCharacterEscaping("<&\">", new ToHTMLContentHandler());
assertIgnorableWhitespace(" \t\r\n", new ToHTMLContentHandler());
assertEmptyElement("<br>", new ToHTMLContentHandler());
- assertEmptyElementWithAttributes("<meta name=\"foo\" value=\"bar\">",
- new ToHTMLContentHandler());
- assertEmptyElementWithAttributeEscaping("<p class=\"<&">\"></p>",
- new ToHTMLContentHandler());
+ assertEmptyElementWithAttributes(
+ "<meta name=\"foo\" value=\"bar\">", new ToHTMLContentHandler());
+ assertEmptyElementWithAttributeEscaping(
+ "<p class=\"<&">\"></p>", new ToHTMLContentHandler());
assertElement("<p>content</p>", new ToHTMLContentHandler());
assertElementWithAttributes("<p class=\"test\">content</p>", new ToHTMLContentHandler());
}
@@ -133,5 +133,4 @@
handler.endElement("", "p", "p");
assertEquals(expected, handler.toString());
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
index 136c62b..f0e993b 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
@@ -21,18 +21,14 @@
import java.util.ArrayList;
import java.util.List;
-
+import org.apache.tika.metadata.Metadata;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
-import org.apache.tika.metadata.Metadata;
-
-/**
- * Unit tests for the {@link XHTMLContentHandler} class.
- */
+/** Unit tests for the {@link XHTMLContentHandler} class. */
public class XHTMLContentHandlerTest {
private ContentHandler output;
@@ -40,8 +36,8 @@
private XHTMLContentHandler xhtml;
/**
- * Return array of non-zerolength words. Splitting on whitespace will get us
- * empty words for emptylines.
+ * Return array of non-zerolength words. Splitting on whitespace will get us empty words for
+ * emptylines.
*
* @param string some mix of newlines and real words
* @return array of real words.
@@ -65,8 +61,7 @@
}
/**
- * Test that content in block elements are properly separated in text
- * output.
+ * Test that content in block elements are properly separated in text output.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-188">TIKA-188</a>
*/
@@ -104,8 +99,7 @@
}
/**
- * Test that content in option elements are properly separated in text
- * output.
+ * Test that content in option elements are properly separated in text output.
*
* @see <a href="https://issues.apache.org/jira/browse/TIKA-394">TIKA-394</a>
*/
@@ -149,8 +143,8 @@
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", "");
- attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "",
- "http://schema.org/Event");
+ attributes.addAttribute(
+ XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", "http://schema.org/Event");
xhtmlContentHandler.startDocument();
xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "body", "body", attributes);
@@ -168,8 +162,8 @@
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute(XHTMLContentHandler.XHTML, "itemscope", "itemscope", "", "");
- attributes.addAttribute(XHTMLContentHandler.XHTML, "itemtype", "itemtype", "",
- "http://schema.org/Event");
+ attributes.addAttribute(
+ XHTMLContentHandler.XHTML, "itemtype", "itemtype", "", "http://schema.org/Event");
xhtmlContentHandler.startDocument();
xhtmlContentHandler.startElement(XHTMLContentHandler.XHTML, "html", "html", attributes);
@@ -217,5 +211,4 @@
assertEquals(1, words.length);
assertEquals("a\ufffdz", words[0]);
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java b/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java
index 2a3f1d4..324e90e 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/xpath/XPathParserTest.java
@@ -130,5 +130,4 @@
assertFalse(matcher.matchesAttribute(NS, "name"));
assertFalse(matcher.matchesAttribute(NS, "eman"));
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java
index 1f05631..702e7f9 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/AnnotationUtilsTest.java
@@ -20,17 +20,14 @@
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.junit.jupiter.api.Assertions.fail;
+import aQute.bnd.annotation.metatype.Configurable;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
-
-import aQute.bnd.annotation.metatype.Configurable;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.config.Field;
import org.apache.tika.config.Param;
import org.apache.tika.exception.TikaConfigException;
-
+import org.junit.jupiter.api.Test;
/**
* @since 6/1/16
@@ -63,7 +60,7 @@
AnnotationUtils.assignFieldParams(new MyParser(), params);
fail("Exception expected");
} catch (TikaConfigException e) {
- //expected
+ // expected
}
}
@@ -73,6 +70,7 @@
class MyParser extends Configurable {
@Field(required = true)
int config;
+
@Field(required = true, name = "config")
Integer config2;
}
@@ -89,7 +87,6 @@
e.printStackTrace();
fail("Exception Not expected");
}
-
}
@Test
@@ -117,25 +114,25 @@
AnnotationUtils.assignFieldParams(new MyParser(), params);
fail("Exception expected");
} catch (TikaConfigException e) {
- //expected
+ // expected
}
}
-
@Test
public void testParserInheritance() {
class Parent {
@Field(required = true)
int overridden;
+
@Field(required = true)
int parentField;
-
}
class Child extends Parent {
@Field(required = true)
int overridden;
+
@Field(required = true)
int childField;
}
@@ -162,11 +159,10 @@
AnnotationUtils.assignFieldParams(new Child(), params);
fail("Exception expected, parent class field not set");
} catch (TikaConfigException e) {
- //expected
+ // expected
}
}
-
@Test
public void testParamValueInheritance() {
@@ -193,10 +189,8 @@
AnnotationUtils.assignFieldParams(parser, params);
fail("Exception expected, Date is not assignable to CharSequence.");
} catch (TikaConfigException e) {
- //expected
+ // expected
}
-
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
index 8a6574a..55d3d14 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/CharsetUtilsTest.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.utils;
-
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
@@ -68,5 +67,4 @@
assertEquals("KOI8-R", CharsetUtils.clean("koi8r"));
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
index 6ac9d72..57bddde 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/ConcurrentUtilsTest.java
@@ -20,25 +20,26 @@
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Future;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.config.TikaConfig;
import org.apache.tika.parser.ParseContext;
+import org.junit.jupiter.api.Test;
public class ConcurrentUtilsTest {
@Test
public void testExecuteThread() throws Exception {
ParseContext context = new ParseContext();
- Future result = ConcurrentUtils.execute(context, new Runnable() {
+ Future result =
+ ConcurrentUtils.execute(
+ context,
+ new Runnable() {
- @Override
- public void run() {
- //Do nothing
+ @Override
+ public void run() {
+ // Do nothing
- }
- });
+ }
+ });
assertNull(result.get());
}
@@ -48,16 +49,18 @@
TikaConfig config = TikaConfig.getDefaultConfig();
ParseContext context = new ParseContext();
context.set(ExecutorService.class, config.getExecutorService());
- Future result = ConcurrentUtils.execute(context, new Runnable() {
+ Future result =
+ ConcurrentUtils.execute(
+ context,
+ new Runnable() {
- @Override
- public void run() {
- //Do nothing
+ @Override
+ public void run() {
+ // Do nothing
- }
- });
+ }
+ });
assertNull(result.get());
}
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java
index 030836e..0ff44f2 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/RegexUtilsTest.java
@@ -21,7 +21,6 @@
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.List;
-
import org.junit.jupiter.api.Test;
/**
@@ -31,10 +30,7 @@
*/
public class RegexUtilsTest {
- /**
- * Test {@link RegexUtils#extractLinks(String)} with no links.
- */
-
+ /** Test {@link RegexUtils#extractLinks(String)} with no links. */
@Test
public void testExtractLinksNone() {
List<String> links = null;
@@ -52,31 +48,28 @@
assertEquals(0, links.size());
}
-
- /**
- * Test {@link RegexUtils#extractLinks(String)} for http.
- */
+ /** Test {@link RegexUtils#extractLinks(String)} for http. */
@Test
public void testExtractLinksHttp() {
- List<String> links = RegexUtils.extractLinks(
- "Test with http://www.nutch.org/index.html is it found? " +
- "What about www.google.com at http://www.google.de " +
- "A longer URL could be http://www.sybit.com/solutions/portals.html");
+ List<String> links =
+ RegexUtils.extractLinks(
+ "Test with http://www.nutch.org/index.html is it found? "
+ + "What about www.google.com at http://www.google.de "
+ + "A longer URL could be http://www.sybit.com/solutions/portals.html");
assertTrue(links.size() == 3, "Url not found!");
assertEquals("http://www.nutch.org/index.html", links.get(0), "Wrong URL");
assertEquals("http://www.google.de", links.get(1), "Wrong URL");
- assertEquals("http://www.sybit.com/solutions/portals.html", links.get(2),
- "Wrong URL");
+ assertEquals("http://www.sybit.com/solutions/portals.html", links.get(2), "Wrong URL");
}
- /**
- * Test {@link RegexUtils#extractLinks(String)} for ftp.
- */
+ /** Test {@link RegexUtils#extractLinks(String)} for ftp. */
@Test
public void testExtractLinksFtp() {
- List<String> links = RegexUtils.extractLinks("Test with ftp://www.nutch.org is it found? " +
- "What about www.google.com at ftp://www.google.de");
+ List<String> links =
+ RegexUtils.extractLinks(
+ "Test with ftp://www.nutch.org is it found? "
+ + "What about www.google.com at ftp://www.google.de");
assertTrue(links.size() == 2, "Url not found!");
assertEquals("ftp://www.nutch.org", links.get(0), "Wrong URL");
diff --git a/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
index 199c003..712cffc 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/ServiceLoaderUtilsTest.java
@@ -23,26 +23,30 @@
import java.util.Collections;
import java.util.List;
import java.util.Random;
-
import org.apache.custom.detect.MyCustomDetector;
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.detect.Detector;
import org.apache.tika.detect.EmptyDetector;
import org.apache.tika.detect.FileCommandDetector;
import org.apache.tika.detect.OverrideDetector;
import org.apache.tika.detect.ZeroSizeFileDetector;
+import org.junit.jupiter.api.Test;
public class ServiceLoaderUtilsTest {
@Test
public void testSort() throws Exception {
- //OverrideDetector is moved to index 0
- //by the private service loading in DefaultDetector.
- //This tests that a custom detector always comes first
- //and then reverse alphabetical order
- Detector[] detectors = new Detector[]{new MyCustomDetector(), new EmptyDetector(),
- new FileCommandDetector(), new OverrideDetector(), new ZeroSizeFileDetector()};
+ // OverrideDetector is moved to index 0
+ // by the private service loading in DefaultDetector.
+ // This tests that a custom detector always comes first
+ // and then reverse alphabetical order
+ Detector[] detectors =
+ new Detector[] {
+ new MyCustomDetector(),
+ new EmptyDetector(),
+ new FileCommandDetector(),
+ new OverrideDetector(),
+ new ZeroSizeFileDetector()
+ };
List<Detector> expected = Arrays.asList(detectors);
List<Detector> shuffled = new ArrayList<>(expected);
Random random = new Random(42);
@@ -52,6 +56,4 @@
assertEquals(expected, shuffled, "failed on iteration " + i);
}
}
-
-
}
diff --git a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
index 18e2535..828ca25 100644
--- a/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/utils/XMLReaderUtilsTest.java
@@ -21,20 +21,20 @@
import java.io.ByteArrayInputStream;
import java.net.ConnectException;
import java.nio.charset.StandardCharsets;
-
-import org.junit.jupiter.api.Test;
-
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToTextContentHandler;
+import org.junit.jupiter.api.Test;
public class XMLReaderUtilsTest {
- //make sure that parseSAX actually defends against external entities
+ // make sure that parseSAX actually defends against external entities
@Test
public void testExternalDTD() throws Exception {
String xml = "<!DOCTYPE foo SYSTEM \"http://127.234.172.38:7845/bar\"><foo/>";
try {
- XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
- new ToTextContentHandler(), new ParseContext());
+ XMLReaderUtils.parseSAX(
+ new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+ new ToTextContentHandler(),
+ new ParseContext());
} catch (ConnectException e) {
fail("Parser tried to access the external DTD:" + e);
}
@@ -43,11 +43,14 @@
@Test
public void testExternalEntity() throws Exception {
String xml =
- "<!DOCTYPE foo [" + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">" +
- " ]><foo>&bar;</foo>";
+ "<!DOCTYPE foo ["
+ + " <!ENTITY bar SYSTEM \"http://127.234.172.38:7845/bar\">"
+ + " ]><foo>&bar;</foo>";
try {
- XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
- new ToTextContentHandler(), new ParseContext());
+ XMLReaderUtils.parseSAX(
+ new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)),
+ new ToTextContentHandler(),
+ new ParseContext());
} catch (ConnectException e) {
fail("Parser tried to access the external DTD:" + e);
}
diff --git a/tika-parent/checkstyle.xml b/tika-parent/checkstyle.xml
index 55dc19f..7c15591 100644
--- a/tika-parent/checkstyle.xml
+++ b/tika-parent/checkstyle.xml
@@ -36,109 +36,7 @@
<property name="fileExtensions" value="java, properties, xml"/>
- <!-- <module name="SuppressionCommentFilter"/> -->
-
- <module name="RegexpHeader">
- <property name="header"
- value="^.*$\n^\W*Licensed to the Apache Software Foundation \(ASF\) under one or more$"/>
- </module>
-
- <!-- Checks for whitespace -->
- <!-- See http://checkstyle.sf.net/config_whitespace.html -->
- <module name="FileTabCharacter">
- <property name="eachLine" value="true"/>
- </module>
- <module name="LineLength">
- <property name="max" value="180"/>
- <property name="ignorePattern" value="^package.*|^import.*|a href|href|http://|https://|ftp://"/>
- </module>
- <module name="NewlineAtEndOfFile">
- <property name="lineSeparator" value="lf"/>
- </module>
-
- <module name="RegexpMultiline">
- <property name="format" value="\r\n"/>
- <property name="message" value="CRLF line endings are prohibited"/>
- </module>
-
<module name="TreeWalker">
- <!--<module name="FileContentsHolder"/>-->
- <module name="IllegalImport">
- <property name="regexp" value="true"/>
- <!-- Reject any org.junit import that's not also org.junit.jupiter: -->
- <property name="illegalClasses" value="^org\.junit\.(?!jupiter\.).+"/>
- </module>
- <module name="OuterTypeFilename"/>
- <module name="IllegalTokenText">
- <property name="tokens" value="STRING_LITERAL, CHAR_LITERAL"/>
- <property name="format" value="\\u00(08|09|0(a|A)|0(c|C)|0(d|D)|22|27|5(C|c))|\\(0(10|11|12|14|15|42|47)|134)"/>
- <property name="message" value="Avoid using corresponding octal or Unicode escape."/>
- </module>
<module name="AvoidStarImport"/>
- <module name="UnusedImports"/>
- <module name="OneTopLevelClass"/>
- <module name="NoLineWrap"/>
- <!--<module name="NeedBraces"/>-->
- <!--<module name="LeftCurly">-->
- <!--<property name="maxLineLength" value="100"/>-->
- <!--</module>-->
- <!--<module name="RightCurly"/>-->
- <!--<module name="RightCurly">-->
- <!--<property name="option" value="alone"/>-->
- <!--<property name="tokens" value="CLASS_DEF, METHOD_DEF, CTOR_DEF, LITERAL_FOR, LITERAL_WHILE, LITERAL_DO, STATIC_INIT, INSTANCE_INIT"/>-->
- <!--</module>-->
- <module name="WhitespaceAround">
- <property name="allowEmptyConstructors" value="true"/>
- <property name="allowEmptyMethods" value="true"/>
- <property name="allowEmptyTypes" value="true"/>
- <property name="allowEmptyLoops" value="true"/>
- </module>
- <module name="OneStatementPerLine"/>
- <module name="PackageName">
- <property name="format" value="^[a-z]+(\.[a-z][a-z0-9]*)*$"/>
- <message key="name.invalidPattern"
- value="Package name ''{0}'' must match pattern ''{1}''."/>
- </module>
- <module name="MethodTypeParameterName">
- <property name="format" value="(^[A-Z][0-9]?)$|([A-Z][a-zA-Z0-9]*[T]$)"/>
- <message key="name.invalidPattern"
- value="Method type name ''{0}'' must match pattern ''{1}''."/>
- </module>
- <module name="InterfaceTypeParameterName">
- <property name="format" value="(^[A-Z][0-9]?)$|([A-Z][a-zA-Z0-9]*[T]$)"/>
- <message key="name.invalidPattern"
- value="Interface type name ''{0}'' must match pattern ''{1}''."/>
- </module>
- <module name="GenericWhitespace">
- <message key="ws.followed"
- value="GenericWhitespace ''{0}'' is followed by whitespace."/>
- <message key="ws.preceded"
- value="GenericWhitespace ''{0}'' is preceded with whitespace."/>
- <message key="ws.illegalFollow"
- value="GenericWhitespace ''{0}'' should followed by whitespace."/>
- <message key="ws.notPreceded"
- value="GenericWhitespace ''{0}'' is not preceded with whitespace."/>
- </module>
- <module name="Indentation">
- <property name="basicOffset" value="4"/>
- <property name="braceAdjustment" value="0"/>
- <property name="caseIndent" value="4"/>
- <property name="throwsIndent" value="8"/>
- <property name="lineWrappingIndentation" value="8"/>
- <property name="arrayInitIndent" value="4"/>
- <property name="severity" value="error"/>
- </module>
- <module name="EmptyCatchBlock">
- <property name="exceptionVariableName" value="expected|ignore"/>
- </module>
- <module name="CustomImportOrder">
- <property name="sortImportsInGroupAlphabetically" value="true"/>
- <property name="separateLineBetweenGroups" value="true"/>
- <property name="specialImportsRegExp" value="apache\.tika\."/>
- <property name="customImportOrderRules"
- value="STATIC###STANDARD_JAVA_PACKAGE###THIRD_PARTY_PACKAGE###SPECIAL_IMPORTS"/>
- </module>
- <module name="EqualsHashCode"/>
- <module name="ArrayTypeStyle"/>
</module>
</module>
diff --git a/tika-parent/pom.xml b/tika-parent/pom.xml
index 3c331ee..e286f97 100644
--- a/tika-parent/pom.xml
+++ b/tika-parent/pom.xml
@@ -1273,8 +1273,8 @@
</execution>
</executions>
</plugin>
- <plugin>
- <!-- mvn validate -->
+<!-- <plugin>
+ < ! - - mvn validate - - >
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-checkstyle-plugin</artifactId>
<version>${checkstyle.plugin.version}</version>
@@ -1303,7 +1303,7 @@
</goals>
</execution>
</executions>
- </plugin>
+ </plugin> -->
</plugins>
</build>