Merge branch 'TIKA-2672' of https://github.com/ThejanW/tika into TIKA-2672
diff --git a/NOTICE.txt b/NOTICE.txt
index 7b50eb1..2bae78e 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -15,3 +15,7 @@
OpenCSV: Copyright 2005 Bytecode Pty Ltd. Licensed under the Apache License, Version 2.0
IPTC Photo Metadata descriptions Copyright 2010 International Press Telecommunications Council.
+
+Tika-mimetypes.xml includes mimetype definitions that were adapted from the PRONOM Technical Registry
+by The National Archives (http://www.nationalarchives.gov.uk/PRONOM/Default.aspx). PRONOM is published
+under the Open Government License 3.0 (http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/)
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 785acc7..96f922f 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -502,6 +502,8 @@
*/
private Object openContainer;
+ private int consecutiveEOFs = 0;
+
/**
* Creates a TikaInputStream instance. This private constructor is used
* by the static factory methods based on the available information.
@@ -718,6 +720,7 @@
super.reset();
position = mark;
mark = -1;
+ consecutiveEOFs = 0;
}
@Override
@@ -735,9 +738,15 @@
}
@Override
- protected void afterRead(int n) {
+ protected void afterRead(int n) throws IOException {
if (n != -1) {
position += n;
+ } else {
+ consecutiveEOFs++;
+ if (consecutiveEOFs > 1000) {
+ throw new IOException("Read too many -1 (EOFs); there could be an infinite loop. " +
+ "If you think your file is not corrupt, please open an issue on Tika's JIRA");
+ }
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
index 813eda0..65938be 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
@@ -21,6 +21,7 @@
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
+import java.util.concurrent.ConcurrentHashMap;
/**
* Registry of known Internet media types.
@@ -46,7 +47,7 @@
* as a mapping from the alias to the corresponding canonical type.
*/
private final Map<MediaType, MediaType> registry =
- new HashMap<MediaType, MediaType>();
+ new ConcurrentHashMap<>();
/**
* Known type inheritance relationships. The mapping is from a media type
@@ -74,7 +75,7 @@
* @return known aliases
*/
public SortedSet<MediaType> getAliases(MediaType type) {
- SortedSet<MediaType> aliases = new TreeSet<MediaType>();
+ SortedSet<MediaType> aliases = new TreeSet<>();
for (Map.Entry<MediaType, MediaType> entry : registry.entrySet()) {
if (entry.getValue().equals(type) && !entry.getKey().equals(type)) {
aliases.add(entry.getKey());
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
index 4acfe01..0009ac9 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
@@ -30,6 +30,7 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
import javax.xml.namespace.QName;
@@ -103,8 +104,7 @@
private final MediaTypeRegistry registry = new MediaTypeRegistry();
/** All the registered MimeTypes indexed on their canonical names */
- private final Map<MediaType, MimeType> types =
- new HashMap<MediaType, MimeType>();
+ private final Map<MediaType, MimeType> types = new HashMap<>();
/** The patterns matcher */
private Patterns patterns = new Patterns(registry);
@@ -426,7 +426,6 @@
*
* @return the minimum length of data to provide.
* @see #getMimeType(byte[])
- * @see #getMimeType(String, byte[])
*/
public int getMinLength() {
// This needs to be reasonably large to be able to correctly detect
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 104cd2c..e12dd02 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -1561,9 +1561,69 @@
<mime-type type="application/vnd.llamagraphics.life-balance.exchange+xml">
<glob pattern="*.lbe"/>
</mime-type>
+
<mime-type type="application/vnd.lotus-1-2-3">
+ <alias type="application/x-123"/>
+ <_comment>Lotus 1-2-3</_comment>
+ <!-- <glob pattern="*.wks"/> - conflicts with application/vnd.ms-works -->
+ <glob pattern="*.wk1"/>
+ <glob pattern="*.wk2"/>
+ <glob pattern="*.wk3"/>
+ <glob pattern="*.wk4"/>
<glob pattern="*.123"/>
</mime-type>
+
+ <mime-type type="application/vnd.lotus-1-2-3;version=1">
+ <sub-class-of type="application/vnd.lotus-1-2-3"/>
+ <_comment>Lotus 1-2-3, version 1</_comment>
+ <magic priority="50">
+ <match value="0x000002000404" type="string" offset="0">
+ <!-- <glob pattern="*.wks"/> - conflicts with application/vnd.ms-works -->
+ </match>
+ </magic>
+ </mime-type>
+
+ <mime-type type="application/vnd.lotus-1-2-3;version=2">
+ <sub-class-of type="application/vnd.lotus-1-2-3"/>
+ <_comment>Lotus 1-2-3, version 2</_comment>
+ <magic priority="50">
+ <match value="0x00000200060406000800" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wk1"/>
+ <glob pattern="*.wk2"/>
+ </mime-type>
+
+ <mime-type type="application/vnd.lotus-1-2-3;version=3">
+ <sub-class-of type="application/vnd.lotus-1-2-3"/>
+ <_comment>Lotus 1-2-3, version 3</_comment>
+ <magic priority="50">
+ <match value="0x00001A0000100400" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wk3"/>
+ </mime-type>
+
+ <mime-type type="application/vnd.lotus-1-2-3;version=4">
+ <sub-class-of type="application/vnd.lotus-1-2-3"/>
+ <_comment>Lotus 1-2-3, version 4-5</_comment>
+ <magic priority="50">
+ <match value="0x00001A0002100400" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wk4"/>
+ </mime-type>
+
+ <mime-type type="application/vnd.lotus-1-2-3;version=97+9.x">
+ <sub-class-of type="application/vnd.lotus-1-2-3"/>
+ <_comment>Lotus 1-2-3, version 97/9.x</_comment>
+ <magic priority="50">
+ <match value="0x00001A0003100400" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.123"/>
+ </mime-type>
+
<mime-type type="application/vnd.lotus-approach">
<glob pattern="*.apr"/>
</mime-type>
@@ -2728,6 +2788,14 @@
</mime-type>
<!-- TODO: figure out how to identify earlier versions -->
+ <mime-type type="application/vnd.wordperfect;version=4.2">
+ <sub-class-of type="application/vnd.wordperfect"/>
+ <magic priority="50">
+ <match value="0xCB0A01" type="string" offset="0">
+ <match value="0xCB" type="string" offset="5"/>
+ </match>
+ </magic>
+ </mime-type>
<mime-type type="application/vnd.wordperfect;version=5.0">
<sub-class-of type="application/vnd.wordperfect"/>
<magic priority="50">
@@ -2863,13 +2931,6 @@
<!-- <glob pattern="*.vst"/> --> <!-- conflicting with application/vnd.visio-->
</mime-type>
- <mime-type type="application/x-123">
- <magic priority="50">
- <match value="0x00001a00" type="big32" offset="0" />
- <match value="0x00000200" type="big32" offset="0" />
- </magic>
- </mime-type>
-
<mime-type type="application/x-abiword">
<glob pattern="*.abw"/>
</mime-type>
@@ -3971,12 +4032,65 @@
</_comment>
<!-- qp2 and wb3 are currently detected by POIFSContainerDetector
TODO: add detection for wb2 and wb1 -->
+ <glob pattern="*.wq1"/>
+ <glob pattern="*.wq2"/>
+ <glob pattern="*.wkq"/>
<glob pattern="*.qpw"/>
<glob pattern="*.wb1"/>
<glob pattern="*.wb2"/>
<glob pattern="*.wb3"/>
</mime-type>
+ <mime-type type="application/x-quattro-pro;version=1-4">
+ <sub-class-of type="application/x-quattro-pro"/>
+ <_comment>Quattro Pro for DOS, version 1-4</_comment>
+ <magic priority="50">
+ <match value="0x000002002051" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wq1"/>
+ <glob pattern="*.wkq"/>
+ </mime-type>
+
+ <mime-type type="application/x-quattro-pro;version=5">
+ <sub-class-of type="application/x-quattro-pro"/>
+ <_comment>Quattro Pro for DOS, version 5</_comment>
+ <magic priority="50">
+ <match value="0x000002002151" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wq2"/>
+ <glob pattern="*.wkq"/>
+ </mime-type>
+
+ <!-- First Quattro Pro for Windows had major version number 1, which
+ was followed by 5. This is confusing, perhaps adding "win" qualifier to version
+ field could clear this up (but it's quite ugly as well) -->
+
+ <mime-type type="application/x-quattro-pro;version=1+5">
+ <sub-class-of type="application/x-quattro-pro"/>
+ <_comment>Quattro Pro for Windows, version 1, 5</_comment>
+ <magic priority="50">
+ <match value="0x000002000110" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wb1"/>
+ </mime-type>
+
+ <mime-type type="application/x-quattro-pro;version=6">
+ <sub-class-of type="application/x-quattro-pro"/>
+ <_comment>Quattro Pro for Windows, version 6</_comment>
+ <magic priority="50">
+ <match value="0x000002000210" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wb2"/>
+ </mime-type>
+
+ <!-- Quattro Pro for Windows 7-8 (wb3) and 9 (qpw) files are
+ currently detected by POIFSContainerDetector
+ -->
+
<mime-type type="application/xquery">
<_comment>XQuery source code</_comment>
<glob pattern="*.xq"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
index b94b095..a511538 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
@@ -25,8 +25,10 @@
import java.io.ByteArrayInputStream;
import java.lang.reflect.Field;
import java.util.ArrayList;
+import java.util.ConcurrentModificationException;
import java.util.List;
import java.util.Set;
+import java.util.concurrent.Executors;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
@@ -49,6 +51,8 @@
*/
public class MimeTypesReaderTest {
+ static boolean stop = false;
+
private MimeTypes mimeTypes;
private List<Magic> magics;
@@ -280,4 +284,22 @@
assertEquals(name, mimeType.toString());
assertEquals(".ditamap", mimeType.getExtension());
}
+
+ @Test
+ public void testMultiThreaded() throws Exception {
+ MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes();
+ Executors.newSingleThreadExecutor().execute(()-> {
+ try {
+ for (int i = 0; i < 500 && !stop; i++) {
+ mimeTypes.forName("abc"+i+"/abc");
+ }
+ } catch (MimeTypeException e ) {
+ e.printStackTrace();
+ }}
+ );
+
+ for (int i = 0; i < 500 && !stop; i++) {
+ mimeTypes.getMediaTypeRegistry().getAliases(MediaType.APPLICATION_ZIP);
+ }
+ }
}
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index e67609e..ff289d8 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -44,8 +44,8 @@
<brotli.version>0.1.2</brotli.version>
<mime4j.version>0.8.1</mime4j.version>
<vorbis.version>0.8</vorbis.version>
- <pdfbox.version>2.0.9</pdfbox.version>
- <jempbox.version>1.8.13</jempbox.version>
+ <pdfbox.version>2.0.11</pdfbox.version>
+ <jempbox.version>1.8.15</jempbox.version>
<netcdf-java.version>4.5.5</netcdf-java.version>
<sis.version>0.8</sis.version>
<parso.version>2.0.9</parso.version>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
index 84141b9..c4c5188 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
@@ -83,7 +83,7 @@
private static final Pattern HTTP_META_PATTERN = Pattern.compile(
- "(?is)<\\s*meta\\s+([^<>]+)"
+ "(?is)<\\s*meta(?:/|\\s+)([^<>]+)"
);
//this should match both the older:
@@ -97,7 +97,7 @@
//For a more general "not" matcher, try:
//("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)")
private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile(
- ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
+ ("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
);
private static final Charset ASCII = Charset.forName("US-ASCII");
@@ -154,6 +154,10 @@
if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
continue;
}
+ if ("x-user-defined".equalsIgnoreCase(candCharset)) {
+ candCharset = "windows-1252";
+ }
+
if (CharsetUtils.isSupported(candCharset)) {
try {
return CharsetUtils.forName(candCharset);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
index a1ef0da..adf591a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -29,6 +29,8 @@
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
@@ -90,6 +94,27 @@
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+ TemporaryResources tmp = null;
+ try {
+ if (!TikaInputStream.isTikaInputStream(stream)) {
+ tmp = new TemporaryResources();
+ stream = TikaInputStream.get(stream, tmp);
+ }
+ //AutoDetectReader can throw exceptions during
+ //initialization. If we just created a
+ //TemporaryResources, we need to make sure to close it.
+ parseImpl(stream, handler, metadata, context);
+ } finally {
+ if (tmp != null) {
+ tmp.close();
+ }
+ }
+
+ }
+
+
+ private void parseImpl(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
// Automatically detect the character encoding
try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
metadata, getEncodingDetector(context))) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
new file mode 100644
index 0000000..487f747
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
@@ -0,0 +1,491 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static java.nio.charset.StandardCharsets.*;
+import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SequenceMatcher.caseInsensitive;
+import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SingleByteMatcher.matchers;
+
+/**
+ * This is a strict html encoding detector that enforces the standard
+ * far more strictly than the HtmlEncodingDetector.
+ */
+public class StrictHtmlEncodingDetector implements EncodingDetector {
+ private static final String CHARSET_LABEL_FILE = "whatwg-encoding-labels.tsv";
+ private static Map<String, Charset> CHARSET_LABELS = getCharsetLabels();
+
+ private static Map<String, Charset> getCharsetLabels() {
+ String path = StrictHtmlEncodingDetector.class.getPackage().getName().replace('.', '/');
+ String filename = '/' + path + '/' + CHARSET_LABEL_FILE;
+ InputStream inputStream = StrictHtmlEncodingDetector.class.getResourceAsStream(filename);
+ Objects.requireNonNull(inputStream, "Missing charset label mapping file : " + filename);
+ try (BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.US_ASCII))) {
+ return buffer.lines()
+ .filter(s -> !s.startsWith("#"))
+ .map(s -> s.split("\t"))
+ .filter(parts -> parts.length >= 2)
+ .collect(Collectors.toMap(
+ parts -> parts[0],
+ StrictHtmlEncodingDetector::charsetFromStandard
+ ));
+ } catch (IOException e) {
+ throw new UncheckedIOException("Unable to read the charset label mapping", e);
+ }
+ }
+
+ private static Charset charsetFromStandard(String[] names) {
+ for (int i = 1; i < names.length; i++) {
+ try {
+ return Charset.forName(names[i]);
+ } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {/* pass */}
+ }
+ // The only single-byte charset extended charset that must be present on every Java platform
+ return StandardCharsets.ISO_8859_1;
+ }
+
+ private static Charset getCharsetByLabel(String label) {
+ if (label == null) return null;
+ label = label.trim().toLowerCase(Locale.US);
+ return CHARSET_LABELS.get(label);
+ }
+
+ @Override
+ public Charset detect(InputStream input, Metadata metadata) throws IOException {
+ PreScanner preScanner = new PreScanner(input);
+
+ // If there is a BOM at the beginning, the detection does not go further
+ Charset bomCharset = preScanner.detectBOM();
+ if (bomCharset != null) return bomCharset;
+
+ // Assume that if there was a charset specified either by the end user or the transport level,
+ // it was stored in the metadata
+ String incomingCharsetName = metadata.get(Metadata.CONTENT_ENCODING);
+ if (incomingCharsetName != null) {
+ Charset incomingCharset = getCharsetByLabel(incomingCharsetName);
+ if (incomingCharset != null) return incomingCharset;
+ }
+
+ return preScanner.scan();
+ }
+
+ static class PreScanner {
+
+ private static final Pattern META_CHARSET_PATTERN = Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1");
+ private static ByteMatcher COMMENT_START = new SequenceMatcher("<!--");
+ private static ByteMatcher COMMENT_END = new SequenceMatcher("-->");
+ private static ByteMatcher LETTER = new OrMatcher(
+ new RangeMatcher((byte) 'a', (byte) 'z'),
+ new RangeMatcher((byte) 'A', (byte) 'Z')
+ );
+ private static ByteMatcher SPACE = new OrMatcher(matchers(0x09, 0x0A, 0x0C, 0x0D, 0x20));
+ private static ByteMatcher SLASH = new SingleByteMatcher((byte) '/');
+ private static ByteMatcher EQUAL = new SingleByteMatcher((byte) '=');
+ private static ByteMatcher TAG_END = new SingleByteMatcher((byte) '>');
+ private static ByteMatcher SINGLE_QUOTE = new SingleByteMatcher((byte) '\'');
+ private static ByteMatcher DOUBLE_QUOTE = new SingleByteMatcher((byte) '"');
+ private static ByteMatcher QUOTE = new OrMatcher(SINGLE_QUOTE, DOUBLE_QUOTE);
+ private static ByteMatcher TAG_END_OR_SLASH = new OrMatcher(SLASH, TAG_END);
+ private static ByteMatcher SPACE_OR_SLASH = new OrMatcher(SPACE, SLASH);
+ private static ByteMatcher SPACE_OR_TAG_END = new OrMatcher(SPACE, TAG_END);
+ private static ByteMatcher META_START = new SequenceMatcher(caseInsensitive("<meta"), SPACE_OR_SLASH);
+ private static ByteMatcher TAG_START = new SequenceMatcher(
+ new SingleByteMatcher((byte) '<'),
+ new OrMatcher(SLASH, LETTER)
+ );
+ private static ByteMatcher TAG_BODY = new NegativeMatcher(new OrMatcher(SPACE, TAG_END));
+ private static ByteMatcher SPECIAL_TAG_START = new SequenceMatcher(
+ new SingleByteMatcher((byte) '<'),
+ new OrMatcher(matchers("!/?"))
+ );
+ private static ByteMatcher UTF8_BOM = new SequenceMatcher(matchers(0xEF, 0xBB, 0xBF));
+ private static ByteMatcher UTF16_BE_BOM = new SequenceMatcher(matchers(0xFE, 0xFF));
+ private static ByteMatcher UTF16_LE_BOM = new SequenceMatcher(matchers(0xFF, 0xFE));
+
+
+ PushbackInputStream stream;
+ private CharsetDetectionResult detectedCharset = new CharsetDetectionResult();
+
+ public PreScanner(InputStream inputStream) {
+ this.stream = new PushbackInputStream(inputStream, 32);
+ }
+
+ public Charset scan() {
+ while (processAtLeastOneByte()) {
+ if (detectedCharset.isFound()) {
+ return detectedCharset.getCharset();
+ }
+ }
+ return null;
+ }
+
+ private Charset detectBOM() {
+ try {
+ if (UTF8_BOM.matches(stream)) return StandardCharsets.UTF_8;
+ else if (UTF16_BE_BOM.matches(stream)) return StandardCharsets.UTF_16BE;
+ else if (UTF16_LE_BOM.matches(stream)) return StandardCharsets.UTF_16LE;
+ } catch (IOException e) { /* stream could not be read, also return null */ }
+ return null;
+ }
+
+ private boolean processAtLeastOneByte() {
+ try {
+ return processComment() ||
+ processMeta() ||
+ processTag() ||
+ processSpecialTag() ||
+ processAny();
+ } catch (IOException e) {
+ return false;
+ }
+ }
+
+ private boolean processAny() throws IOException {
+ int read = stream.read();
+ return read != -1;
+ }
+
+ private boolean hasBytes() throws IOException {
+ int read = stream.read();
+ if (read != -1) stream.unread(read);
+ return read != -1;
+ }
+
+ private boolean processComment() throws IOException {
+ if (COMMENT_START.matches(stream)) {
+ // The two '-' in the '-->' sequence can be the same as those in the '<!--' sequence.
+ stream.unread("--".getBytes(StandardCharsets.US_ASCII));
+ return COMMENT_END.advanceUntilMatches(stream);
+ }
+ return false;
+ }
+
+ private boolean processTag() throws IOException {
+ if (TAG_START.matches(stream)) {
+ TAG_BODY.skipAll(stream);
+ while (getAttribute() != null) {/*ignore the attribute*/}
+ return true;
+ }
+ return false;
+ }
+
+ private boolean processSpecialTag() throws IOException {
+ if (SPECIAL_TAG_START.matches(stream)) {
+ TAG_BODY.skipAll(stream);
+ return TAG_END.advanceUntilMatches(stream);
+ }
+ return false;
+ }
+
+ private boolean processMeta() throws IOException {
+ if (META_START.matches(stream)) {
+ Set<String> attributeNames = new HashSet<>();
+ boolean gotPragma = false;
+ Boolean needPragma = null;
+ CharsetDetectionResult charset = new CharsetDetectionResult();
+ while (hasBytes()) {
+ Attribute attribute = getAttribute();
+ if (attribute == null) break;
+ if (attributeNames.contains(attribute.getName())) continue;
+ attributeNames.add(attribute.getName());
+ switch (attribute.getName()) {
+ case "http-equiv":
+ if (attribute.getValue().equals("content-type"))
+ gotPragma = true;
+ break;
+ case "content":
+ String charsetName = getEncodingFromMeta(attribute.getValue());
+ if (!charset.isFound() && charsetName != null) {
+ charset.find(charsetName);
+ needPragma = true;
+ }
+ break;
+ case "charset":
+ charset.find(attribute.getValue());
+ needPragma = false;
+ break;
+ default: // Ignore non-charset related attributes
+ }
+ }
+ if (needPragma != null && !(needPragma && !gotPragma)) {
+ detectedCharset = charset;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private String getEncodingFromMeta(String attributeValue) {
+ Matcher matcher = META_CHARSET_PATTERN.matcher(attributeValue);
+ if (!matcher.find()) return null;
+ return matcher.group(2);
+ }
+
+ private Attribute getAttribute() throws IOException {
+ SPACE_OR_SLASH.skipAll(stream);
+ if (TAG_END.peekMatches(stream)) return null;
+ StringBuilder name = new StringBuilder();
+ while (!EQUAL.peekMatches(stream) || name.length() == 0) {
+ if (TAG_END_OR_SLASH.peekMatches(stream)) {
+ break;
+ } else if (SPACE.peekMatches(stream)) {
+ SPACE.skipAll(stream);
+ break;
+ } else {
+ name.append(getLowerCaseChar());
+ }
+ }
+
+ if (!EQUAL.matches(stream)) return new Attribute(name.toString(), "");
+ SPACE.skipAll(stream);
+
+ StringBuilder value = new StringBuilder();
+ byte[] quoteMatched = QUOTE.match(stream);
+ if (quoteMatched != null) {
+ char quote = (char) quoteMatched[0];
+ int nextChar = -1;
+ while (nextChar != quote) {
+ if (nextChar != -1) value.append((char) nextChar);
+ nextChar = getLowerCaseChar();
+ }
+ } else {
+ while (!SPACE_OR_TAG_END.peekMatches(stream)) {
+ value.append(getLowerCaseChar());
+ }
+ }
+ return new Attribute(name.toString(), value.toString());
+ }
+
+ private char getLowerCaseChar() throws IOException {
+ int nextPoint = stream.read();
+ if (nextPoint == -1) throw new IOException();
+ if (nextPoint >= 'A' && nextPoint <= 'Z') nextPoint += 0x20; // lowercase
+ return (char) nextPoint;
+ }
+ }
+
+ static class Attribute {
+ String name;
+ String value;
+
+ public Attribute(String name, String value) {
+ this.name = name;
+ this.value = value;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public String getValue() {
+ return value;
+ }
+ }
+
+ /**
+ * A detection may either not find a charset, find an invalid charset, or find a valid charset
+ */
+ static class CharsetDetectionResult {
+ private boolean found = false;
+ private Charset charset = null;
+
+ public CharsetDetectionResult() { /* default result: not found */}
+
+ public boolean isFound() {
+ return found;
+ }
+
+ public void find(String charsetName) {
+ this.found = true;
+ charsetName = charsetName.trim();
+ if ("x-user-defined".equals(charsetName)) charsetName = "windows-1252";
+ this.charset = getCharsetByLabel(charsetName);
+ // The specification states: If charset is a UTF-16 encoding, then set charset to UTF-8.
+ if (UTF_16LE.equals(charset) || UTF_16BE.equals(charset)) charset = UTF_8;
+ }
+
+ public Charset getCharset() {
+ // the result may be null even if found is true, in the case there is a charset specified,
+ // but it is invalid
+ return charset;
+ }
+ }
+
+ static abstract class ByteMatcher {
+
+ abstract byte[] match(PushbackInputStream pushbackInputStream) throws IOException;
+
+ boolean matches(PushbackInputStream pushbackInputStream) throws IOException {
+ return this.match(pushbackInputStream) != null;
+ }
+
+ boolean advanceUntilMatches(PushbackInputStream pushbackInputStream) throws IOException {
+ while (!this.matches(pushbackInputStream)) {
+ int nextByte = pushbackInputStream.read();
+ if (nextByte == -1) return false;
+ }
+ return true;
+ }
+
+ void skipAll(PushbackInputStream pushbackInputStream) throws IOException {
+ while (matches(pushbackInputStream)) {/* just skip the byte */}
+ }
+
+ public boolean peekMatches(PushbackInputStream pushbackInputStream) throws IOException {
+ byte[] matched = this.match(pushbackInputStream);
+ if (matched != null) pushbackInputStream.unread(matched);
+ return matched != null;
+ }
+ }
+
+ static class SingleByteMatcher extends ByteMatcher {
+ private byte b;
+
+ public SingleByteMatcher(byte b) {
+ this.b = b;
+ }
+
+ public static ByteMatcher[] matchers(String s) {
+ return matchers(s.chars());
+ }
+
+ public static ByteMatcher[] matchers(int... bytes) {
+ return matchers(IntStream.of(bytes));
+ }
+
+ public static ByteMatcher[] matchers(IntStream byteStream) {
+ return byteStream
+ .mapToObj(i -> new SingleByteMatcher((byte) i))
+ .toArray(ByteMatcher[]::new);
+ }
+
+ @Override
+ byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+ int read = pushbackInputStream.read();
+ if ((byte) read == b) return new byte[]{b};
+ if (read != -1) pushbackInputStream.unread(read);
+ return null;
+ }
+ }
+
+ static class SequenceMatcher extends ByteMatcher {
+ private ByteMatcher[] matchers;
+
+ public SequenceMatcher(ByteMatcher... matchers) {
+ this.matchers = matchers;
+ }
+
+ public SequenceMatcher(String s) {
+ this(matchers(s));
+ }
+
+ public static SequenceMatcher caseInsensitive(String s) {
+ ByteMatcher[] lowerMatchers = matchers(s.toLowerCase(Locale.US));
+ ByteMatcher[] upperMatchers = matchers(s.toUpperCase(Locale.US));
+ OrMatcher[] matchers = IntStream
+ .range(0, Math.min(lowerMatchers.length, upperMatchers.length))
+ .mapToObj(i -> new OrMatcher(lowerMatchers[i], upperMatchers[i]))
+ .toArray(OrMatcher[]::new);
+ return new SequenceMatcher(matchers);
+ }
+
+ @Override
+ byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+ ByteArrayOutputStream allMatched = new ByteArrayOutputStream();
+ for (ByteMatcher m : matchers) {
+ byte[] matched = m.match(pushbackInputStream);
+ if (matched == null) {
+ pushbackInputStream.unread(allMatched.toByteArray());
+ return null;
+ } else {
+ allMatched.write(matched);
+ }
+ }
+ return allMatched.toByteArray();
+ }
+ }
+
+ static class OrMatcher extends ByteMatcher {
+ private ByteMatcher[] matchers;
+
+ public OrMatcher(ByteMatcher... matchers) {
+ this.matchers = matchers;
+ }
+
+ @Override
+ byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+ for (ByteMatcher m : matchers) {
+ byte[] matched = m.match(pushbackInputStream);
+ if (matched != null) return matched;
+ }
+ return null;
+ }
+ }
+
+ static class NegativeMatcher extends ByteMatcher {
+ private ByteMatcher matcher;
+
+ public NegativeMatcher(ByteMatcher matcher) {
+ this.matcher = matcher;
+ }
+
+ @Override
+ byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+ byte[] matched = matcher.match(pushbackInputStream);
+ if (matched == null) {
+ int read = pushbackInputStream.read();
+ if (read == -1) return null;
+ return new byte[]{(byte) read};
+ } else {
+ pushbackInputStream.unread(matched);
+ return null;
+ }
+ }
+ }
+
+ static class RangeMatcher extends ByteMatcher {
+ private byte low;
+ private byte high;
+
+ public RangeMatcher(byte low, byte high) {
+ this.low = low;
+ this.high = high;
+ }
+
+
+ @Override
+ byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+ int read = pushbackInputStream.read();
+ if (read >= low && read <= high) return new byte[]{(byte) read};
+ if (read != -1) pushbackInputStream.unread(read);
+ return null;
+ }
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index c8aa65e..86ac3cf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -174,10 +174,13 @@
private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
ZipEntry entry = zipStream.getNextEntry();
- while (entry != null) {
+ if (entry == null) {
+ throw new IOException("No entries found in ZipInputStream");
+ }
+ do {
handleZipEntry(entry, zipStream, metadata, context, handler);
entry = zipStream.getNextEntry();
- }
+ } while (entry != null);
}
private void handleZipFile(ZipFile zipFile, Metadata metadata,
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv b/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
new file mode 100644
index 0000000..92ddecb
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
@@ -0,0 +1,235 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+unicode-1-1-utf-8	UTF-8
+#
+# label encoding fallback
+utf-8 UTF-8
+utf8 UTF-8
+866 IBM866
+cp866 IBM866
+csibm866 IBM866
+ibm866 IBM866
+csisolatin2 ISO-8859-2
+iso-8859-2 ISO-8859-2
+iso-ir-101 ISO-8859-2
+iso8859-2 ISO-8859-2
+iso88592 ISO-8859-2
+iso_8859-2 ISO-8859-2
+iso_8859-2:1987 ISO-8859-2
+l2 ISO-8859-2
+latin2 ISO-8859-2
+csisolatin3 ISO-8859-3
+iso-8859-3 ISO-8859-3
+iso-ir-109 ISO-8859-3
+iso8859-3 ISO-8859-3
+iso88593 ISO-8859-3
+iso_8859-3 ISO-8859-3
+iso_8859-3:1988 ISO-8859-3
+l3 ISO-8859-3
+latin3 ISO-8859-3
+csisolatin4 ISO-8859-4
+iso-8859-4 ISO-8859-4
+iso-ir-110 ISO-8859-4
+iso8859-4 ISO-8859-4
+iso88594 ISO-8859-4
+iso_8859-4 ISO-8859-4
+iso_8859-4:1988 ISO-8859-4
+l4 ISO-8859-4
+latin4 ISO-8859-4
+csisolatincyrillic ISO-8859-5
+cyrillic ISO-8859-5
+iso-8859-5 ISO-8859-5
+iso-ir-144 ISO-8859-5
+iso8859-5 ISO-8859-5
+iso88595 ISO-8859-5
+iso_8859-5 ISO-8859-5
+iso_8859-5:1988 ISO-8859-5
+arabic ISO-8859-6
+asmo-708 ISO-8859-6
+csiso88596e ISO-8859-6
+csiso88596i ISO-8859-6
+csisolatinarabic ISO-8859-6
+ecma-114 ISO-8859-6
+iso-8859-6 ISO-8859-6
+iso-8859-6-e ISO-8859-6
+iso-8859-6-i ISO-8859-6
+iso-ir-127 ISO-8859-6
+iso8859-6 ISO-8859-6
+iso88596 ISO-8859-6
+iso_8859-6 ISO-8859-6
+iso_8859-6:1987 ISO-8859-6
+csisolatingreek ISO-8859-7
+ecma-118 ISO-8859-7
+elot_928 ISO-8859-7
+greek ISO-8859-7
+greek8 ISO-8859-7
+iso-8859-7 ISO-8859-7
+iso-ir-126 ISO-8859-7
+iso8859-7 ISO-8859-7
+iso88597 ISO-8859-7
+iso_8859-7 ISO-8859-7
+iso_8859-7:1987 ISO-8859-7
+sun_eu_greek ISO-8859-7
+csiso88598e ISO-8859-8
+csisolatinhebrew ISO-8859-8
+hebrew ISO-8859-8
+iso-8859-8 ISO-8859-8
+iso-8859-8-e ISO-8859-8
+iso-ir-138 ISO-8859-8
+iso8859-8 ISO-8859-8
+iso88598 ISO-8859-8
+iso_8859-8 ISO-8859-8
+iso_8859-8:1988 ISO-8859-8
+visual ISO-8859-8
+csiso88598i ISO-8859-8-I ISO-8859-8
+iso-8859-8-i ISO-8859-8-I ISO-8859-8
+logical ISO-8859-8-I ISO-8859-8
+csisolatin6 ISO-8859-10 ISO-8859-4
+iso-8859-10 ISO-8859-10 ISO-8859-4
+iso-ir-157 ISO-8859-10 ISO-8859-4
+iso8859-10 ISO-8859-10 ISO-8859-4
+iso885910 ISO-8859-10 ISO-8859-4
+l6 ISO-8859-10 ISO-8859-4
+latin6 ISO-8859-10 ISO-8859-4
+iso-8859-13 ISO-8859-13
+iso8859-13 ISO-8859-13
+iso885913 ISO-8859-13
+iso-8859-14 ISO-8859-14 ISO-8859-1
+iso8859-14 ISO-8859-14 ISO-8859-1
+iso885914 ISO-8859-14 ISO-8859-1
+csisolatin9 ISO-8859-15
+iso-8859-15 ISO-8859-15
+iso8859-15 ISO-8859-15
+iso885915 ISO-8859-15
+iso_8859-15 ISO-8859-15
+l9 ISO-8859-15
+iso-8859-16 ISO-8859-16 ISO-8859-1
+cskoi8r KOI8-R
+koi KOI8-R
+koi8 KOI8-R
+koi8-r KOI8-R
+koi8_r KOI8-R
+koi8-ru KOI8-U
+koi8-u KOI8-U
+csmacintosh x-MacRoman
+mac x-MacRoman
+macintosh x-MacRoman
+x-mac-roman x-MacRoman
+dos-874 windows-874
+iso-8859-11 windows-874
+iso8859-11 windows-874
+iso885911 windows-874
+tis-620 windows-874
+windows-874 windows-874
+cp1250 windows-1250
+windows-1250 windows-1250
+x-cp1250 windows-1250
+cp1251 windows-1251
+windows-1251 windows-1251
+x-cp1251 windows-1251
+ansi_x3.4-1968 windows-1252
+ascii windows-1252
+cp1252 windows-1252
+cp819 windows-1252
+csisolatin1 windows-1252
+ibm819 windows-1252
+iso-8859-1 windows-1252
+iso-ir-100 windows-1252
+iso8859-1 windows-1252
+iso88591 windows-1252
+iso_8859-1 windows-1252
+iso_8859-1:1987 windows-1252
+l1 windows-1252
+latin1 windows-1252
+us-ascii windows-1252
+windows-1252 windows-1252
+x-cp1252 windows-1252
+cp1253 windows-1253
+windows-1253 windows-1253
+x-cp1253 windows-1253
+cp1254 windows-1254
+csisolatin5 windows-1254
+iso-8859-9 windows-1254
+iso-ir-148 windows-1254
+iso8859-9 windows-1254
+iso88599 windows-1254
+iso_8859-9 windows-1254
+iso_8859-9:1989 windows-1254
+l5 windows-1254
+latin5 windows-1254
+windows-1254 windows-1254
+x-cp1254 windows-1254
+cp1255 windows-1255
+windows-1255 windows-1255
+x-cp1255 windows-1255
+cp1256 windows-1256
+windows-1256 windows-1256
+x-cp1256 windows-1256
+cp1257 windows-1257
+windows-1257 windows-1257
+x-cp1257 windows-1257
+cp1258 windows-1258
+windows-1258 windows-1258
+x-cp1258 windows-1258
+x-mac-cyrillic x-MacCyrillic
+x-mac-ukrainian x-MacCyrillic
+chinese GBK
+csgb2312 GBK
+csiso58gb231280 GBK
+gb2312 GBK
+gb_2312 GBK
+gb_2312-80 GBK
+gbk GBK
+iso-ir-58 GBK
+x-gbk GBK
+gb18030 gb18030
+big5 Big5
+big5-hkscs Big5
+cn-big5 Big5
+csbig5 Big5
+x-x-big5 Big5
+cseucpkdfmtjapanese EUC-JP
+euc-jp EUC-JP
+x-euc-jp EUC-JP
+csiso2022jp ISO-2022-JP
+iso-2022-jp ISO-2022-JP
+csshiftjis Shift_JIS
+ms932 Shift_JIS
+ms_kanji Shift_JIS
+shift-jis Shift_JIS
+shift_jis Shift_JIS
+sjis Shift_JIS
+windows-31j Shift_JIS
+x-sjis Shift_JIS
+cseuckr EUC-KR
+csksc56011987 EUC-KR
+euc-kr EUC-KR
+iso-ir-149 EUC-KR
+korean EUC-KR
+ks_c_5601-1987 EUC-KR
+ks_c_5601-1989 EUC-KR
+ksc5601 EUC-KR
+ksc_5601 EUC-KR
+windows-949 EUC-KR
+csiso2022kr replacement
+hz-gb-2312 replacement
+iso-2022-cn replacement
+iso-2022-cn-ext replacement
+iso-2022-kr replacement
+replacement replacement
+utf-16be UTF-16BE
+utf-16 UTF-16LE
+utf-16le UTF-16LE
+x-user-defined x-user-defined
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
new file mode 100644
index 0000000..931f5e1
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.*;
+
+public class HtmlEncodingDetectorTest {
+
+ @Test
+ public void basic() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+    @Ignore("until we can prove this harms detection")
+ public void utf16() throws IOException {
+ // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.'
+ assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
+ }
+
+ @Test
+ public void xUserDefined() throws IOException {
+ // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.'
+ assertWindows1252("<meta charset='x-user-defined'>");
+ }
+
+ @Test
+ public void withSlash() throws IOException {
+ assertWindows1252("<meta/charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ @Ignore("until we do a full parse")
+ public void insideTag() throws IOException {
+ assertWindows1252("<meta name='description'" +
+ "content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ @Ignore("until we do a full parse")
+ public void missingAttribute() throws IOException {
+ assertWindows1252(
+ "<meta content='charset=UTF-8'>" + // missing http-equiv attribute
+ "<meta charset='WINDOWS-1252'>" // valid declaration
+ );
+ }
+
+ @Test
+ @Ignore("until we do a full parse")
+ public void insideSpecialTag() throws IOException {
+ // Content inside <?, <!, and </ should be ignored
+ for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII))
+ assertWindows1252(
+ "<" + (char) b + // start comment
+ "<meta charset='UTF-8'>" + // inside special tag
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ @Ignore("until we can prove this harms detection")
+ public void spaceBeforeTag() throws IOException {
+ assertWindows1252(
+ "< meta charset='UTF-8'>" + // invalid charset declaration
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void invalidAttribute() throws IOException {
+ assertWindows1252(
+ "<meta " +
+ "badcharset='UTF-8' " + // invalid charset declaration
+ "charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ @Ignore("until we can prove this harms detection")
+ public void unmatchedQuote() throws IOException {
+ assertWindows1252(
+ "<meta http-equiv='content-type' content='charset=\"UTF-8'>" + // invalid charset declaration
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+
+ @Test
+ @Ignore("until we do a full parse")
+ public void withCompactComment() throws IOException {
+ // <!--> is a valid comment
+ assertWindows1252(
+ "<!--" + // start comment
+ "<meta charset='UTF-8'>" + // inside comment
+ "-->" + // end comment
+ "<!-->" + // compact comment
+ "<meta charset='WINDOWS-1252'>" // outside comment, charset declaration
+ );
+ }
+
+ private void assertWindows1252(String html) throws IOException {
+ assertCharset(html, Charset.forName("WINDOWS-1252"));
+ }
+
+ private void assertCharset(String html, Charset charset) throws IOException {
+ assertEquals(html + " should be detected as " + charset,
+ charset, detectCharset(html));
+ }
+
+ private Charset detectCharset(String test) throws IOException {
+ Metadata metadata = new Metadata();
+ InputStream inStream = new ByteArrayInputStream(test.getBytes(StandardCharsets.UTF_8));
+ return new HtmlEncodingDetector().detect(inStream, metadata);
+ }
+}
\ No newline at end of file
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
new file mode 100644
index 0000000..1c0da8d
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
@@ -0,0 +1,300 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.assertEquals;
+
+public class StrictHtmlEncodingDetectorTest {
+ private Metadata metadata = new Metadata();
+
+ @Before
+ public void setUp() {
+ this.metadata = new Metadata();
+ }
+
+ @Test
+ public void basic() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void duplicateMeta() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'>" +
+ "<meta charset='UTF-8'>");
+ }
+
+ @Test
+ public void httpEquiv() throws IOException {
+ assertWindows1252("<meta " +
+ "http-equiv='content-type' " +
+ "content='text/html; charset=\"WINDOWS-1252\"'>"); // quotes around the charset are allowed
+ assertWindows1252("<meta " +
+ "content=' charset = WINDOWS-1252' " + // The charset may be anywhere in the content attribute
+ "http-equiv='content-type' >");
+ }
+
+ @Test
+ public void httpEquivDuplicateCharset() throws IOException {
+ assertWindows1252("<meta " +
+ "http-equiv='content-type' " +
+ "content='charset=WINDOWS-1252;" + // The detection should stop after the semicolon
+ "charset=UTF-8'>");
+ }
+
+ @Test
+ public void htmlFragment() throws IOException {
+ assertWindows1252("<!doctype html><html class=nojs><head><meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+    public void veryBadHtml() throws IOException {
+ // check that the parser is not confused by garbage before the declaration
+ assertWindows1252("<< l \" == / '=x\n >" +
+ "<!--> " +
+ "< <x'/ <=> " +
+ "<meta/>" +
+ "<a x/>" +
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void incompleteMeta() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'"); // missing '>' at the end
+ }
+
+ @Test
+ public void charsetWithWhiteSpaces() throws IOException {
+ assertWindows1252("<meta charset=' \t\n WINDOWS-1252 \t\n'>");
+ }
+
+ @Test
+ public void mixedCase() throws IOException {
+ assertWindows1252("<mEtA chArsEt='WInDOWs-1252'>");
+ }
+
+ @Test
+ public void utf16() throws IOException {
+ // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.'
+ assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
+ }
+
+ @Test
+ public void xUserDefined() throws IOException {
+ // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.'
+ assertWindows1252("<meta charset='x-user-defined'>");
+ }
+
+ @Test
+ public void iso88591() throws IOException {
+ // In the spec, iso-8859-1 is an alias for WINDOWS-1252
+ assertWindows1252("<meta charset='iso-8859-1'>");
+ }
+
+ @Test
+ public void macintoshEncoding() throws IOException {
+ // The mac roman encoding exists in java, but under the name x-MacRoman
+ assertCharset("<meta charset='macintosh'>", Charset.forName("x-MacRoman"));
+ }
+
+ @Test
+ public void bom() throws IOException {
+ // A BOM should have precedence over the meta
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
+ }
+
+ @Test
+ public void withSlash() throws IOException {
+ assertWindows1252("<meta/charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void insideDescription() throws IOException {
+ assertWindows1252("<meta name='description'" +
+ "content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void insideTag() throws IOException {
+ assertWindows1252("<tag " +
+ "attribute=\"<meta charset='UTF-8'>\" " + // inside attribute
+ "<meta charset='UTF-8' " + // still inside tag
+ "/>" + // tag end
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void missingAttribute() throws IOException {
+ assertWindows1252(
+ "<meta content='charset=UTF-8'>" + // missing http-equiv attribute
+ "<meta charset='WINDOWS-1252'>" // valid declaration
+ );
+ }
+
+ @Test
+ public void insideSpecialTag() throws IOException {
+ // Content inside <?, <!, and </ should be ignored
+ for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII))
+ assertWindows1252(
+ "<" + (char) b + // start comment
+ "<meta charset='UTF-8'>" + // inside special tag
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void spaceBeforeTag() throws IOException {
+ assertWindows1252(
+ "< meta charset='UTF-8'>" + // invalid charset declaration
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void invalidAttribute() throws IOException {
+ assertWindows1252(
+ "<meta " +
+ "badcharset='UTF-8' " + // invalid charset declaration
+ "charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void unmatchedQuote() throws IOException {
+ assertWindows1252(
+ "<meta http-equiv='content-type' content='charset=\"UTF-8'>" + // invalid charset declaration
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void realWorld() throws IOException {
+ assertWindows1252("<!DOCTYPE html>\n" +
+ "<html lang=\"fr\">\n" +
+ "<head>\n" +
+ "<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':\n" +
+ "\t\t\tnew Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],\n" +
+ "\t\t\tj=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=\n" +
+ "\t\t\t'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);\n" +
+ "\t\t\t})(window,document,'script','dataLayer','GTM-PNX8H8X');</script>\n" +
+ "<title>Horaires Transilien 2018 - Lignes A B C D E H J K L N P R U</title>\n" +
+ "<meta name=\"description\" content=\"Consultez les horaires du Transilien en temps réel. Lignes A et B du RER. Lignes C D E H J K L N P R U du Transilien.\">\n" +
+ "<meta name=\"keywords\" content=\"horaires transilien\">\n" +
+ "<meta charset=\"windows-1252\">\n" +
+ "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
+ "<meta name=\"robots\" content=\"follow, index\">\n" +
+ "<base hr");
+ }
+
+ @Test
+ public void withCompactComment() throws IOException {
+ // <!--> is a valid comment
+ assertWindows1252(
+ "<!--" + // start comment
+ "<meta charset='UTF-8'>" + // inside comment
+ "-->" + // end comment
+ "<!-->" + // compact comment
+ "<meta charset='WINDOWS-1252'>" // outside comment, charset declaration
+ );
+ }
+
+ @Test
+ public void withUserProvidedCharset() throws IOException {
+ metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
+ // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
+ assertWindows1252("");
+ assertWindows1252("<meta charset='UTF-8'>");
+ assertWindows1252("<meta http-equiv='content-type' content='charset=utf-8'>");
+ // if a BOM is present, it has precedence over transport layer information
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
+ }
+
+ @Test
+ public void throwResistance() throws IOException {
+ // The preprocessing should return right after having found the charset
+ // So if an error is thrown in the stream AFTER the declaration,
+ // it shouldn't see it
+ assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'>"));
+ assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'><some other tag"));
+
+ // But if an error is thrown before the end of the meta tag, it should see it
+ // and return unsuccessfully
+ assertCharset(throwAfter("<meta charset='WINDOWS-1252'"), null);
+
+ // If there is no meta, but an error is thrown, the detector simply returns
+ // unsuccessfully (it should not throw runtime errors)
+ assertCharset(throwAfter("<"), null);
+ assertCharset(throwAfter("<!"), null);
+ assertCharset(throwAfter("<!doctype"), null);
+ assertCharset(throwAfter("<!doctype html><html"), null);
+ assertCharset(throwAfter("<!doctype html><html attr"), null);
+ assertCharset(throwAfter("<!doctype html><html attr="), null);
+ assertCharset(throwAfter("<!doctype html><html attr=x"), null);
+ assertCharset(throwAfter("<!doctype html><html attr='x"), null);
+ }
+
+ private void assertWindows1252(String html) throws IOException {
+ assertCharset(html, Charset.forName("WINDOWS-1252"));
+ }
+
+ private void assertWindows1252(InputStream inStream) throws IOException {
+ assertCharset(inStream, Charset.forName("WINDOWS-1252"));
+ }
+
+ private void assertCharset(String html, Charset charset) throws IOException {
+ final Charset contentsCharset = (charset == null) ? StandardCharsets.UTF_8 : charset;
+ InputStream inStream = new ByteArrayInputStream(html.getBytes(contentsCharset));
+ final Charset detected = detectCharset(inStream);
+ assertEquals(html + " should be detected as " + charset, charset, detected);
+ }
+
+ private void assertCharset(InputStream inStream, Charset charset) throws IOException {
+ final Charset detected = detectCharset(inStream);
+ assertEquals(charset, detected);
+ }
+
+ private Charset detectCharset(InputStream inStream) throws IOException {
+ return new StrictHtmlEncodingDetector().detect(inStream, metadata);
+ }
+
+ private InputStream throwAfter(String html) {
+ byte[] contents = html.getBytes(StandardCharsets.UTF_8);
+ InputStream contentsInStream = new ByteArrayInputStream(contents);
+ InputStream errorThrowing = new InputStream() {
+ @Override
+ public int read() throws IOException {
+ throw new IOException("test exception");
+ }
+ };
+ return new SequenceInputStream(contentsInStream, errorThrowing);
+ }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 7b93271..3b8048c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -19,6 +19,7 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
@@ -367,6 +368,28 @@
assertEquals(3, metadataList.size());
}
+ @Test(expected = IOException.class)
+ public void testInvalidFromStream() throws Exception {
+ try (InputStream is = this.getClass().getResource(
+ "/test-documents/testODTnotaZipFile.odt").openStream()) {
+ OpenDocumentParser parser = new OpenDocumentParser();
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(is, handler, metadata, new ParseContext());
+ }
+ }
+
+ @Test(expected = IOException.class)
+ public void testInvalidFromFile() throws Exception {
+ try (TikaInputStream tis = TikaInputStream.get(this.getClass().getResource(
+ "/test-documents/testODTnotaZipFile.odt"))) {
+ OpenDocumentParser parser = new OpenDocumentParser();
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(tis, handler, metadata, new ParseContext());
+ }
+ }
+
private ParseContext getNonRecursingParseContext() {
ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, new EmptyParser());
diff --git a/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.123 b/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.123
new file mode 100644
index 0000000..60c2ec5
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.123
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.wk4 b/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.wk4
new file mode 100644
index 0000000..3283716
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.wk4
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testLotus123.wk1 b/tika-parsers/src/test/resources/test-documents/testLotus123.wk1
new file mode 100644
index 0000000..34a8a3e
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testLotus123.wk1
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testLotus123.wk3 b/tika-parsers/src/test/resources/test-documents/testLotus123.wk3
new file mode 100644
index 0000000..bda0c8c
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testLotus123.wk3
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testLotus123.wks b/tika-parsers/src/test/resources/test-documents/testLotus123.wks
new file mode 100644
index 0000000..2324b24
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testLotus123.wks
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testODTnotaZipFile.odt b/tika-parsers/src/test/resources/test-documents/testODTnotaZipFile.odt
new file mode 100644
index 0000000..9c1d376
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testODTnotaZipFile.odt
@@ -0,0 +1 @@
+This is not a zip file!
diff --git a/tika-parsers/src/test/resources/test-documents/testQuattro.wb1 b/tika-parsers/src/test/resources/test-documents/testQuattro.wb1
new file mode 100644
index 0000000..16db0bd
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testQuattro.wb1
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testQuattro.wb2 b/tika-parsers/src/test/resources/test-documents/testQuattro.wb2
new file mode 100644
index 0000000..9ed7aa4
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testQuattro.wb2
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testQuattro.wq1 b/tika-parsers/src/test/resources/test-documents/testQuattro.wq1
new file mode 100644
index 0000000..310d838
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testQuattro.wq1
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testQuattro.wq2 b/tika-parsers/src/test/resources/test-documents/testQuattro.wq2
new file mode 100644
index 0000000..4a73104
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testQuattro.wq2
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testWordPerfect_42.doc b/tika-parsers/src/test/resources/test-documents/testWordPerfect_42.doc
new file mode 100644
index 0000000..e6cf1e8
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testWordPerfect_42.doc
Binary files differ
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index be91b93..1b12590 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -155,17 +155,26 @@
public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders,
Parser embeddedParser) {
- TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
- PDFParserConfig pdfParserConfig = new PDFParserConfig();
+ //lazily initialize configs
+ //if a header is submitted, any params set in --tika-config tika-config.xml
+ //upon server startup will be ignored.
+ TesseractOCRConfig ocrConfig = null;
+ PDFParserConfig pdfParserConfig = null;
for (String key : httpHeaders.keySet()) {
if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
+ ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
} else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
+ pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
}
}
- parseContext.set(TesseractOCRConfig.class, ocrConfig);
- parseContext.set(PDFParserConfig.class, pdfParserConfig);
+ if (ocrConfig != null) {
+ parseContext.set(TesseractOCRConfig.class, ocrConfig);
+ }
+ if (pdfParserConfig != null) {
+ parseContext.set(PDFParserConfig.class, pdfParserConfig);
+ }
if (embeddedParser != null) {
parseContext.set(Parser.class, embeddedParser);
}
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 7b35fec..f851e97 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -80,8 +80,8 @@
}
@Before
- public void setUp() {
- this.tika = TikaConfig.getDefaultConfig();
+ public void setUp() throws Exception {
+ this.tika = new TikaConfig(getClass().getResourceAsStream("tika-config-for-server-tests.xml"));
TikaResource.init(tika,
new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
new DefaultInputStreamFactory());
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
index e4e60a5..eadacfa 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
@@ -96,12 +96,12 @@
.get();
String text = getStringFromInputStream((InputStream) response.getEntity());
- assertContains("<h2>DefaultParser</h2>", text);
+ assertContains("<h3>DefaultParser</h3>", text);
assertContains("Composite", text);
- assertContains("<h3>OpusParser", text);
- assertContains("<h3>PackageParser", text);
- assertContains("<h3>OOXMLParser", text);
+ assertContains("<h4>OpusParser", text);
+ assertContains("<h4>PackageParser", text);
+ assertContains("<h4>OOXMLParser", text);
assertContains(OpusParser.class.getName(), text);
assertContains(PackageParser.class.getName(), text);
@@ -138,46 +138,51 @@
assertEquals(true, json.containsKey("name"));
assertEquals(true, json.containsKey("composite"));
assertEquals(true, json.containsKey("children"));
- assertEquals("org.apache.tika.parser.DefaultParser", json.get("name"));
+ assertEquals("org.apache.tika.parser.CompositeParser", json.get("name"));
assertEquals(Boolean.TRUE, json.get("composite"));
// At least 20 child parsers which aren't composite, except for CompositeExternalParser
Object[] children = (Object[]) (Object) json.get("children");
- assertTrue(children.length >= 20);
- boolean hasOpus = false, hasOOXML = false, hasPDF = false, hasZip = false;
+ assertTrue(children.length >= 2);
+ boolean hasOpus = false, hasOOXML = false, hasZip = false;
int nonComposite = 0;
int composite = 0;
for (Object o : children) {
- Map<String, Object> d = (Map<String, Object>) o;
- assertEquals(true, d.containsKey("name"));
- assertEquals(true, d.containsKey("composite"));
+ Map<String, Object> child = (Map<String, Object>) o;
+ assertEquals(true, child.containsKey("name"));
+ assertEquals(true, child.containsKey("composite"));
- if (d.get("composite") == Boolean.FALSE)
- nonComposite++;
- else
- composite++;
-
- // Will only have mime types if requested
- if (d.get("composite") == Boolean.FALSE)
- assertEquals(details, d.containsKey("supportedTypes"));
+ Object[] grandChildrenArr = (Object[]) child.get("children");
+ if (grandChildrenArr == null) {
+ continue;
+ }
+ assertTrue(grandChildrenArr.length > 50);
+ for (Object grandChildO : grandChildrenArr) {
+ Map<String, Object> grandChildren = (Map<String, Object>) grandChildO;
- String name = (String) d.get("name");
- if (OpusParser.class.getName().equals(name)) {
- hasOpus = true;
- }
- if (OOXMLParser.class.getName().equals(name)) {
- hasOOXML = true;
- }
- if (PDFParser.class.getName().equals(name)) {
- hasPDF = true;
- }
- if (PackageParser.class.getName().equals(name)) {
- hasZip = true;
+ if (grandChildren.get("composite") == Boolean.FALSE)
+ nonComposite++;
+ else
+ composite++;
+
+ // Will only have mime types if requested
+ if (grandChildren.get("composite") == Boolean.FALSE)
+ assertEquals(details, grandChildren.containsKey("supportedTypes"));
+
+ String name = (String) grandChildren.get("name");
+ if (OpusParser.class.getName().equals(name)) {
+ hasOpus = true;
+ }
+ if (OOXMLParser.class.getName().equals(name)) {
+ hasOOXML = true;
+ }
+ if (PackageParser.class.getName().equals(name)) {
+ hasZip = true;
+ }
}
}
assertEquals(true, hasOpus);
assertEquals(true, hasOOXML);
- assertEquals(true, hasPDF);
assertEquals(true, hasZip);
assertTrue(nonComposite > 20);
assertTrue(composite == 0 || composite == 1); // if CompositeExternalParser is available it will be 1
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 295ce74..b519170 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -258,6 +258,44 @@
assertEquals(500, response.getStatus());
}
+ //TIKA-2669
+ @Test
+ public void testPDFConfig() throws Exception {
+
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+ assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+ responseMsg);
+
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX+"sortByPosition", "false")
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+ assertEquals("Left column line 1 Left column line 2 Right column line 1 Right column line 2", responseMsg);
+
+ //make sure that default reverts to initial config option
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+ assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+ responseMsg);
+
+ }
+
+
@Test
public void testExtractTextAcceptPlainText() throws Exception {
//TIKA-2384
diff --git a/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml b/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml
new file mode 100644
index 0000000..8867655
--- /dev/null
+++ b/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="sortByPosition" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
\ No newline at end of file
diff --git a/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf b/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf
new file mode 100644
index 0000000..f24e9e7
--- /dev/null
+++ b/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf
Binary files differ