Merge branch 'TIKA-2672' of https://github.com/ThejanW/tika into TIKA-2672
diff --git a/NOTICE.txt b/NOTICE.txt
index 7b50eb1..2bae78e 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -15,3 +15,7 @@
OpenCSV: Copyright 2005 Bytecode Pty Ltd. Licensed under the Apache License, Version 2.0
IPTC Photo Metadata descriptions Copyright 2010 International Press Telecommunications Council.
+
+Tika-mimetypes.xml includes mimetype definitions that were adapted from the PRONOM Technical Registry
+by The National Archives (http://www.nationalarchives.gov.uk/PRONOM/Default.aspx). PRONOM is published
+under the Open Government License 3.0 (http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/)
diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index 785acc7..96f922f 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -502,6 +502,8 @@
*/
private Object openContainer;
+ private int consecutiveEOFs = 0;
+
/**
* Creates a TikaInputStream instance. This private constructor is used
* by the static factory methods based on the available information.
@@ -718,6 +720,7 @@
super.reset();
position = mark;
mark = -1;
+ consecutiveEOFs = 0;
}
@Override
@@ -735,9 +738,15 @@
}
@Override
- protected void afterRead(int n) {
+ protected void afterRead(int n) throws IOException {
if (n != -1) {
position += n;
+ } else {
+ consecutiveEOFs++;
+ if (consecutiveEOFs > 1000) {
+ throw new IOException("Read too many -1 (EOFs); there could be an infinite loop. " +
+ "If you think your file is not corrupt, please open an issue on Tika's JIRA");
+ }
}
}
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
index 813eda0..65938be 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MediaTypeRegistry.java
@@ -21,6 +21,7 @@
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
+import java.util.concurrent.ConcurrentHashMap;
/**
* Registry of known Internet media types.
@@ -46,7 +47,7 @@
* as a mapping from the alias to the corresponding canonical type.
*/
private final Map<MediaType, MediaType> registry =
- new HashMap<MediaType, MediaType>();
+ new ConcurrentHashMap<>();
/**
* Known type inheritance relationships. The mapping is from a media type
@@ -74,7 +75,7 @@
* @return known aliases
*/
public SortedSet<MediaType> getAliases(MediaType type) {
- SortedSet<MediaType> aliases = new TreeSet<MediaType>();
+ SortedSet<MediaType> aliases = new TreeSet<>();
for (Map.Entry<MediaType, MediaType> entry : registry.entrySet()) {
if (entry.getValue().equals(type) && !entry.getKey().equals(type)) {
aliases.add(entry.getKey());
diff --git a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
index 4acfe01..0009ac9 100644
--- a/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
+++ b/tika-core/src/main/java/org/apache/tika/mime/MimeTypes.java
@@ -30,6 +30,7 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
import javax.xml.namespace.QName;
@@ -103,8 +104,7 @@
private final MediaTypeRegistry registry = new MediaTypeRegistry();
/** All the registered MimeTypes indexed on their canonical names */
- private final Map<MediaType, MimeType> types =
- new HashMap<MediaType, MimeType>();
+ private final Map<MediaType, MimeType> types = new HashMap<>();
/** The patterns matcher */
private Patterns patterns = new Patterns(registry);
@@ -426,7 +426,6 @@
*
* @return the minimum length of data to provide.
* @see #getMimeType(byte[])
- * @see #getMimeType(String, byte[])
*/
public int getMinLength() {
// This needs to be reasonably large to be able to correctly detect
diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 104cd2c..e12dd02 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -1561,9 +1561,69 @@
<mime-type type="application/vnd.llamagraphics.life-balance.exchange+xml">
<glob pattern="*.lbe"/>
</mime-type>
+
<mime-type type="application/vnd.lotus-1-2-3">
+ <alias type="application/x-123"/>
+ <_comment>Lotus 1-2-3</_comment>
+ <!-- <glob pattern="*.wks"/> - conflicts with application/vnd.ms-works -->
+ <glob pattern="*.wk1"/>
+ <glob pattern="*.wk2"/>
+ <glob pattern="*.wk3"/>
+ <glob pattern="*.wk4"/>
<glob pattern="*.123"/>
</mime-type>
+
+ <mime-type type="application/vnd.lotus-1-2-3;version=1">
+ <sub-class-of type="application/vnd.lotus-1-2-3"/>
+ <_comment>Lotus 1-2-3, version 1</_comment>
+ <magic priority="50">
+ <match value="0x000002000404" type="string" offset="0">
+ <!-- <glob pattern="*.wks"/> - conflicts with application/vnd.ms-works -->
+ </match>
+ </magic>
+ </mime-type>
+
+ <mime-type type="application/vnd.lotus-1-2-3;version=2">
+ <sub-class-of type="application/vnd.lotus-1-2-3"/>
+ <_comment>Lotus 1-2-3, version 2</_comment>
+ <magic priority="50">
+ <match value="0x00000200060406000800" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wk1"/>
+ <glob pattern="*.wk2"/>
+ </mime-type>
+
+ <mime-type type="application/vnd.lotus-1-2-3;version=3">
+ <sub-class-of type="application/vnd.lotus-1-2-3"/>
+ <_comment>Lotus 1-2-3, version 3</_comment>
+ <magic priority="50">
+ <match value="0x00001A0000100400" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wk3"/>
+ </mime-type>
+
+ <mime-type type="application/vnd.lotus-1-2-3;version=4">
+ <sub-class-of type="application/vnd.lotus-1-2-3"/>
+ <_comment>Lotus 1-2-3, version 4-5</_comment>
+ <magic priority="50">
+ <match value="0x00001A0002100400" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wk4"/>
+ </mime-type>
+
+ <mime-type type="application/vnd.lotus-1-2-3;version=97+9.x">
+ <sub-class-of type="application/vnd.lotus-1-2-3"/>
+ <_comment>Lotus 1-2-3, version 97/9.x</_comment>
+ <magic priority="50">
+ <match value="0x00001A0003100400" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.123"/>
+ </mime-type>
+
<mime-type type="application/vnd.lotus-approach">
<glob pattern="*.apr"/>
</mime-type>
@@ -2728,6 +2788,14 @@
</mime-type>
<!-- TODO: figure out how to identify earlier versions -->
+ <mime-type type="application/vnd.wordperfect;version=4.2">
+ <sub-class-of type="application/vnd.wordperfect"/>
+ <magic priority="50">
+ <match value="0xCB0A01" type="string" offset="0">
+ <match value="0xCB" type="string" offset="5"/>
+ </match>
+ </magic>
+ </mime-type>
<mime-type type="application/vnd.wordperfect;version=5.0">
<sub-class-of type="application/vnd.wordperfect"/>
<magic priority="50">
@@ -2863,13 +2931,6 @@
<!-- <glob pattern="*.vst"/> --> <!-- conflicting with application/vnd.visio-->
</mime-type>
- <mime-type type="application/x-123">
- <magic priority="50">
- <match value="0x00001a00" type="big32" offset="0" />
- <match value="0x00000200" type="big32" offset="0" />
- </magic>
- </mime-type>
-
<mime-type type="application/x-abiword">
<glob pattern="*.abw"/>
</mime-type>
@@ -3971,12 +4032,65 @@
</_comment>
<!-- qp2 and wb3 are currently detected by POIFSContainerDetector
TODO: add detection for wb2 and wb1 -->
+ <glob pattern="*.wq1"/>
+ <glob pattern="*.wq2"/>
+ <glob pattern="*.wkq"/>
<glob pattern="*.qpw"/>
<glob pattern="*.wb1"/>
<glob pattern="*.wb2"/>
<glob pattern="*.wb3"/>
</mime-type>
+ <mime-type type="application/x-quattro-pro;version=1-4">
+ <sub-class-of type="application/x-quattro-pro"/>
+ <_comment>Quattro Pro for DOS, version 1-4</_comment>
+ <magic priority="50">
+ <match value="0x000002002051" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wq1"/>
+ <glob pattern="*.wkq"/>
+ </mime-type>
+
+ <mime-type type="application/x-quattro-pro;version=5">
+ <sub-class-of type="application/x-quattro-pro"/>
+ <_comment>Quattro Pro for DOS, version 5</_comment>
+ <magic priority="50">
+ <match value="0x000002002151" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wq2"/>
+ <glob pattern="*.wkq"/>
+ </mime-type>
+
+ <!-- First Quattro Pro for Windows had major version number 1, which
+ was followed by 5. This is confusing, perhaps adding "win" qualifier to version
+ field could clear this up (but it's quite ugly as well) -->
+
+ <mime-type type="application/x-quattro-pro;version=1+5">
+ <sub-class-of type="application/x-quattro-pro"/>
+ <_comment>Quattro Pro for Windows, version 1, 5</_comment>
+ <magic priority="50">
+ <match value="0x000002000110" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wb1"/>
+ </mime-type>
+
+ <mime-type type="application/x-quattro-pro;version=6">
+ <sub-class-of type="application/x-quattro-pro"/>
+ <_comment>Quattro Pro for Windows, version 6</_comment>
+ <magic priority="50">
+ <match value="0x000002000210" type="string" offset="0">
+ </match>
+ </magic>
+ <glob pattern="*.wb2"/>
+ </mime-type>
+
+ <!-- Quattro Pro for Windows 7-8 (wb3) and 9 (qpw) files are
+ currently detected by POIFSContainerDetector
+ -->
+
<mime-type type="application/xquery">
<_comment>XQuery source code</_comment>
<glob pattern="*.xq"/>
diff --git a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
index b94b095..a511538 100644
--- a/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
+++ b/tika-core/src/test/java/org/apache/tika/mime/MimeTypesReaderTest.java
@@ -25,8 +25,10 @@
import java.io.ByteArrayInputStream;
import java.lang.reflect.Field;
import java.util.ArrayList;
+import java.util.ConcurrentModificationException;
import java.util.List;
import java.util.Set;
+import java.util.concurrent.Executors;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.metadata.Metadata;
@@ -49,6 +51,8 @@
*/
public class MimeTypesReaderTest {
+ static boolean stop = false;
+
private MimeTypes mimeTypes;
private List<Magic> magics;
@@ -280,4 +284,22 @@
assertEquals(name, mimeType.toString());
assertEquals(".ditamap", mimeType.getExtension());
}
+
+ @Test
+ public void testMultiThreaded() throws Exception {
+ MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes();
+ Executors.newSingleThreadExecutor().execute(()-> {
+ try {
+ for (int i = 0; i < 500 && !stop; i++) {
+ mimeTypes.forName("abc"+i+"/abc");
+ }
+ } catch (MimeTypeException e ) {
+ e.printStackTrace();
+ }}
+ );
+
+ for (int i = 0; i < 500 && !stop; i++) {
+ mimeTypes.getMediaTypeRegistry().getAliases(MediaType.APPLICATION_ZIP);
+ }
+ }
}
diff --git a/tika-parsers/pom.xml b/tika-parsers/pom.xml
index e67609e..ff289d8 100644
--- a/tika-parsers/pom.xml
+++ b/tika-parsers/pom.xml
@@ -44,8 +44,8 @@
<brotli.version>0.1.2</brotli.version>
<mime4j.version>0.8.1</mime4j.version>
<vorbis.version>0.8</vorbis.version>
- <pdfbox.version>2.0.9</pdfbox.version>
- <jempbox.version>1.8.13</jempbox.version>
+ <pdfbox.version>2.0.11</pdfbox.version>
+ <jempbox.version>1.8.15</jempbox.version>
<netcdf-java.version>4.5.5</netcdf-java.version>
<sis.version>0.8</sis.version>
<parso.version>2.0.9</parso.version>
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
index 84141b9..c4c5188 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java
@@ -83,7 +83,7 @@
private static final Pattern HTTP_META_PATTERN = Pattern.compile(
- "(?is)<\\s*meta\\s+([^<>]+)"
+ "(?is)<\\s*meta(?:/|\\s+)([^<>]+)"
);
//this should match both the older:
@@ -97,7 +97,7 @@
//For a more general "not" matcher, try:
//("(?is)charset\\s*=\\s*['\\\"]?\\s*([^<>\\s'\\\";]+)")
private static final Pattern FLEXIBLE_CHARSET_ATTR_PATTERN = Pattern.compile(
- ("(?is)charset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
+ ("(?is)\\bcharset\\s*=\\s*(?:['\\\"]\\s*)?([-_:\\.a-z0-9]+)")
);
private static final Charset ASCII = Charset.forName("US-ASCII");
@@ -154,6 +154,10 @@
if (CHARSETS_UNSUPPORTED_BY_IANA.contains(candCharset.toLowerCase(Locale.US))) {
continue;
}
+ if ("x-user-defined".equalsIgnoreCase(candCharset)) {
+ candCharset = "windows-1252";
+ }
+
if (CharsetUtils.isSupported(candCharset)) {
try {
return CharsetUtils.forName(candCharset);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
index a1ef0da..adf591a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/HtmlParser.java
@@ -29,6 +29,8 @@
import org.apache.tika.detect.AutoDetectReader;
import org.apache.tika.detect.EncodingDetector;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractEncodingDetectorParser;
@@ -90,6 +94,27 @@
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
+ TemporaryResources tmp = null;
+ try {
+ if (!TikaInputStream.isTikaInputStream(stream)) {
+ tmp = new TemporaryResources();
+ stream = TikaInputStream.get(stream, tmp);
+ }
+ //AutoDetectReader can throw exceptions during
+ //initialization. If we just created a
+ //TemporaryResources, we need to make sure to close it.
+ parseImpl(stream, handler, metadata, context);
+ } finally {
+ if (tmp != null) {
+ tmp.close();
+ }
+ }
+
+ }
+
+
+ private void parseImpl(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
// Automatically detect the character encoding
try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
metadata, getEncodingDetector(context))) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
new file mode 100644
index 0000000..487f747
--- /dev/null
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/html/StrictHtmlEncodingDetector.java
@@ -0,0 +1,491 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.html;
+
+import org.apache.tika.detect.EncodingDetector;
+import org.apache.tika.metadata.Metadata;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static java.nio.charset.StandardCharsets.*;
+import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SequenceMatcher.caseInsensitive;
+import static org.apache.tika.parser.html.StrictHtmlEncodingDetector.SingleByteMatcher.matchers;
+
+/**
+ * This is a strict html encoding detector that enforces the standard
+ * far more strictly than the HtmlEncodingDetector.
+ */
+public class StrictHtmlEncodingDetector implements EncodingDetector {
+ private static final String CHARSET_LABEL_FILE = "whatwg-encoding-labels.tsv";
+ private static Map<String, Charset> CHARSET_LABELS = getCharsetLabels();
+
+ private static Map<String, Charset> getCharsetLabels() {
+ String path = StrictHtmlEncodingDetector.class.getPackage().getName().replace('.', '/');
+ String filename = '/' + path + '/' + CHARSET_LABEL_FILE;
+ InputStream inputStream = StrictHtmlEncodingDetector.class.getResourceAsStream(filename);
+ Objects.requireNonNull(inputStream, "Missing charset label mapping file : " + filename);
+ try (BufferedReader buffer = new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.US_ASCII))) {
+ return buffer.lines()
+ .filter(s -> !s.startsWith("#"))
+ .map(s -> s.split("\t"))
+ .filter(parts -> parts.length >= 2)
+ .collect(Collectors.toMap(
+ parts -> parts[0],
+ StrictHtmlEncodingDetector::charsetFromStandard
+ ));
+ } catch (IOException e) {
+ throw new UncheckedIOException("Unable to read the charset label mapping", e);
+ }
+ }
+
+ private static Charset charsetFromStandard(String[] names) {
+ for (int i = 1; i < names.length; i++) {
+ try {
+ return Charset.forName(names[i]);
+ } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {/* pass */}
+ }
+ // The only single-byte charset extended charset that must be present on every Java platform
+ return StandardCharsets.ISO_8859_1;
+ }
+
+ private static Charset getCharsetByLabel(String label) {
+ if (label == null) return null;
+ label = label.trim().toLowerCase(Locale.US);
+ return CHARSET_LABELS.get(label);
+ }
+
+ @Override
+ public Charset detect(InputStream input, Metadata metadata) throws IOException {
+ PreScanner preScanner = new PreScanner(input);
+
+ // If there is a BOM at the beginning, the detection does not go further
+ Charset bomCharset = preScanner.detectBOM();
+ if (bomCharset != null) return bomCharset;
+
+ // Assume that if there was a charset specified either by the end user or the transport level,
+ // it was stored in the metadata
+ String incomingCharsetName = metadata.get(Metadata.CONTENT_ENCODING);
+ if (incomingCharsetName != null) {
+ Charset incomingCharset = getCharsetByLabel(incomingCharsetName);
+ if (incomingCharset != null) return incomingCharset;
+ }
+
+ return preScanner.scan();
+ }
+
+ static class PreScanner {
+
+ private static final Pattern META_CHARSET_PATTERN = Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1");
+ private static ByteMatcher COMMENT_START = new SequenceMatcher("<!--");
+ private static ByteMatcher COMMENT_END = new SequenceMatcher("-->");
+ private static ByteMatcher LETTER = new OrMatcher(
+ new RangeMatcher((byte) 'a', (byte) 'z'),
+ new RangeMatcher((byte) 'A', (byte) 'Z')
+ );
+ private static ByteMatcher SPACE = new OrMatcher(matchers(0x09, 0x0A, 0x0C, 0x0D, 0x20));
+ private static ByteMatcher SLASH = new SingleByteMatcher((byte) '/');
+ private static ByteMatcher EQUAL = new SingleByteMatcher((byte) '=');
+ private static ByteMatcher TAG_END = new SingleByteMatcher((byte) '>');
+ private static ByteMatcher SINGLE_QUOTE = new SingleByteMatcher((byte) '\'');
+ private static ByteMatcher DOUBLE_QUOTE = new SingleByteMatcher((byte) '"');
+ private static ByteMatcher QUOTE = new OrMatcher(SINGLE_QUOTE, DOUBLE_QUOTE);
+ private static ByteMatcher TAG_END_OR_SLASH = new OrMatcher(SLASH, TAG_END);
+ private static ByteMatcher SPACE_OR_SLASH = new OrMatcher(SPACE, SLASH);
+ private static ByteMatcher SPACE_OR_TAG_END = new OrMatcher(SPACE, TAG_END);
+ private static ByteMatcher META_START = new SequenceMatcher(caseInsensitive("<meta"), SPACE_OR_SLASH);
+ private static ByteMatcher TAG_START = new SequenceMatcher(
+ new SingleByteMatcher((byte) '<'),
+ new OrMatcher(SLASH, LETTER)
+ );
+ private static ByteMatcher TAG_BODY = new NegativeMatcher(new OrMatcher(SPACE, TAG_END));
+ private static ByteMatcher SPECIAL_TAG_START = new SequenceMatcher(
+ new SingleByteMatcher((byte) '<'),
+ new OrMatcher(matchers("!/?"))
+ );
+ private static ByteMatcher UTF8_BOM = new SequenceMatcher(matchers(0xEF, 0xBB, 0xBF));
+ private static ByteMatcher UTF16_BE_BOM = new SequenceMatcher(matchers(0xFE, 0xFF));
+ private static ByteMatcher UTF16_LE_BOM = new SequenceMatcher(matchers(0xFF, 0xFE));
+
+
+ PushbackInputStream stream;
+ private CharsetDetectionResult detectedCharset = new CharsetDetectionResult();
+
+ public PreScanner(InputStream inputStream) {
+ this.stream = new PushbackInputStream(inputStream, 32);
+ }
+
+ public Charset scan() {
+ while (processAtLeastOneByte()) {
+ if (detectedCharset.isFound()) {
+ return detectedCharset.getCharset();
+ }
+ }
+ return null;
+ }
+
+ private Charset detectBOM() {
+ try {
+ if (UTF8_BOM.matches(stream)) return StandardCharsets.UTF_8;
+ else if (UTF16_BE_BOM.matches(stream)) return StandardCharsets.UTF_16BE;
+ else if (UTF16_LE_BOM.matches(stream)) return StandardCharsets.UTF_16LE;
+ } catch (IOException e) { /* stream could not be read, also return null */ }
+ return null;
+ }
+
+ private boolean processAtLeastOneByte() {
+ try {
+ return processComment() ||
+ processMeta() ||
+ processTag() ||
+ processSpecialTag() ||
+ processAny();
+ } catch (IOException e) {
+ return false;
+ }
+ }
+
+ private boolean processAny() throws IOException {
+ int read = stream.read();
+ return read != -1;
+ }
+
+ private boolean hasBytes() throws IOException {
+ int read = stream.read();
+ if (read != -1) stream.unread(read);
+ return read != -1;
+ }
+
+ private boolean processComment() throws IOException {
+ if (COMMENT_START.matches(stream)) {
+ // The two '-' in the '-->' sequence can be the same as those in the '<!--' sequence.
+ stream.unread("--".getBytes(StandardCharsets.US_ASCII));
+ return COMMENT_END.advanceUntilMatches(stream);
+ }
+ return false;
+ }
+
+ private boolean processTag() throws IOException {
+ if (TAG_START.matches(stream)) {
+ TAG_BODY.skipAll(stream);
+ while (getAttribute() != null) {/*ignore the attribute*/}
+ return true;
+ }
+ return false;
+ }
+
+ private boolean processSpecialTag() throws IOException {
+ if (SPECIAL_TAG_START.matches(stream)) {
+ TAG_BODY.skipAll(stream);
+ return TAG_END.advanceUntilMatches(stream);
+ }
+ return false;
+ }
+
+ private boolean processMeta() throws IOException {
+ if (META_START.matches(stream)) {
+ Set<String> attributeNames = new HashSet<>();
+ boolean gotPragma = false;
+ Boolean needPragma = null;
+ CharsetDetectionResult charset = new CharsetDetectionResult();
+ while (hasBytes()) {
+ Attribute attribute = getAttribute();
+ if (attribute == null) break;
+ if (attributeNames.contains(attribute.getName())) continue;
+ attributeNames.add(attribute.getName());
+ switch (attribute.getName()) {
+ case "http-equiv":
+ if (attribute.getValue().equals("content-type"))
+ gotPragma = true;
+ break;
+ case "content":
+ String charsetName = getEncodingFromMeta(attribute.getValue());
+ if (!charset.isFound() && charsetName != null) {
+ charset.find(charsetName);
+ needPragma = true;
+ }
+ break;
+ case "charset":
+ charset.find(attribute.getValue());
+ needPragma = false;
+ break;
+ default: // Ignore non-charset related attributes
+ }
+ }
+ if (needPragma != null && !(needPragma && !gotPragma)) {
+ detectedCharset = charset;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ private String getEncodingFromMeta(String attributeValue) {
+ Matcher matcher = META_CHARSET_PATTERN.matcher(attributeValue);
+ if (!matcher.find()) return null;
+ return matcher.group(2);
+ }
+
+ private Attribute getAttribute() throws IOException {
+ SPACE_OR_SLASH.skipAll(stream);
+ if (TAG_END.peekMatches(stream)) return null;
+ StringBuilder name = new StringBuilder();
+ while (!EQUAL.peekMatches(stream) || name.length() == 0) {
+ if (TAG_END_OR_SLASH.peekMatches(stream)) {
+ break;
+ } else if (SPACE.peekMatches(stream)) {
+ SPACE.skipAll(stream);
+ break;
+ } else {
+ name.append(getLowerCaseChar());
+ }
+ }
+
+ if (!EQUAL.matches(stream)) return new Attribute(name.toString(), "");
+ SPACE.skipAll(stream);
+
+ StringBuilder value = new StringBuilder();
+ byte[] quoteMatched = QUOTE.match(stream);
+ if (quoteMatched != null) {
+ char quote = (char) quoteMatched[0];
+ int nextChar = -1;
+ while (nextChar != quote) {
+ if (nextChar != -1) value.append((char) nextChar);
+ nextChar = getLowerCaseChar();
+ }
+ } else {
+ while (!SPACE_OR_TAG_END.peekMatches(stream)) {
+ value.append(getLowerCaseChar());
+ }
+ }
+ return new Attribute(name.toString(), value.toString());
+ }
+
+ private char getLowerCaseChar() throws IOException {
+ int nextPoint = stream.read();
+ if (nextPoint == -1) throw new IOException();
+ if (nextPoint >= 'A' && nextPoint <= 'Z') nextPoint += 0x20; // lowercase
+ return (char) nextPoint;
+ }
+ }
+
+ static class Attribute {
+ String name;
+ String value;
+
+ public Attribute(String name, String value) {
+ this.name = name;
+ this.value = value;
+ }
+
+ public String getName() {
+ return name;
+ }
+
+ public String getValue() {
+ return value;
+ }
+ }
+
+ /**
+ * A detection may either not find a charset, find an invalid charset, or find a valid charset
+ */
+ static class CharsetDetectionResult {
+ private boolean found = false;
+ private Charset charset = null;
+
+ public CharsetDetectionResult() { /* default result: not found */}
+
+ public boolean isFound() {
+ return found;
+ }
+
+ public void find(String charsetName) {
+ this.found = true;
+ charsetName = charsetName.trim();
+ if ("x-user-defined".equals(charsetName)) charsetName = "windows-1252";
+ this.charset = getCharsetByLabel(charsetName);
+ // The specification states: If charset is a UTF-16 encoding, then set charset to UTF-8.
+ if (UTF_16LE.equals(charset) || UTF_16BE.equals(charset)) charset = UTF_8;
+ }
+
+ public Charset getCharset() {
+ // the result may be null even if found is true, in the case there is a charset specified,
+ // but it is invalid
+ return charset;
+ }
+ }
+
+ static abstract class ByteMatcher {
+
+ abstract byte[] match(PushbackInputStream pushbackInputStream) throws IOException;
+
+ boolean matches(PushbackInputStream pushbackInputStream) throws IOException {
+ return this.match(pushbackInputStream) != null;
+ }
+
+ boolean advanceUntilMatches(PushbackInputStream pushbackInputStream) throws IOException {
+ while (!this.matches(pushbackInputStream)) {
+ int nextByte = pushbackInputStream.read();
+ if (nextByte == -1) return false;
+ }
+ return true;
+ }
+
+ void skipAll(PushbackInputStream pushbackInputStream) throws IOException {
+ while (matches(pushbackInputStream)) {/* just skip the byte */}
+ }
+
+ public boolean peekMatches(PushbackInputStream pushbackInputStream) throws IOException {
+ byte[] matched = this.match(pushbackInputStream);
+ if (matched != null) pushbackInputStream.unread(matched);
+ return matched != null;
+ }
+ }
+
+ static class SingleByteMatcher extends ByteMatcher {
+ private byte b;
+
+ public SingleByteMatcher(byte b) {
+ this.b = b;
+ }
+
+ public static ByteMatcher[] matchers(String s) {
+ return matchers(s.chars());
+ }
+
+ public static ByteMatcher[] matchers(int... bytes) {
+ return matchers(IntStream.of(bytes));
+ }
+
+ public static ByteMatcher[] matchers(IntStream byteStream) {
+ return byteStream
+ .mapToObj(i -> new SingleByteMatcher((byte) i))
+ .toArray(ByteMatcher[]::new);
+ }
+
+ @Override
+ byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+ int read = pushbackInputStream.read();
+ if ((byte) read == b) return new byte[]{b};
+ if (read != -1) pushbackInputStream.unread(read);
+ return null;
+ }
+ }
+
+ static class SequenceMatcher extends ByteMatcher {
+ private ByteMatcher[] matchers;
+
+ public SequenceMatcher(ByteMatcher... matchers) {
+ this.matchers = matchers;
+ }
+
+ public SequenceMatcher(String s) {
+ this(matchers(s));
+ }
+
+ public static SequenceMatcher caseInsensitive(String s) {
+ ByteMatcher[] lowerMatchers = matchers(s.toLowerCase(Locale.US));
+ ByteMatcher[] upperMatchers = matchers(s.toUpperCase(Locale.US));
+ OrMatcher[] matchers = IntStream
+ .range(0, Math.min(lowerMatchers.length, upperMatchers.length))
+ .mapToObj(i -> new OrMatcher(lowerMatchers[i], upperMatchers[i]))
+ .toArray(OrMatcher[]::new);
+ return new SequenceMatcher(matchers);
+ }
+
+ @Override
+ byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+ ByteArrayOutputStream allMatched = new ByteArrayOutputStream();
+ for (ByteMatcher m : matchers) {
+ byte[] matched = m.match(pushbackInputStream);
+ if (matched == null) {
+ pushbackInputStream.unread(allMatched.toByteArray());
+ return null;
+ } else {
+ allMatched.write(matched);
+ }
+ }
+ return allMatched.toByteArray();
+ }
+ }
+
+ static class OrMatcher extends ByteMatcher {
+ private ByteMatcher[] matchers;
+
+ public OrMatcher(ByteMatcher... matchers) {
+ this.matchers = matchers;
+ }
+
+ @Override
+ byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+ for (ByteMatcher m : matchers) {
+ byte[] matched = m.match(pushbackInputStream);
+ if (matched != null) return matched;
+ }
+ return null;
+ }
+ }
+
+ static class NegativeMatcher extends ByteMatcher {
+ private ByteMatcher matcher;
+
+ public NegativeMatcher(ByteMatcher matcher) {
+ this.matcher = matcher;
+ }
+
+ @Override
+ byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+ byte[] matched = matcher.match(pushbackInputStream);
+ if (matched == null) {
+ int read = pushbackInputStream.read();
+ if (read == -1) return null;
+ return new byte[]{(byte) read};
+ } else {
+ pushbackInputStream.unread(matched);
+ return null;
+ }
+ }
+ }
+
+ static class RangeMatcher extends ByteMatcher {
+ private byte low;
+ private byte high;
+
+ public RangeMatcher(byte low, byte high) {
+ this.low = low;
+ this.high = high;
+ }
+
+
+ @Override
+ byte[] match(PushbackInputStream pushbackInputStream) throws IOException {
+ int read = pushbackInputStream.read();
+ if (read >= low && read <= high) return new byte[]{(byte) read};
+ if (read != -1) pushbackInputStream.unread(read);
+ return null;
+ }
+ }
+}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index c8aa65e..86ac3cf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -174,10 +174,13 @@
private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler) throws IOException, TikaException, SAXException {
ZipEntry entry = zipStream.getNextEntry();
- while (entry != null) {
+ if (entry == null) {
+ throw new IOException("No entries found in ZipInputStream");
+ }
+ do {
handleZipEntry(entry, zipStream, metadata, context, handler);
entry = zipStream.getNextEntry();
- }
+ } while (entry != null);
}
private void handleZipFile(ZipFile zipFile, Metadata metadata,
diff --git a/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv b/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
new file mode 100644
index 0000000..92ddecb
--- /dev/null
+++ b/tika-parsers/src/main/resources/org/apache/tika/parser/html/whatwg-encoding-labels.tsv
@@ -0,0 +1,235 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+unicode-1-1-utf-8	UTF-8
+#
+# label encoding fallback
+utf-8 UTF-8
+utf8 UTF-8
+866 IBM866
+cp866 IBM866
+csibm866 IBM866
+ibm866 IBM866
+csisolatin2 ISO-8859-2
+iso-8859-2 ISO-8859-2
+iso-ir-101 ISO-8859-2
+iso8859-2 ISO-8859-2
+iso88592 ISO-8859-2
+iso_8859-2 ISO-8859-2
+iso_8859-2:1987 ISO-8859-2
+l2 ISO-8859-2
+latin2 ISO-8859-2
+csisolatin3 ISO-8859-3
+iso-8859-3 ISO-8859-3
+iso-ir-109 ISO-8859-3
+iso8859-3 ISO-8859-3
+iso88593 ISO-8859-3
+iso_8859-3 ISO-8859-3
+iso_8859-3:1988 ISO-8859-3
+l3 ISO-8859-3
+latin3 ISO-8859-3
+csisolatin4 ISO-8859-4
+iso-8859-4 ISO-8859-4
+iso-ir-110 ISO-8859-4
+iso8859-4 ISO-8859-4
+iso88594 ISO-8859-4
+iso_8859-4 ISO-8859-4
+iso_8859-4:1988 ISO-8859-4
+l4 ISO-8859-4
+latin4 ISO-8859-4
+csisolatincyrillic ISO-8859-5
+cyrillic ISO-8859-5
+iso-8859-5 ISO-8859-5
+iso-ir-144 ISO-8859-5
+iso8859-5 ISO-8859-5
+iso88595 ISO-8859-5
+iso_8859-5 ISO-8859-5
+iso_8859-5:1988 ISO-8859-5
+arabic ISO-8859-6
+asmo-708 ISO-8859-6
+csiso88596e ISO-8859-6
+csiso88596i ISO-8859-6
+csisolatinarabic ISO-8859-6
+ecma-114 ISO-8859-6
+iso-8859-6 ISO-8859-6
+iso-8859-6-e ISO-8859-6
+iso-8859-6-i ISO-8859-6
+iso-ir-127 ISO-8859-6
+iso8859-6 ISO-8859-6
+iso88596 ISO-8859-6
+iso_8859-6 ISO-8859-6
+iso_8859-6:1987 ISO-8859-6
+csisolatingreek ISO-8859-7
+ecma-118 ISO-8859-7
+elot_928 ISO-8859-7
+greek ISO-8859-7
+greek8 ISO-8859-7
+iso-8859-7 ISO-8859-7
+iso-ir-126 ISO-8859-7
+iso8859-7 ISO-8859-7
+iso88597 ISO-8859-7
+iso_8859-7 ISO-8859-7
+iso_8859-7:1987 ISO-8859-7
+sun_eu_greek ISO-8859-7
+csiso88598e ISO-8859-8
+csisolatinhebrew ISO-8859-8
+hebrew ISO-8859-8
+iso-8859-8 ISO-8859-8
+iso-8859-8-e ISO-8859-8
+iso-ir-138 ISO-8859-8
+iso8859-8 ISO-8859-8
+iso88598 ISO-8859-8
+iso_8859-8 ISO-8859-8
+iso_8859-8:1988 ISO-8859-8
+visual ISO-8859-8
+csiso88598i ISO-8859-8-I ISO-8859-8
+iso-8859-8-i ISO-8859-8-I ISO-8859-8
+logical ISO-8859-8-I ISO-8859-8
+csisolatin6 ISO-8859-10 ISO-8859-4
+iso-8859-10 ISO-8859-10 ISO-8859-4
+iso-ir-157 ISO-8859-10 ISO-8859-4
+iso8859-10 ISO-8859-10 ISO-8859-4
+iso885910 ISO-8859-10 ISO-8859-4
+l6 ISO-8859-10 ISO-8859-4
+latin6 ISO-8859-10 ISO-8859-4
+iso-8859-13 ISO-8859-13
+iso8859-13 ISO-8859-13
+iso885913 ISO-8859-13
+iso-8859-14 ISO-8859-14 ISO-8859-1
+iso8859-14 ISO-8859-14 ISO-8859-1
+iso885914 ISO-8859-14 ISO-8859-1
+csisolatin9 ISO-8859-15
+iso-8859-15 ISO-8859-15
+iso8859-15 ISO-8859-15
+iso885915 ISO-8859-15
+iso_8859-15 ISO-8859-15
+l9 ISO-8859-15
+iso-8859-16 ISO-8859-16 ISO-8859-1
+cskoi8r KOI8-R
+koi KOI8-R
+koi8 KOI8-R
+koi8-r KOI8-R
+koi8_r KOI8-R
+koi8-ru KOI8-U
+koi8-u KOI8-U
+csmacintosh x-MacRoman
+mac x-MacRoman
+macintosh x-MacRoman
+x-mac-roman x-MacRoman
+dos-874 windows-874
+iso-8859-11 windows-874
+iso8859-11 windows-874
+iso885911 windows-874
+tis-620 windows-874
+windows-874 windows-874
+cp1250 windows-1250
+windows-1250 windows-1250
+x-cp1250 windows-1250
+cp1251 windows-1251
+windows-1251 windows-1251
+x-cp1251 windows-1251
+ansi_x3.4-1968 windows-1252
+ascii windows-1252
+cp1252 windows-1252
+cp819 windows-1252
+csisolatin1 windows-1252
+ibm819 windows-1252
+iso-8859-1 windows-1252
+iso-ir-100 windows-1252
+iso8859-1 windows-1252
+iso88591 windows-1252
+iso_8859-1 windows-1252
+iso_8859-1:1987 windows-1252
+l1 windows-1252
+latin1 windows-1252
+us-ascii windows-1252
+windows-1252 windows-1252
+x-cp1252 windows-1252
+cp1253 windows-1253
+windows-1253 windows-1253
+x-cp1253 windows-1253
+cp1254 windows-1254
+csisolatin5 windows-1254
+iso-8859-9 windows-1254
+iso-ir-148 windows-1254
+iso8859-9 windows-1254
+iso88599 windows-1254
+iso_8859-9 windows-1254
+iso_8859-9:1989 windows-1254
+l5 windows-1254
+latin5 windows-1254
+windows-1254 windows-1254
+x-cp1254 windows-1254
+cp1255 windows-1255
+windows-1255 windows-1255
+x-cp1255 windows-1255
+cp1256 windows-1256
+windows-1256 windows-1256
+x-cp1256 windows-1256
+cp1257 windows-1257
+windows-1257 windows-1257
+x-cp1257 windows-1257
+cp1258 windows-1258
+windows-1258 windows-1258
+x-cp1258 windows-1258
+x-mac-cyrillic x-MacCyrillic
+x-mac-ukrainian x-MacCyrillic
+chinese GBK
+csgb2312 GBK
+csiso58gb231280 GBK
+gb2312 GBK
+gb_2312 GBK
+gb_2312-80 GBK
+gbk GBK
+iso-ir-58 GBK
+x-gbk GBK
+gb18030 gb18030
+big5 Big5
+big5-hkscs Big5
+cn-big5 Big5
+csbig5 Big5
+x-x-big5 Big5
+cseucpkdfmtjapanese EUC-JP
+euc-jp EUC-JP
+x-euc-jp EUC-JP
+csiso2022jp ISO-2022-JP
+iso-2022-jp ISO-2022-JP
+csshiftjis Shift_JIS
+ms932 Shift_JIS
+ms_kanji Shift_JIS
+shift-jis Shift_JIS
+shift_jis Shift_JIS
+sjis Shift_JIS
+windows-31j Shift_JIS
+x-sjis Shift_JIS
+cseuckr EUC-KR
+csksc56011987 EUC-KR
+euc-kr EUC-KR
+iso-ir-149 EUC-KR
+korean EUC-KR
+ks_c_5601-1987 EUC-KR
+ks_c_5601-1989 EUC-KR
+ksc5601 EUC-KR
+ksc_5601 EUC-KR
+windows-949 EUC-KR
+csiso2022kr replacement
+hz-gb-2312 replacement
+iso-2022-cn replacement
+iso-2022-cn-ext replacement
+iso-2022-kr replacement
+replacement replacement
+utf-16be UTF-16BE
+utf-16 UTF-16LE
+utf-16le UTF-16LE
+x-user-defined x-user-defined
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
new file mode 100644
index 0000000..931f5e1
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/HtmlEncodingDetectorTest.java
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.*;
+
+public class HtmlEncodingDetectorTest {
+
+ @Test
+ public void basic() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+    @Ignore("until we can prove this harms detection")
+ public void utf16() throws IOException {
+ // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.'
+ assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
+ }
+
+ @Test
+ public void xUserDefined() throws IOException {
+ // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.'
+ assertWindows1252("<meta charset='x-user-defined'>");
+ }
+
+ @Test
+ public void withSlash() throws IOException {
+ assertWindows1252("<meta/charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ @Ignore("until we do a full parse")
+ public void insideTag() throws IOException {
+ assertWindows1252("<meta name='description'" +
+ "content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ @Ignore("until we do a full parse")
+ public void missingAttribute() throws IOException {
+ assertWindows1252(
+ "<meta content='charset=UTF-8'>" + // missing http-equiv attribute
+ "<meta charset='WINDOWS-1252'>" // valid declaration
+ );
+ }
+
+ @Test
+ @Ignore("until we do a full parse")
+ public void insideSpecialTag() throws IOException {
+ // Content inside <?, <!, and </ should be ignored
+ for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII))
+ assertWindows1252(
+ "<" + (char) b + // start comment
+ "<meta charset='UTF-8'>" + // inside special tag
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ @Ignore("until we can prove this harms detection")
+ public void spaceBeforeTag() throws IOException {
+ assertWindows1252(
+ "< meta charset='UTF-8'>" + // invalid charset declaration
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void invalidAttribute() throws IOException {
+ assertWindows1252(
+ "<meta " +
+ "badcharset='UTF-8' " + // invalid charset declaration
+ "charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ @Ignore("until we can prove this harms detection")
+ public void unmatchedQuote() throws IOException {
+ assertWindows1252(
+ "<meta http-equiv='content-type' content='charset=\"UTF-8'>" + // invalid charset declaration
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+
+ @Test
+ @Ignore("until we do a full parse")
+ public void withCompactComment() throws IOException {
+ // <!--> is a valid comment
+ assertWindows1252(
+ "<!--" + // start comment
+ "<meta charset='UTF-8'>" + // inside comment
+ "-->" + // end comment
+ "<!-->" + // compact comment
+ "<meta charset='WINDOWS-1252'>" // outside comment, charset declaration
+ );
+ }
+
+ private void assertWindows1252(String html) throws IOException {
+ assertCharset(html, Charset.forName("WINDOWS-1252"));
+ }
+
+ private void assertCharset(String html, Charset charset) throws IOException {
+ assertEquals(html + " should be detected as " + charset,
+ charset, detectCharset(html));
+ }
+
+ private Charset detectCharset(String test) throws IOException {
+ Metadata metadata = new Metadata();
+ InputStream inStream = new ByteArrayInputStream(test.getBytes(StandardCharsets.UTF_8));
+ return new HtmlEncodingDetector().detect(inStream, metadata);
+ }
+}
\ No newline at end of file
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
new file mode 100644
index 0000000..1c0da8d
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/html/StrictHtmlEncodingDetectorTest.java
@@ -0,0 +1,300 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.html;
+
+
+import org.apache.tika.metadata.Metadata;
+import org.junit.Before;
+import org.junit.Ignore;
+import org.junit.Test;
+
+import java.io.*;
+import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
+
+import static org.junit.Assert.assertEquals;
+
+public class StrictHtmlEncodingDetectorTest {
+ private Metadata metadata = new Metadata();
+
+ @Before
+ public void setUp() {
+ this.metadata = new Metadata();
+ }
+
+ @Test
+ public void basic() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void duplicateMeta() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'>" +
+ "<meta charset='UTF-8'>");
+ }
+
+ @Test
+ public void httpEquiv() throws IOException {
+ assertWindows1252("<meta " +
+ "http-equiv='content-type' " +
+ "content='text/html; charset=\"WINDOWS-1252\"'>"); // quotes around the charset are allowed
+ assertWindows1252("<meta " +
+ "content=' charset = WINDOWS-1252' " + // The charset may be anywhere in the content attribute
+ "http-equiv='content-type' >");
+ }
+
+ @Test
+ public void httpEquivDuplicateCharset() throws IOException {
+ assertWindows1252("<meta " +
+ "http-equiv='content-type' " +
+ "content='charset=WINDOWS-1252;" + // The detection should stop after the semicolon
+ "charset=UTF-8'>");
+ }
+
+ @Test
+ public void htmlFragment() throws IOException {
+ assertWindows1252("<!doctype html><html class=nojs><head><meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+    public void veryBadHtml() throws IOException {
+ // check that the parser is not confused by garbage before the declaration
+ assertWindows1252("<< l \" == / '=x\n >" +
+ "<!--> " +
+ "< <x'/ <=> " +
+ "<meta/>" +
+ "<a x/>" +
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void incompleteMeta() throws IOException {
+ assertWindows1252("<meta charset='WINDOWS-1252'"); // missing '>' at the end
+ }
+
+ @Test
+ public void charsetWithWhiteSpaces() throws IOException {
+ assertWindows1252("<meta charset=' \t\n WINDOWS-1252 \t\n'>");
+ }
+
+ @Test
+ public void mixedCase() throws IOException {
+ assertWindows1252("<mEtA chArsEt='WInDOWs-1252'>");
+ }
+
+ @Test
+ public void utf16() throws IOException {
+ // According to the specification 'If charset is a UTF-16 encoding, then set charset to UTF-8.'
+ assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
+ }
+
+ @Test
+ public void xUserDefined() throws IOException {
+ // According to the specification 'If charset is x-user-defined, then set charset to windows-1252.'
+ assertWindows1252("<meta charset='x-user-defined'>");
+ }
+
+ @Test
+ public void iso88591() throws IOException {
+ // In the spec, iso-8859-1 is an alias for WINDOWS-1252
+ assertWindows1252("<meta charset='iso-8859-1'>");
+ }
+
+ @Test
+ public void macintoshEncoding() throws IOException {
+ // The mac roman encoding exists in java, but under the name x-MacRoman
+ assertCharset("<meta charset='macintosh'>", Charset.forName("x-MacRoman"));
+ }
+
+ @Test
+ public void bom() throws IOException {
+ // A BOM should have precedence over the meta
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
+ }
+
+ @Test
+ public void withSlash() throws IOException {
+ assertWindows1252("<meta/charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void insideDescription() throws IOException {
+ assertWindows1252("<meta name='description'" +
+ "content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void insideTag() throws IOException {
+ assertWindows1252("<tag " +
+ "attribute=\"<meta charset='UTF-8'>\" " + // inside attribute
+ "<meta charset='UTF-8' " + // still inside tag
+ "/>" + // tag end
+ "<meta charset='WINDOWS-1252'>");
+ }
+
+ @Test
+ public void missingAttribute() throws IOException {
+ assertWindows1252(
+ "<meta content='charset=UTF-8'>" + // missing http-equiv attribute
+ "<meta charset='WINDOWS-1252'>" // valid declaration
+ );
+ }
+
+ @Test
+ public void insideSpecialTag() throws IOException {
+ // Content inside <?, <!, and </ should be ignored
+ for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII))
+ assertWindows1252(
+ "<" + (char) b + // start comment
+ "<meta charset='UTF-8'>" + // inside special tag
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void spaceBeforeTag() throws IOException {
+ assertWindows1252(
+ "< meta charset='UTF-8'>" + // invalid charset declaration
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void invalidAttribute() throws IOException {
+ assertWindows1252(
+ "<meta " +
+ "badcharset='UTF-8' " + // invalid charset declaration
+ "charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void unmatchedQuote() throws IOException {
+ assertWindows1252(
+ "<meta http-equiv='content-type' content='charset=\"UTF-8'>" + // invalid charset declaration
+ "<meta charset='WINDOWS-1252'>" // real charset declaration
+ );
+ }
+
+ @Test
+ public void realWorld() throws IOException {
+ assertWindows1252("<!DOCTYPE html>\n" +
+ "<html lang=\"fr\">\n" +
+ "<head>\n" +
+ "<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':\n" +
+ "\t\t\tnew Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],\n" +
+ "\t\t\tj=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=\n" +
+ "\t\t\t'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);\n" +
+ "\t\t\t})(window,document,'script','dataLayer','GTM-PNX8H8X');</script>\n" +
+ "<title>Horaires Transilien 2018 - Lignes A B C D E H J K L N P R U</title>\n" +
+ "<meta name=\"description\" content=\"Consultez les horaires du Transilien en temps réel. Lignes A et B du RER. Lignes C D E H J K L N P R U du Transilien.\">\n" +
+ "<meta name=\"keywords\" content=\"horaires transilien\">\n" +
+ "<meta charset=\"windows-1252\">\n" +
+ "<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
+ "<meta name=\"robots\" content=\"follow, index\">\n" +
+ "<base hr");
+ }
+
+ @Test
+ public void withCompactComment() throws IOException {
+ // <!--> is a valid comment
+ assertWindows1252(
+ "<!--" + // start comment
+ "<meta charset='UTF-8'>" + // inside comment
+ "-->" + // end comment
+ "<!-->" + // compact comment
+ "<meta charset='WINDOWS-1252'>" // outside comment, charset declaration
+ );
+ }
+
+ @Test
+ public void withUserProvidedCharset() throws IOException {
+ metadata.set(Metadata.CONTENT_ENCODING, "ISO-8859-1");
+ // ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
+ assertWindows1252("");
+ assertWindows1252("<meta charset='UTF-8'>");
+ assertWindows1252("<meta http-equiv='content-type' content='charset=utf-8'>");
+ // if a BOM is present, it has precedence over transport layer information
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
+ assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
+ }
+
+ @Test
+ public void throwResistance() throws IOException {
+ // The preprocessing should return right after having found the charset
+ // So if an error is thrown in the stream AFTER the declaration,
+ // it shouldn't see it
+ assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'>"));
+ assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'><some other tag"));
+
+ // But if an error is thrown before the end of the meta tag, it should see it
+ // and return unsuccessfully
+ assertCharset(throwAfter("<meta charset='WINDOWS-1252'"), null);
+
+ // If there is no meta, but an error is thrown, the detector simply returns
+ // unsuccessfully (it should not throw runtime errors)
+ assertCharset(throwAfter("<"), null);
+ assertCharset(throwAfter("<!"), null);
+ assertCharset(throwAfter("<!doctype"), null);
+ assertCharset(throwAfter("<!doctype html><html"), null);
+ assertCharset(throwAfter("<!doctype html><html attr"), null);
+ assertCharset(throwAfter("<!doctype html><html attr="), null);
+ assertCharset(throwAfter("<!doctype html><html attr=x"), null);
+ assertCharset(throwAfter("<!doctype html><html attr='x"), null);
+ }
+
+ private void assertWindows1252(String html) throws IOException {
+ assertCharset(html, Charset.forName("WINDOWS-1252"));
+ }
+
+ private void assertWindows1252(InputStream inStream) throws IOException {
+ assertCharset(inStream, Charset.forName("WINDOWS-1252"));
+ }
+
+ private void assertCharset(String html, Charset charset) throws IOException {
+ final Charset contentsCharset = (charset == null) ? StandardCharsets.UTF_8 : charset;
+ InputStream inStream = new ByteArrayInputStream(html.getBytes(contentsCharset));
+ final Charset detected = detectCharset(inStream);
+ assertEquals(html + " should be detected as " + charset, charset, detected);
+ }
+
+ private void assertCharset(InputStream inStream, Charset charset) throws IOException {
+ final Charset detected = detectCharset(inStream);
+ assertEquals(charset, detected);
+ }
+
+ private Charset detectCharset(InputStream inStream) throws IOException {
+ return new StrictHtmlEncodingDetector().detect(inStream, metadata);
+ }
+
+ private InputStream throwAfter(String html) {
+ byte[] contents = html.getBytes(StandardCharsets.UTF_8);
+ InputStream contentsInStream = new ByteArrayInputStream(contents);
+ InputStream errorThrowing = new InputStream() {
+ @Override
+ public int read() throws IOException {
+ throw new IOException("test exception");
+ }
+ };
+ return new SequenceInputStream(contentsInStream, errorThrowing);
+ }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 7b93271..3b8048c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -19,6 +19,7 @@
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
@@ -367,6 +368,28 @@
assertEquals(3, metadataList.size());
}
+ @Test(expected = IOException.class)
+ public void testInvalidFromStream() throws Exception {
+ try (InputStream is = this.getClass().getResource(
+ "/test-documents/testODTnotaZipFile.odt").openStream()) {
+ OpenDocumentParser parser = new OpenDocumentParser();
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(is, handler, metadata, new ParseContext());
+ }
+ }
+
+ @Test(expected = IOException.class)
+ public void testInvalidFromFile() throws Exception {
+ try (TikaInputStream tis = TikaInputStream.get(this.getClass().getResource(
+ "/test-documents/testODTnotaZipFile.odt"))) {
+ OpenDocumentParser parser = new OpenDocumentParser();
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ parser.parse(tis, handler, metadata, new ParseContext());
+ }
+ }
+
private ParseContext getNonRecursingParseContext() {
ParseContext parseContext = new ParseContext();
parseContext.set(Parser.class, new EmptyParser());
diff --git a/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.123 b/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.123
new file mode 100644
index 0000000..60c2ec5
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.123
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.wk4 b/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.wk4
new file mode 100644
index 0000000..3283716
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testLotus123-lotusftp.wk4
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testLotus123.wk1 b/tika-parsers/src/test/resources/test-documents/testLotus123.wk1
new file mode 100644
index 0000000..34a8a3e
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testLotus123.wk1
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testLotus123.wk3 b/tika-parsers/src/test/resources/test-documents/testLotus123.wk3
new file mode 100644
index 0000000..bda0c8c
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testLotus123.wk3
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testLotus123.wks b/tika-parsers/src/test/resources/test-documents/testLotus123.wks
new file mode 100644
index 0000000..2324b24
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testLotus123.wks
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testODTnotaZipFile.odt b/tika-parsers/src/test/resources/test-documents/testODTnotaZipFile.odt
new file mode 100644
index 0000000..9c1d376
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testODTnotaZipFile.odt
@@ -0,0 +1 @@
+This is not a zip file!
diff --git a/tika-parsers/src/test/resources/test-documents/testQuattro.wb1 b/tika-parsers/src/test/resources/test-documents/testQuattro.wb1
new file mode 100644
index 0000000..16db0bd
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testQuattro.wb1
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testQuattro.wb2 b/tika-parsers/src/test/resources/test-documents/testQuattro.wb2
new file mode 100644
index 0000000..9ed7aa4
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testQuattro.wb2
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testQuattro.wq1 b/tika-parsers/src/test/resources/test-documents/testQuattro.wq1
new file mode 100644
index 0000000..310d838
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testQuattro.wq1
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testQuattro.wq2 b/tika-parsers/src/test/resources/test-documents/testQuattro.wq2
new file mode 100644
index 0000000..4a73104
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testQuattro.wq2
Binary files differ
diff --git a/tika-parsers/src/test/resources/test-documents/testWordPerfect_42.doc b/tika-parsers/src/test/resources/test-documents/testWordPerfect_42.doc
new file mode 100644
index 0000000..e6cf1e8
--- /dev/null
+++ b/tika-parsers/src/test/resources/test-documents/testWordPerfect_42.doc
Binary files differ
diff --git a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
index be91b93..1b12590 100644
--- a/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
+++ b/tika-server/src/main/java/org/apache/tika/server/resource/TikaResource.java
@@ -155,17 +155,26 @@
public static void fillParseContext(ParseContext parseContext, MultivaluedMap<String, String> httpHeaders,
Parser embeddedParser) {
- TesseractOCRConfig ocrConfig = new TesseractOCRConfig();
- PDFParserConfig pdfParserConfig = new PDFParserConfig();
+ //lazily initialize configs
+ //if a header is submitted, any params set in --tika-config tika-config.xml
+ //upon server startup will be ignored.
+ TesseractOCRConfig ocrConfig = null;
+ PDFParserConfig pdfParserConfig = null;
for (String key : httpHeaders.keySet()) {
if (StringUtils.startsWith(key, X_TIKA_OCR_HEADER_PREFIX)) {
+ ocrConfig = (ocrConfig == null) ? new TesseractOCRConfig() : ocrConfig;
processHeaderConfig(httpHeaders, ocrConfig, key, X_TIKA_OCR_HEADER_PREFIX);
} else if (StringUtils.startsWith(key, X_TIKA_PDF_HEADER_PREFIX)) {
+ pdfParserConfig = (pdfParserConfig == null) ? new PDFParserConfig() : pdfParserConfig;
processHeaderConfig(httpHeaders, pdfParserConfig, key, X_TIKA_PDF_HEADER_PREFIX);
}
}
- parseContext.set(TesseractOCRConfig.class, ocrConfig);
- parseContext.set(PDFParserConfig.class, pdfParserConfig);
+ if (ocrConfig != null) {
+ parseContext.set(TesseractOCRConfig.class, ocrConfig);
+ }
+ if (pdfParserConfig != null) {
+ parseContext.set(PDFParserConfig.class, pdfParserConfig);
+ }
if (embeddedParser != null) {
parseContext.set(Parser.class, embeddedParser);
}
diff --git a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
index 7b35fec..f851e97 100644
--- a/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
+++ b/tika-server/src/test/java/org/apache/tika/server/CXFTestBase.java
@@ -80,8 +80,8 @@
}
@Before
- public void setUp() {
- this.tika = TikaConfig.getDefaultConfig();
+ public void setUp() throws Exception {
+ this.tika = new TikaConfig(getClass().getResourceAsStream("tika-config-for-server-tests.xml"));
TikaResource.init(tika,
new CommonsDigester(DIGESTER_READ_LIMIT, "md5,sha1:32"),
new DefaultInputStreamFactory());
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
index e4e60a5..eadacfa 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaParsersTest.java
@@ -96,12 +96,12 @@
.get();
String text = getStringFromInputStream((InputStream) response.getEntity());
- assertContains("<h2>DefaultParser</h2>", text);
+ assertContains("<h3>DefaultParser</h3>", text);
assertContains("Composite", text);
- assertContains("<h3>OpusParser", text);
- assertContains("<h3>PackageParser", text);
- assertContains("<h3>OOXMLParser", text);
+ assertContains("<h4>OpusParser", text);
+ assertContains("<h4>PackageParser", text);
+ assertContains("<h4>OOXMLParser", text);
assertContains(OpusParser.class.getName(), text);
assertContains(PackageParser.class.getName(), text);
@@ -138,46 +138,51 @@
assertEquals(true, json.containsKey("name"));
assertEquals(true, json.containsKey("composite"));
assertEquals(true, json.containsKey("children"));
- assertEquals("org.apache.tika.parser.DefaultParser", json.get("name"));
+ assertEquals("org.apache.tika.parser.CompositeParser", json.get("name"));
assertEquals(Boolean.TRUE, json.get("composite"));
// At least 20 child parsers which aren't composite, except for CompositeExternalParser
Object[] children = (Object[]) (Object) json.get("children");
- assertTrue(children.length >= 20);
- boolean hasOpus = false, hasOOXML = false, hasPDF = false, hasZip = false;
+ assertTrue(children.length >= 2);
+ boolean hasOpus = false, hasOOXML = false, hasZip = false;
int nonComposite = 0;
int composite = 0;
for (Object o : children) {
- Map<String, Object> d = (Map<String, Object>) o;
- assertEquals(true, d.containsKey("name"));
- assertEquals(true, d.containsKey("composite"));
+ Map<String, Object> child = (Map<String, Object>) o;
+ assertEquals(true, child.containsKey("name"));
+ assertEquals(true, child.containsKey("composite"));
- if (d.get("composite") == Boolean.FALSE)
- nonComposite++;
- else
- composite++;
-
- // Will only have mime types if requested
- if (d.get("composite") == Boolean.FALSE)
- assertEquals(details, d.containsKey("supportedTypes"));
+ Object[] grandChildrenArr = (Object[]) child.get("children");
+ if (grandChildrenArr == null) {
+ continue;
+ }
+ assertTrue(grandChildrenArr.length > 50);
+ for (Object grandChildO : grandChildrenArr) {
+ Map<String, Object> grandChildren = (Map<String, Object>) grandChildO;
- String name = (String) d.get("name");
- if (OpusParser.class.getName().equals(name)) {
- hasOpus = true;
- }
- if (OOXMLParser.class.getName().equals(name)) {
- hasOOXML = true;
- }
- if (PDFParser.class.getName().equals(name)) {
- hasPDF = true;
- }
- if (PackageParser.class.getName().equals(name)) {
- hasZip = true;
+ if (grandChildren.get("composite") == Boolean.FALSE)
+ nonComposite++;
+ else
+ composite++;
+
+ // Will only have mime types if requested
+ if (grandChildren.get("composite") == Boolean.FALSE)
+ assertEquals(details, grandChildren.containsKey("supportedTypes"));
+
+ String name = (String) grandChildren.get("name");
+ if (OpusParser.class.getName().equals(name)) {
+ hasOpus = true;
+ }
+ if (OOXMLParser.class.getName().equals(name)) {
+ hasOOXML = true;
+ }
+ if (PackageParser.class.getName().equals(name)) {
+ hasZip = true;
+ }
}
}
assertEquals(true, hasOpus);
assertEquals(true, hasOOXML);
- assertEquals(true, hasPDF);
assertEquals(true, hasZip);
assertTrue(nonComposite > 20);
assertTrue(composite == 0 || composite == 1); // if CompositeExternalParser is available it will be 1
diff --git a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
index 295ce74..b519170 100644
--- a/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
+++ b/tika-server/src/test/java/org/apache/tika/server/TikaResourceTest.java
@@ -258,6 +258,44 @@
assertEquals(500, response.getStatus());
}
+ //TIKA-2669
+ @Test
+ public void testPDFConfig() throws Exception {
+
+ Response response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+ String responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+ assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+ responseMsg);
+
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .header(TikaResource.X_TIKA_PDF_HEADER_PREFIX+"sortByPosition", "false")
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+ assertEquals("Left column line 1 Left column line 2 Right column line 1 Right column line 2", responseMsg);
+
+ //make sure that default reverts to initial config option
+ response = WebClient.create(endPoint + TIKA_PATH)
+ .type("application/pdf")
+ .accept("text/plain")
+ .put(ClassLoader.getSystemResourceAsStream("testPDFTwoTextBoxes.pdf"));
+ responseMsg = getStringFromInputStream((InputStream) response
+ .getEntity());
+ responseMsg = responseMsg.replaceAll("[\r\n ]+", " ").trim();
+ assertEquals("Left column line 1 Right column line 1 Left colu mn line 2 Right column line 2",
+ responseMsg);
+
+ }
+
+
@Test
public void testExtractTextAcceptPlainText() throws Exception {
//TIKA-2384
diff --git a/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml b/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml
new file mode 100644
index 0000000..8867655
--- /dev/null
+++ b/tika-server/src/test/resources/org/apache/tika/server/tika-config-for-server-tests.xml
@@ -0,0 +1,29 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<properties>
+ <parsers>
+ <parser class="org.apache.tika.parser.DefaultParser">
+ <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+ </parser>
+ <parser class="org.apache.tika.parser.pdf.PDFParser">
+ <params>
+ <param name="sortByPosition" type="bool">true</param>
+ </params>
+ </parser>
+ </parsers>
+</properties>
\ No newline at end of file
diff --git a/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf b/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf
new file mode 100644
index 0000000..f24e9e7
--- /dev/null
+++ b/tika-server/src/test/resources/testPDFTwoTextBoxes.pdf
Binary files differ