Fix for TIKA-2955 filter out invalid HTML characters 0x7F to 0x9F (#285)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
index a200820..c568240 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
@@ -331,5 +331,15 @@
             endElement(name);
         }
     }
+    
+    /**
+     * Treats ch as invalid if the superclass does, or if it falls in
+     * 0x7F..0x9F: these control chars are invalid in XHTML.
+     * @return true when ch is invalid
+     */
+    @Override
+    protected boolean isInvalid(int ch) {
+        return super.isInvalid(ch) || (0x7F <= ch && ch <= 0x9F);
+    }
 
 }
diff --git a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
index 6492b7c..e2ae019 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
@@ -157,6 +157,49 @@
 
         assertTrue(toHTMLContentHandler.toString().contains("itemscope"));
     }
+    
+    
+    /** Element text containing 0x7F should show U+FFFD in its place. */
+    @Test
+    public void testInvalidControlCharacter0x7F() throws Exception {
+        xhtml.startDocument();
+        xhtml.startElement("menu");
+        xhtml.element("li", "a\u007Fz");
+        xhtml.endElement("menu");
+        xhtml.endDocument();
+        String[] words = getRealWords(output.toString());
+        // Check the count before indexing so an empty result fails cleanly.
+        assertEquals(1, words.length);
+        assertEquals("a\ufffdz", words[0]);
+    }
+    
+    /** Element text containing 0x9F should show U+FFFD in its place. */
+    @Test
+    public void testInvalidControlCharacter0x9F() throws Exception {
+        xhtml.startDocument();
+        xhtml.startElement("menu");
+        xhtml.element("li", "a\u009Fz");
+        xhtml.endElement("menu");
+        xhtml.endDocument();
+        String[] words = getRealWords(output.toString());
+        // Check the count before indexing so an empty result fails cleanly.
+        assertEquals(1, words.length);
+        assertEquals("a\ufffdz", words[0]);
+    }
+    
+    /** Element text containing 0x93 should show U+FFFD in its place. */
+    @Test
+    public void testInvalidControlCharacter0x93() throws Exception {
+        xhtml.startDocument();
+        xhtml.startElement("menu");
+        xhtml.element("li", "a\u0093z");
+        xhtml.endElement("menu");
+        xhtml.endDocument();
+        String[] words = getRealWords(output.toString());
+        // Check the count before indexing so an empty result fails cleanly.
+        assertEquals(1, words.length);
+        assertEquals("a\ufffdz", words[0]);
+    }
 
     /**
      * Return array of non-zerolength words. Splitting on whitespace will get us