Fix for TIKA-2955 filter out invalid HTML characters 0x7F to 0x9F (#285)
diff --git a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
index a200820..c568240 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
@@ -331,5 +331,15 @@
endElement(name);
}
}
+
+ @Override
+ protected boolean isInvalid(int ch) {
+ // Anything the superclass already rejects stays rejected.
+ if (super.isInvalid(ch)) {
+ return true;
+ }
+ // Additionally reject 0x7F through 0x9F: these control chars are invalid in XHTML.
+ return ch >= 0x7F && ch <= 0x9F;
+ }
}
diff --git a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
index 6492b7c..e2ae019 100644
--- a/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
+++ b/tika-core/src/test/java/org/apache/tika/sax/XHTMLContentHandlerTest.java
@@ -157,6 +157,49 @@
assertTrue(toHTMLContentHandler.toString().contains("itemscope"));
}
+
+
+ @Test
+ public void testInvalidControlCharacter0x7F() throws Exception {
+ xhtml.startDocument();
+ xhtml.startElement("menu");
+ xhtml.element("li", "a\u007Fz");
+ xhtml.endElement("menu");
+ xhtml.endDocument();
+
+ String[] words = getRealWords(output.toString());
+ // U+007F (lower bound of the filtered range) must come out as U+FFFD; check length before indexing.
+ assertEquals(1, words.length);
+ assertEquals("a\ufffdz", words[0]);
+ }
+
+ @Test
+ public void testInvalidControlCharacter0x9F() throws Exception {
+ xhtml.startDocument();
+ xhtml.startElement("menu");
+ xhtml.element("li", "a\u009Fz");
+ xhtml.endElement("menu");
+ xhtml.endDocument();
+
+ String[] words = getRealWords(output.toString());
+ // U+009F (upper bound of the filtered range) must come out as U+FFFD; check length before indexing.
+ assertEquals(1, words.length);
+ assertEquals("a\ufffdz", words[0]);
+ }
+
+ @Test
+ public void testInvalidControlCharacter0x93() throws Exception {
+ xhtml.startDocument();
+ xhtml.startElement("menu");
+ xhtml.element("li", "a\u0093z");
+ xhtml.endElement("menu");
+ xhtml.endDocument();
+
+ String[] words = getRealWords(output.toString());
+ // U+0093 (inside the filtered 0x7F-0x9F range) must come out as U+FFFD; check length before indexing.
+ assertEquals(1, words.length);
+ assertEquals("a\ufffdz", words[0]);
+ }
/**
* Return array of non-zerolength words. Splitting on whitespace will get us