// tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser.html;

 import static java.nio.charset.StandardCharsets.ISO_8859_1;
 import static java.nio.charset.StandardCharsets.US_ASCII;
 import static java.nio.charset.StandardCharsets.UTF_8;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;

 import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
 import java.io.Writer;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.Callable;
 import java.util.concurrent.CompletionService;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ExecutorCompletionService;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
 import java.util.concurrent.Future;
 import java.util.regex.Pattern;
 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.sax.SAXTransformerFactory;
 import javax.xml.transform.sax.TransformerHandler;
 import javax.xml.transform.stream.StreamResult;

 import org.ccil.cowan.tagsoup.HTMLSchema;
 import org.ccil.cowan.tagsoup.Schema;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.Locator;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;

 import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
 import org.apache.tika.config.ServiceLoader;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.AutoDetectReader;
 import org.apache.tika.detect.EncodingDetector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Geographic;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.LinkContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
 import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;

 public class HtmlParserTest extends TikaTest {

     /**
      * Parses the testHTML.html test document and checks the extracted metadata,
      * the attribute values seen on {@code <a>} elements, and the body text.
      */
     @Test
     public void testParseAscii() throws Exception {
         final StringWriter hrefValues = new StringWriter();
         final StringWriter nameValues = new StringWriter();

         // Records the href of every <a> element, or its name when it has no href.
         ContentHandler anchorRecorder = new DefaultHandler() {
             @Override
             public void startElement(String uri, String localName, String qName,
                                      Attributes attributes) throws SAXException {
                 if (!"a".equals(localName)) {
                     return;
                 }
                 String hrefValue = attributes.getValue("href");
                 if (hrefValue != null) {
                     hrefValues.append(hrefValue);
                     return;
                 }
                 String nameValue = attributes.getValue("name");
                 if (nameValue != null) {
                     nameValues.append(nameValue);
                 }
             }
         };
         ContentHandler textCollector = new BodyContentHandler();
         Metadata extracted = new Metadata();

         try (InputStream html = getResourceAsStream("/test-documents/testHTML.html")) {
             HtmlParser parser = new HtmlParser();
             parser.parse(html, new TeeContentHandler(textCollector, anchorRecorder), extracted,
                     new ParseContext());
         }

         assertEquals("Title : Test Indexation Html", extracted.get(TikaCoreProperties.TITLE));
         assertEquals("Tika Developers", extracted.get("Author"));
         assertEquals("5", extracted.get("refresh"));

         assertEquals("51.2312", extracted.get(Geographic.LATITUDE));
         assertEquals("-5.1987", extracted.get(Geographic.LONGITUDE));

         assertEquals("http://www.apache.org/", hrefValues.toString());
         assertEquals("test-anchor", nameValues.toString());

         String text = textCollector.toString();
         for (String expected : new String[]{"Test Indexation Html", "Indexation du fichier"}) {
             assertTrue("Did not contain expected text:" + expected, text.contains(expected));
         }
     }

     /**
      * Checks that non-ASCII characters are extracted intact from the
      * testXHTML_utf8.html test document. Currently ignored because that
      * document is not available.
      * <p>
      * The expected non-ASCII characters (o-umlaut, a-umlaut, a-ring) are written
      * as Unicode escapes so that they cannot be corrupted by the encoding used to
      * store or compile this source file.
      */
     @Test
     @Ignore("The file 'testXHTML_utf8.html' is not available for testing")
     public void XtestParseUTF8() throws IOException, SAXException, TikaException {
         String path = "/test-documents/testXHTML_utf8.html";
         Metadata metadata = new Metadata();
         String content = new Tika().parseToString(getResourceAsStream(path), metadata);

         // NOTE(review): "Tilte" is kept as found; presumably it mirrors the spelling
         // used in the test document - confirm once the file is available.
         String expectedTitle = "Title : Tilte with UTF-8 chars \u00f6\u00e4\u00e5";
         assertTrue("Did not contain expected text:" + expectedTitle,
                 content.contains(expectedTitle));

         assertTrue("Did not contain expected text:" + "Content with UTF-8 chars",
                 content.contains("Content with UTF-8 chars"));

         String expectedChars = "\u00e5\u00e4\u00f6";
         assertTrue("Did not contain expected text:" + expectedChars,
                 content.contains(expectedChars));
     }

     /**
      * Parses the testXHTML.html test document through the {@link Tika} facade and
      * checks the reported content type, selected metadata values and extracted text.
      */
     @Test
     public void testXhtmlParsing() throws Exception {
         Metadata extracted = new Metadata();
         Tika tika = new Tika();
         String text =
                 tika.parseToString(getResourceAsStream("/test-documents/testXHTML.html"), extracted);

         // Only the prefix is compared: the default charset differs between OS's
         String contentType = extracted.get(Metadata.CONTENT_TYPE);
         assertTrue(contentType.startsWith("application/xhtml+xml; charset="));
         assertEquals("XHTML test document", extracted.get(TikaCoreProperties.TITLE));

         assertEquals("Tika Developers", extracted.get("Author"));
         assertEquals("5", extracted.get("refresh"));

         String[] expectedPhrases =
                 {"ability of Apache Tika", "extract content", "an XHTML document"};
         for (String phrase : expectedPhrases) {
             assertContains(phrase, text);
         }
     }

     /**
      * A zero-length input must yield empty body text.
      */
     @Test
     public void testParseEmpty() throws Exception {
         InputStream emptyInput = new ByteArrayInputStream(new byte[0]);
         ContentHandler textCollector = new BodyContentHandler();

         HtmlParser parser = new HtmlParser();
         parser.parse(emptyInput, textCollector, new Metadata(), new ParseContext());

         assertEquals("", textCollector.toString());
     }

     /**
      * Test case for TIKA-210: text placed directly inside the body element,
      * with no wrapping element, is returned as the extracted content.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-210">TIKA-210</a>
      */
     @Test
     public void testCharactersDirectlyUnderBodyElement() throws Exception {
         byte[] html = "<html><body>test</body></html>".getBytes(UTF_8);
         Tika tika = new Tika();
         String extracted = tika.parseToString(new ByteArrayInputStream(html));
         assertEquals("test", extracted);
     }

     /**
      * Test case for TIKA-287: relative links are resolved against the document's
      * {@code <base href>} value.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-287">TIKA-287</a>
      */
     @Test
     public void testBaseHref() throws Exception {
         // Each row is {expected resolved URL, base href, link as written in the document}.
         String[][] cases = {
                 {"http://lucene.apache.org/tika/", "http://lucene.apache.org/", "tika/"},

                 {"http://domain.com/?pid=1", "http://domain.com", "?pid=1"},
                 {"http://domain.com/?pid=2", "http://domain.com?pid=1", "?pid=2"},

                 {"http://domain.com/file.html", "http://domain.com/path/", "/file.html"},
                 {"http://domain.com/path/file.html", "http://domain.com/path/", "./file.html"},
                 {"http://domain.com/path/file.html", "http://domain.com/path/", "file.html"},

                 {"http://domain2.com/newpath", "http://domain.com/path/to/file",
                         "http://domain2.com/newpath"},

                 // See http://www.communities.hp.com/securitysoftware/blogs/jeff/archive/2007/12/19/RFC-1808-vs-2396-vs-3986_3A00_-Browsers-vs.-programing-languages.aspx
                 // Also http://www.ietf.org/rfc/rfc3986.txt
                 // Also http://issues.apache.org/jira/browse/NUTCH-566
                 // Also http://issues.apache.org/jira/browse/NUTCH-436
                 {"http://domain.com/path/?pid=1", "http://domain.com/path/", "?pid=1"},
                 {"http://domain.com/file?pid=1", "http://domain.com/file", "?pid=1"},
                 {"http://domain.com/path/d;p?pid=1", "http://domain.com/path/d;p?q#f", "?pid=1"}
         };

         for (String[] row : cases) {
             assertRelativeLink(row[0], row[1], row[2]);
         }
     }

     /**
      * Parses a minimal document whose {@code <base href>} is {@code base} and whose
      * single anchor points at {@code relative}, then asserts that exactly one anchor
      * href was reported and that it equals {@code url}.
      *
      * @param url      href value expected on the anchor in the parser output
      * @param base     value written into the {@code <base href>} element
      * @param relative value written into the anchor's {@code href} attribute
      * @throws Exception if parsing fails
      */
     private void assertRelativeLink(String url, String base, String relative) throws Exception {
         StringBuilder html = new StringBuilder();
         html.append("<html><head><base href=\"").append(base).append("\"></head>");
         html.append("<body><a href=\"").append(relative).append("\">test</a></body></html>");

         final List<String> hrefs = new ArrayList<>();
         // Collects the href attribute of every element whose qualified name is "a".
         ContentHandler hrefCollector = new DefaultHandler() {
             @Override
             public void startElement(String uri, String localName, String qName,
                                      Attributes attributes) {
                 if (!qName.equals("a")) {
                     return;
                 }
                 String href = attributes.getValue("", "href");
                 if (href != null) {
                     hrefs.add(href);
                 }
             }
         };

         InputStream input = new ByteArrayInputStream(html.toString().getBytes(UTF_8));
         new HtmlParser().parse(input, hrefCollector, new Metadata(), new ParseContext());

         assertEquals(1, hrefs.size());
         assertEquals(url, hrefs.get(0));
     }

     /**
      * Test case for TIKA-268: the text of adjacent table cells must not be
      * run together in the extracted content.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-268">TIKA-268</a>
      */
     @Test
     public void testWhitespaceBetweenTableCells() throws Exception {
         String html = "<html><body><table><tr><td>a</td><td>b</td></table></body></html>";
         Tika tika = new Tika();
         String text = tika.parseToString(new ByteArrayInputStream(html.getBytes(UTF_8)));

         for (String cell : new String[]{"a", "b"}) {
             assertContains(cell, text);
         }
         // Both cells are present, but never directly next to each other
         assertFalse(text.contains("ab"));
     }

     /**
      * Test case for TIKA-332: the charset named in a
      * {@code <meta http-equiv="content-type">} element is reported as the
      * content encoding.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-332">TIKA-332</a>
      */
     @Test
     public void testHttpEquivCharset() throws Exception {
         String html = "<html><head>" +
                 "<meta http-equiv=\"content-type\" content=\"text/html; charset=ISO-8859-1\" />" +
                 "<title>the name is \u00e1ndre</title>" +
                 "</head><body></body></html>";
         InputStream input = new ByteArrayInputStream(html.getBytes(ISO_8859_1));

         Metadata extracted = new Metadata();
         HtmlParser parser = new HtmlParser();
         parser.parse(input, new BodyContentHandler(), extracted, new ParseContext());

         assertEquals("ISO-8859-1", extracted.get(Metadata.CONTENT_ENCODING));
     }

     /**
      * Test case for TIKA-892: the charset named in an HTML5-style
      * {@code <meta charset="...">} element is reported as the content encoding.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-892">TIKA-892</a>
      */
     @Test
     public void testHtml5Charset() throws Exception {
         String html = "<html><head>" +
                 "<meta charset=\"ISO-8859-15\" />" +
                 "<title>the name is \u00e1ndre</title>" +
                 "</head><body></body></html>";
         InputStream input = new ByteArrayInputStream(html.getBytes(ISO_8859_1));

         Metadata extracted = new Metadata();
         HtmlParser parser = new HtmlParser();
         parser.parse(input, new BodyContentHandler(), extracted, new ParseContext());

         assertEquals("ISO-8859-15", extracted.get(Metadata.CONTENT_ENCODING));
     }

     /**
      * Test case for TIKA-334: a UTF-8 encoded document that carries no charset
      * declaration must still have its non-ASCII title decoded correctly.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-334">TIKA-334</a>
      */
     @Test
     public void testDetectOfCharset() throws Exception {
         String html = "<html><head><title>\u017d</title></head><body></body></html>";
         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));

         Metadata extracted = new Metadata();
         HtmlParser parser = new HtmlParser();
         parser.parse(input, new BodyContentHandler(), extracted, new ParseContext());

         assertEquals("\u017d", extracted.get(TikaCoreProperties.TITLE));
     }

     /**
      * Test case for TIKA-341: with no hint the UTF-8 bytes are reported as UTF-8,
      * while a charset supplied through the Content-Type metadata entry is the
      * one reported for ISO-8859-1 bytes.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
      */
     @Test
     public void testUsingCharsetInContentTypeHeader() throws Exception {
         final String html =
                 "<html><head><title>the name is \u00e1ndre</title></head><body></body></html>";

         // First pass: no Content-Type hint in the metadata
         Metadata withoutHint = new Metadata();
         InputStream utf8Input = new ByteArrayInputStream(html.getBytes(UTF_8));
         new HtmlParser().parse(utf8Input, new BodyContentHandler(), withoutHint,
                 new ParseContext());
         assertEquals("UTF-8", withoutHint.get(Metadata.CONTENT_ENCODING));

         // Second pass: charset announced through the Content-Type metadata entry
         Metadata withHint = new Metadata();
         withHint.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
         InputStream latin1Input = new ByteArrayInputStream(html.getBytes(ISO_8859_1));
         new HtmlParser().parse(latin1Input, new BodyContentHandler(), withHint,
                 new ParseContext());
         assertEquals("ISO-8859-1", withHint.get(Metadata.CONTENT_ENCODING));
     }

     /**
      * Test case for HTML content like
      * "&lt;div&gt;foo&lt;br&gt;bar&lt;/div&gt;baz" that should result
      * in three whitespace-separated tokens "foo", "bar" and "baz" instead
      * of a single token "foobarbaz".
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-343">TIKA-343</a>
      */
     @Test
     public void testLineBreak() throws Exception {
         byte[] html = "<html><body><div>foo<br>bar</div>baz</body></html>".getBytes(US_ASCII);
         String extracted = new Tika().parseToString(new ByteArrayInputStream(html));

         String[] expectedTokens = {"foo", "bar", "baz"};
         String[] actualTokens = extracted.trim().split("\\s+");

         assertEquals(expectedTokens.length, actualTokens.length);
         for (int i = 0; i < expectedTokens.length; i++) {
             assertEquals(expectedTokens[i], actualTokens[i]);
         }
     }

     /**
      * Test case for TIKA-339: Don't use language returned by CharsetDetector.
      * A Content-Language value that is already present in the metadata must
      * still be there, unchanged, after parsing.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-339">TIKA-339</a>
      */
     @Test
     public void testIgnoreCharsetDetectorLanguage() throws Exception {
         Metadata extracted = new Metadata();
         extracted.add(Metadata.CONTENT_LANGUAGE, "en");

         String html = "<html><title>Simple Content</title><body></body></html>";
         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
         HtmlParser parser = new HtmlParser();
         parser.parse(input, new BodyContentHandler(), extracted, new ParseContext());

         assertEquals("en", extracted.get(Metadata.CONTENT_LANGUAGE));
     }

     /**
      * Test case for TIKA-349: malformed http-equiv content values (a repeated
      * charset parameter, or ';;' used as the separator) must still yield the
      * declared charset as the content encoding.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-349">TIKA-349</a>
      */
     @Test
     public void testHttpEquivCharsetFunkyAttributes() throws Exception {
         String metaOpen = "<html><head><meta http-equiv=\"content-type\"";
         String remainder =
                 "<title>the name is \u00e1ndre</title>" + "</head><body></body></html>";

         String[] documents = {
                 // charset parameter given twice, in different letter case
                 metaOpen + " content=\"text/html; charset=ISO-8859-15; charset=iso-8859-15\" />" +
                         remainder,
                 // Some HTML pages have errors like ';;' versus '; ' as separator
                 metaOpen + " content=\"text/html;;charset=ISO-8859-15\" />" + remainder
         };

         for (String html : documents) {
             Metadata extracted = new Metadata();
             InputStream input = new ByteArrayInputStream(html.getBytes(ISO_8859_1));
             new HtmlParser().parse(input, new BodyContentHandler(), extracted,
                     new ParseContext());
             assertEquals("ISO-8859-15", extracted.get(Metadata.CONTENT_ENCODING));
         }
     }

     /**
      * Test case for TIKA-350: a Content-Type metadata value with its parts in
      * an unusual order ("charset=...;text/html") must still supply the
      * content encoding.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-350">TIKA-350</a>
      */
     @Test
     public void testUsingFunkyCharsetInContentTypeHeader() throws Exception {
         final String html =
                 "<html><head><title>the name is \u00e1ndre</title></head><body></body></html>";

         // First pass: no Content-Type hint in the metadata
         Metadata withoutHint = new Metadata();
         InputStream utf8Input = new ByteArrayInputStream(html.getBytes(UTF_8));
         new HtmlParser().parse(utf8Input, new BodyContentHandler(), withoutHint,
                 new ParseContext());
         assertEquals("UTF-8", withoutHint.get(Metadata.CONTENT_ENCODING));

         // Second pass: charset parameter placed before the media type
         Metadata withHint = new Metadata();
         withHint.set(Metadata.CONTENT_TYPE, "charset=ISO-8859-1;text/html");
         InputStream latin1Input = new ByteArrayInputStream(html.getBytes(ISO_8859_1));
         new HtmlParser().parse(latin1Input, new BodyContentHandler(), withHint,
                 new ParseContext());
         assertEquals("ISO-8859-1", withHint.get(Metadata.CONTENT_ENCODING));
     }


     /**
      * Test case for TIKA-357: the big-preamble.html test document must be
      * reported with the windows-1251 content encoding.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-357">TIKA-357</a>
      */
     @Test
     public void testMetaHttpEquivWithLotsOfPreambleText() throws Exception {
         String path = "/test-documents/big-preamble.html";
         Metadata metadata = new Metadata();
         // The caller owns the stream passed to parse(), so close it here
         try (InputStream stream = getResourceAsStream(path)) {
             new HtmlParser().parse(stream, new BodyContentHandler(), metadata,
                     new ParseContext());
         }

         assertEquals("windows-1251", metadata.get(Metadata.CONTENT_ENCODING));
     }

     /**
      * Test case for TIKA-420: when the body handler is wrapped in a
      * {@link BoilerpipeContentHandler}, only the main text of boilerplate.html
      * reaches it; the words "boilerplate" and "footer" must not appear.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-420">TIKA-420</a>
      */
     @Test
     public void testBoilerplateRemoval() throws Exception {
         String path = "/test-documents/boilerplate.html";

         Metadata metadata = new Metadata();
         BodyContentHandler handler = new BodyContentHandler();
         // The caller owns the stream passed to parse(), so close it here
         try (InputStream stream = getResourceAsStream(path)) {
             new HtmlParser().parse(stream, new BoilerpipeContentHandler(handler), metadata,
                     new ParseContext());
         }

         String content = handler.toString();
         assertTrue(content.startsWith("This is the real meat"));
         assertTrue(content.endsWith("This is the end of the text.\n"));
         assertFalse(content.contains("boilerplate"));
         assertFalse(content.contains("footer"));
     }

     /**
      * Test case for TIKA-478. Don't emit {@code <head>} sub-elements inside of
      * {@code <body>}.
      * <p>
      * The parser output is serialized back to markup and checked with regular
      * expressions; {@code (?s)} lets {@code .} match line terminators as well.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-478">TIKA-478</a>
      */
     @Test
     public void testElementOrdering() throws Exception {
         final String test = "<html><head><title>Title</title>" +
                 "<meta http-equiv=\"content-type\" content=\"text/html\">" +
                 "<link rel=\"next\" href=\"next.html\" />" +
                 "</head><body><p>Simple Content</p></body></html>";

         StringWriter sw = new StringWriter();
         new HtmlParser()
                 .parse(new ByteArrayInputStream(test.getBytes(UTF_8)), makeHtmlTransformer(sw),
                         new Metadata(), new ParseContext());

         String result = sw.toString();

         // Title element in <head> section
         assertTrue(Pattern.matches("(?s)<html.*<head>.*<title>Title</title>.*</head>.*$", result));

         // No meta elements in body. The pattern used to read "<meta. *</body>", which
         // only matched "<meta" + one character + spaces directly before "</body>" and
         // so could practically never detect a misplaced meta element.
         assertFalse(Pattern.matches("(?s).*<body>.*<meta .*</body>.*$", result));

         // meta elements should show up in <head> section
         assertTrue(Pattern.matches("(?s)<html.*<head>.*<meta .*</head>.*$", result));

         // No link elements in body
         assertFalse(Pattern.matches("(?s).*<body>.*<link .*</body>.*$", result));

         // link element should be in <head> section
         assertTrue(Pattern.matches("(?s)<html.*<head>.*<link .*</head>.*$", result));

         // There should be ending elements.
         assertTrue(Pattern.matches("(?s).*</body>.*</html>$", result));

     }

     /**
      * Test case for TIKA-463. Don't skip elements that have URLs: the
      * {@code src} of an {@code <img>} must be emitted, resolved against the
      * base href.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
      */
     @Test
     public void testImgUrlExtraction() throws Exception {
         String head = "<html><head><title>Title</title>" + "<base href=\"http://domain.com\" />";
         String html = head + "</head><body><img src=\"image.jpg\" /></body></html>";

         StringWriter markup = new StringWriter();
         ContentHandler serializer = makeHtmlTransformer(markup);
         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
         new HtmlParser().parse(input, serializer, new Metadata(), new ParseContext());

         String serialized = markup.toString();

         // <img> tag should exist, with fully resolved URL
         Pattern resolvedSrc = Pattern.compile("(?s).*src=\"http://domain.com/image.jpg\".*$");
         assertTrue(resolvedSrc.matcher(serialized).matches());
     }

     /**
      * Test case for TIKA-463. Don't skip elements that have URLs: the
      * {@code src} of a {@code <frame>} must be emitted, resolved against the
      * base href.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
      */
     @Test
     public void testFrameSrcExtraction() throws Exception {
         String head = "<html><head><title>Title</title>" + "<base href=\"http://domain.com\" />";
         String html = head + "</head><frameset><frame src=\"frame.html\" /></frameset></html>";

         StringWriter markup = new StringWriter();
         ContentHandler serializer = makeHtmlTransformer(markup);
         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
         new HtmlParser().parse(input, serializer, new Metadata(), new ParseContext());

         String serialized = markup.toString();

         // <frame> tag should exist, with fully resolved URL
         Pattern resolvedFrame =
                 Pattern.compile("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$");
         assertTrue(resolvedFrame.matcher(serialized).matches());
     }

     /**
      * Test case for TIKA-463. Don't skip elements that have URLs: the
      * {@code src} of an {@code <iframe>} must be emitted, resolved against the
      * base href.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
      */
     @Test
     public void testIFrameSrcExtraction() throws Exception {
         String head = "<html><head><title>Title</title>" + "<base href=\"http://domain.com\" />";
         String html = head +
                 "</head><body><iframe src =\"framed.html\" width=\"100%\" height=\"300\">" +
                 "<p>Your browser doesn't support iframes!</p></body></html>";

         StringWriter markup = new StringWriter();
         ContentHandler serializer = makeHtmlTransformer(markup);
         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
         new HtmlParser().parse(input, serializer, new Metadata(), new ParseContext());

         String serialized = markup.toString();

         // <iframe> tag should exist, with fully resolved URL
         Pattern resolvedIframe =
                 Pattern.compile("(?s).*<iframe .* src=\"http://domain.com/framed.html\".*$");
         assertTrue(resolvedIframe.matcher(serialized).matches());
     }

     /**
      * Test case for TIKA-463. Don't skip elements that have URLs: an
      * {@code <area>} inside a {@code <map>} must be emitted with its
      * {@code href} resolved against the base href.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
      */
     @Test
     public void testAreaExtraction() throws Exception {
         String head = "<html><head><title>Title</title>" + "<base href=\"http://domain.com\" />";
         String html = head +
                 "</head><body><p><map name=\"map\" id=\"map\">" +
                 "<area shape=\"rect\" href=\"map.html\" alt=\"\" />" +
                 "</map></p></body></html>";

         StringWriter markup = new StringWriter();
         ContentHandler serializer = makeHtmlTransformer(markup);
         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
         new HtmlParser().parse(input, serializer, new Metadata(), new ParseContext());

         String serialized = markup.toString();

         // <map> tag should exist, with <area> tag with fully resolved URL
         Pattern resolvedArea = Pattern.compile(
                 "(?s).*<map .*<area .* href=\"http://domain.com/map.html\".*</map>.*$");
         assertTrue(resolvedArea.matcher(serialized).matches());
     }

     /**
      * Test case for TIKA-463. Don't skip elements that have URLs: an
      * {@code <object>} must be emitted with its {@code data} URL resolved
      * against the base href, together with its {@code <param>} child.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
      */
     @Test
     public void testObjectExtraction() throws Exception {
         String head = "<html><head><title>Title</title>" + "<base href=\"http://domain.com\" />";
         String html = head +
                 "</head><body><p><object data=\"object.data\" type=\"text/html\">" +
                 "<param name=\"name\" value=\"value\" />" +
                 "</object></p></body></html>";

         StringWriter markup = new StringWriter();
         ContentHandler serializer = makeHtmlTransformer(markup);
         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
         new HtmlParser().parse(input, serializer, new Metadata(), new ParseContext());

         String serialized = markup.toString();

         // <object> tag should exist with fully resolved URLs
         Pattern resolvedObject = Pattern.compile(
                 "(?s).*<object data=\"http://domain.com/object.data\".*<param .* name=\"name\" value=\"value\"/>.*</object>.*$");
         assertTrue("<object> tag not correctly found in:\n" + serialized,
                 resolvedObject.matcher(serialized).matches());
     }

     /**
      * Test case for change related to TIKA-463. Verify proper handling of
      * {@code <meta>} tags: a metadata entry with a value is emitted as a
      * {@code <meta>} element, an entry added with a null value is not.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-463">TIKA-463</a>
      */
     @Test
     public void testMetaTagHandling() throws Exception {
         Metadata supplied = new Metadata();
         supplied.add("Content-Type", "text/html; charset=utf-8");
         supplied.add("Language", null);

         String html = "<html><body><h1>header</h1><p>some text</p></body></html>";
         StringWriter markup = new StringWriter();
         ContentHandler serializer = makeHtmlTransformer(markup);
         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
         new HtmlParser().parse(input, serializer, supplied, new ParseContext());

         String serialized = markup.toString();

         // <meta> tag for Content-Type should exist, but nothing for Language
         Pattern contentTypeMeta = Pattern.compile(
                 "(?s).*<meta name=\"Content-Type\" content=\"text/html; charset=UTF-8\"/>.*$");
         Pattern languageMeta = Pattern.compile("(?s).*<meta name=\"Language\".*$");
         assertTrue(contentTypeMeta.matcher(serialized).matches());
         assertFalse(languageMeta.matcher(serialized).matches());
     }

     /**
      * Test case for TIKA-457. Better handling for broken HTML that has
      * {@code <frameset>} inside of {@code <body>}: the frames must be emitted
      * and no {@code <body>} tag may appear in the output.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-457">TIKA-457</a>
      */
     @Test
     public void testBrokenFrameset() throws Exception {
         Pattern bodyTag = Pattern.compile("(?s).*<body>.*$");

         // Case 1: a single frame, with a base href to resolve against
         String withBase =
                 "<html><head><title>Title</title>" + "<base href=\"http://domain.com\" />" +
                         "</head><body><frameset><frame src=\"frame.html\" />" +
                         "</frameset></body></html>";

         StringWriter firstMarkup = new StringWriter();
         ContentHandler firstSerializer = makeHtmlTransformer(firstMarkup);
         InputStream firstInput = new ByteArrayInputStream(withBase.getBytes(UTF_8));
         new HtmlParser().parse(firstInput, firstSerializer, new Metadata(), new ParseContext());

         String serialized = firstMarkup.toString();

         // <frame> tag should exist, with fully resolved URL
         Pattern resolvedFrame =
                 Pattern.compile("(?s).*<frame .* src=\"http://domain.com/frame.html\"/>.*$");
         assertTrue(resolvedFrame.matcher(serialized).matches());

         // <body> tag should not exist.
         assertFalse(bodyTag.matcher(serialized).matches());

         // Case 2: the example from the Nutch project.
         String nutchExample = "<html><head><title> my title </title></head><body>" +
                 "<frameset rows=\"20,*\"><frame src=\"top.html\"></frame>" +
                 "<frameset cols=\"20,*\"><frame src=\"left.html\"></frame>" +
                 "<frame src=\"invalid.html\"/></frame>" + "<frame src=\"right.html\"></frame>" +
                 "</frameset></frameset></body></html>";

         StringWriter secondMarkup = new StringWriter();
         ContentHandler secondSerializer = makeHtmlTransformer(secondMarkup);
         InputStream secondInput = new ByteArrayInputStream(nutchExample.getBytes(UTF_8));
         new HtmlParser().parse(secondInput, secondSerializer, new Metadata(),
                 new ParseContext());

         serialized = secondMarkup.toString();

         // <frame> tags should exist, with relative URL (no base element specified)
         String[] frameSources = {"top.html", "left.html", "invalid.html", "right.html"};
         for (String source : frameSources) {
             String frameRegex = "(?s).*<frame .* src=\"" + source + "\"/>.*$";
             assertTrue(Pattern.compile(frameRegex).matcher(serialized).matches());
         }

         // <body> tag should not exist.
         assertFalse(bodyTag.matcher(serialized).matches());
     }

     /**
      * Test case for TIKA-480: fix NPE when using BodyContentHandler or HtmlTransformer
      * as delegate for BoilerpipeContentHandler
      * <p>
      * Parses boilerplate.html into the HTML-serializing handler and checks that
      * the html, head, title and body elements are present in the output.
      * <p>
      * NOTE(review): despite the description above, this test hands the
      * serializing handler straight to the parser; no BoilerpipeContentHandler
      * is involved. Confirm whether that is intended.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-480">TIKA-480</a>
      */
     @Test
     public void testBoilerplateDelegation() throws Exception {
         String path = "/test-documents/boilerplate.html";

         Metadata metadata = new Metadata();
         StringWriter sw = new StringWriter();
         // The caller owns the stream passed to parse(), so close it here
         try (InputStream stream = getResourceAsStream(path)) {
             new HtmlParser().parse(stream, makeHtmlTransformer(sw), metadata,
                     new ParseContext());
         }

         String content = sw.toString();

         // Should have <html>, <head>, <title>, <body> elements
         assertTrue(Pattern.matches(
                 "(?s).*<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\">.*</html>.*$",
                 content));
         assertTrue(Pattern.matches("(?s).*<head>.*</head>.*$", content));
         assertTrue(Pattern.matches("(?s).*<title>Title</title>.*$", content));
         assertTrue(Pattern.matches("(?s).*<body>.*</body>.*$", content));
     }

     /**
      * Test case for TIKA-481. Verify href in {@code <link>} is resolved
      * against the base href and that the element stays inside {@code <head>}.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-481">TIKA-481</a>
      */
     @Test
     public void testLinkHrefResolution() throws Exception {
         String head = "<html><head><title>Title</title>" + "<base href=\"http://domain.com\" />";
         String html =
                 head + "<link rel=\"next\" href=\"next.html\" />" + "</head><body></body></html>";

         StringWriter markup = new StringWriter();
         ContentHandler serializer = makeHtmlTransformer(markup);
         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
         new HtmlParser().parse(input, serializer, new Metadata(), new ParseContext());

         String serialized = markup.toString();

         // <link> tag should exist in <head>, with fully resolved URL
         Pattern resolvedLink = Pattern.compile(
                 "(?s).*<head>.*<link rel=\"next\" href=\"http://domain.com/next.html\"/>.*</head>.*$");
         assertTrue(resolvedLink.matcher(serialized).matches());
     }


     /**
      * Builds a ContentHandler that turns the SAX events it receives into textual
      * HTML and writes that text to {@code writer} (typically a StringWriter).
      * Output is produced with the "html" method, without indentation, using the
      * "utf-8" encoding.
      *
      * @param writer destination for the generated HTML text
      * @return ContentHandler suitable for passing to parse() methods
      * @throws Exception if the transformer handler cannot be created or configured
      */
     private ContentHandler makeHtmlTransformer(Writer writer) throws Exception {
         SAXTransformerFactory saxFactory =
                 (SAXTransformerFactory) SAXTransformerFactory.newInstance();
         TransformerHandler serializer = saxFactory.newTransformerHandler();

         // Output properties are set before the result is attached, as in the
         // original ordering of these calls.
         javax.xml.transform.Transformer settings = serializer.getTransformer();
         settings.setOutputProperty(OutputKeys.METHOD, "html");
         settings.setOutputProperty(OutputKeys.INDENT, "no");
         settings.setOutputProperty(OutputKeys.ENCODING, "utf-8");

         serializer.setResult(new StreamResult(writer));
         return serializer;
     }

     /**
      * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
      * Parses a boilerplate-heavy page with markup passthrough enabled and checks the
      * serialized output for the expected elements and the absence of boilerplate text.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
      */
     @Test
     public void testBoilerplateWithMarkup() throws Exception {
         String path = "/test-documents/boilerplate.html";

         StringWriter sw = new StringWriter();
         BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(makeHtmlTransformer(sw));
         bpch.setIncludeMarkup(true);

         // The caller owns the stream: Parser.parse() consumes but does not close it.
         try (InputStream stream = getResourceAsStream(path)) {
             new HtmlParser().parse(stream, bpch, new Metadata(), new ParseContext());
         }

         String content = sw.toString();
         assertTrue("Has empty table elements",
                 content.contains("<body><table><tr><td><table><tr><td>"));
         assertTrue("Has empty a element",
                 content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
         assertTrue("Has real content", content.contains("<p>This is the real meat"));
         assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
         assertFalse("Boilerplate text leaked into output", content.contains("boilerplate"));
         assertFalse("Footer text leaked into output", content.contains("footer"));
     }

     /**
      * Test case for TIKA-434 - Pushback buffer overflow in TagSoup.
      * Passes when the document parses and yields non-null text.
      */
     @Test
     public void testPushback() throws IOException, TikaException {
         InputStream stream = getResourceAsStream("/test-documents/tika434.html");
         String extracted = new Tika().parseToString(stream, new Metadata());
         assertNotNull(extracted);
     }

     /**
      * Test case for TIKA-869: IdentityHtmlMapper needs to lower-case tag names.
      * Parses with the identity mapper installed and checks the serialized body element.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-869">TIKA-869</a>
      */
     @Test
     public void testIdentityMapper() throws Exception {
         final String markup = "<html><head><title>Title</title></head>" + "<body></body></html>";

         ParseContext identityContext = new ParseContext();
         identityContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);

         StringWriter output = new StringWriter();
         InputStream input = new ByteArrayInputStream(markup.getBytes(UTF_8));
         new HtmlParser().parse(input, makeHtmlTransformer(output), new Metadata(),
                 identityContext);

         // Make sure we don't get <body><BODY/></body>
         String serialized = output.toString();
         assertTrue(Pattern.matches("(?s).*<body/>.*$", serialized));
     }

     /**
      * Test case for TIKA-889: XHTMLContentHandler won't emit a newline when the
      * html element matches the ENDLINE set.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-889">TIKA-889</a>
      */
     @Test
     public void testNewlineAndIndent() throws Exception {
         final String markup = "<html><head><title>Title</title></head>" +
                 "<body><ul><li>one</li></ul></body></html>";

         BodyContentHandler bodyText = new BodyContentHandler();
         InputStream input = new ByteArrayInputStream(markup.getBytes(UTF_8));
         new HtmlParser().parse(input, bodyText, new Metadata(), new ParseContext());

         // The body text must be exactly: <tab>, "one", newline, newline
         boolean asExpected = Pattern.matches("\tone\n\n", bodyText.toString());
         assertTrue(asExpected);
     }

     /**
      * Test case for TIKA-2100: the {@code lang} attribute of {@code <html>} must show
      * up both in the metadata and in the serialized output.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-2100">TIKA-2100</a>
      */
     @Test
     public void testHtmlLanguage() throws Exception {
         final String markup = "<html lang=\"fr\"></html>";

         Metadata metadata = new Metadata();
         StringWriter output = new StringWriter();
         InputStream input = new ByteArrayInputStream(markup.getBytes(UTF_8));
         new HtmlParser().parse(input, makeHtmlTransformer(output), metadata,
                 new ParseContext());

         assertEquals("fr", metadata.get(Metadata.CONTENT_LANGUAGE));

         boolean langAttributeKept =
                 Pattern.matches("(?s)<html[^>]* lang=\"fr\".*", output.toString());
         assertTrue("Missing HTML lang attribute", langAttributeKept);
     }

     /**
      * Test case for TIKA-961: whitespace between elements must survive when the
      * boilerplate handler passes markup through to a text handler.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-961">TIKA-961</a>
      */
     @Test
     public void testBoilerplateWhitespace() throws Exception {
         String path = "/test-documents/boilerplate-whitespace.html";

         BodyContentHandler handler = new BodyContentHandler();
         BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
         bpHandler.setIncludeMarkup(true);

         // The caller owns the stream: Parser.parse() consumes but does not close it.
         try (InputStream stream = getResourceAsStream(path)) {
             new HtmlParser().parse(stream, bpHandler, new Metadata(), new ParseContext());
         }

         String content = handler.toString();

         // Should not contain item_aitem_b
         assertFalse(content.contains("item_aitem_b"));

         // Should contain the two list items with a newline in between.
         assertContains("item_a\nitem_b", content);

         // Should contain 有什么需要我帮你的 (can i help you) without whitespace
         assertContains("有什么需要我帮你的", content);
     }

     /**
      * Test case for TIKA-2683: the boilerplate handler must neither drop spaces that
      * are present in the HTML nor insert newlines that are not.
      *
      * @see <a href="https://issues.apache.org/jira/projects/TIKA/issues/TIKA-2683">TIKA-2683</a>
      */
     @Test
     public void testBoilerplateMissingWhitespace() throws Exception {
         String path = "/test-documents/testBoilerplateMissingSpace.html";

         BodyContentHandler handler = new BodyContentHandler();
         BoilerpipeContentHandler bpHandler = new BoilerpipeContentHandler(handler);
         bpHandler.setIncludeMarkup(true);

         // The caller owns the stream: Parser.parse() consumes but does not close it.
         try (InputStream stream = getResourceAsStream(path)) {
             new HtmlParser().parse(stream, bpHandler, new Metadata(), new ParseContext());
         }

         String content = handler.toString();

         // Should contain the space between these two words, as written in the HTML
         assertContains("family Psychrolutidae", content);

         // Shouldn't add new-line chars around brackets; that is not how the HTML looks
         assertContains("(Psychrolutes marcidus)", content);
     }

     /**
      * Test case for TIKA-983: the HTML parser should copy Open Graph meta tag
      * data into the Metadata it returns, keeping repeated properties multi-valued.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-983">TIKA-983</a>
      */
     @Test
     public void testOpenGraphMetadata() throws Exception {
         String markup = "<html><head><meta property=\"og:description\"" +
                 " content=\"some description\" />" +
                 "<meta property=\"og:image\" content=\"http://example.com/image1.jpg\" />" +
                 "<meta property=\"og:image\" content=\"http://example.com/image2.jpg\" />" +
                 "<title>hello</title>" + "</head><body></body></html>";

         Metadata openGraph = new Metadata();
         InputStream input = new ByteArrayInputStream(markup.getBytes(ISO_8859_1));
         new HtmlParser().parse(input, new BodyContentHandler(), openGraph, new ParseContext());

         assertEquals("some description", openGraph.get("og:description"));
         // two og:image tags were supplied, so the property must hold more than one value
         assertTrue(openGraph.isMultiValued("og:image"));
     }

     // TIKA-1011: the document must parse to non-null text
     @Test
     public void testUserDefinedCharset() throws Exception {
         InputStream stream = getResourceAsStream("/test-documents/testUserDefinedCharset.mhtml");
         String extracted = new Tika().parseToString(stream, new Metadata());
         assertNotNull(extracted);
     }

     // TIKA-1001: all four noisy-meta-encoding samples must decode to the expected text
     @Test
     public void testNoisyMetaCharsetHeaders() throws Exception {
         final String expectedSnippet = "\u0623\u0639\u0631\u0628";
         Tika facade = new Tika();

         int sample = 1;
         while (sample <= 4) {
             String resource = "/test-documents/testHTMLNoisyMetaEncoding_" + sample + ".html";
             String extracted = facade.parseToString(getResourceAsStream(resource));
             assertTrue("testing: " + resource, extracted.contains(expectedSnippet));
             sample++;
         }
     }

     /**
      * TIKA-1193: a custom {@link Schema} placed in the ParseContext must be honoured.
      * The same document is parsed twice: once with the default schema, once with
      * one that lets tables nest inside anchors.
      */
     @Test
     public void testCustomHtmlSchema() throws Exception {
         // Default schema does not allow tables inside anchors
         String test = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
         byte[] markup = test.getBytes(ISO_8859_1);
         Metadata metadata = new Metadata();

         LinkContentHandler defaultSchemaLinks = new LinkContentHandler();
         new HtmlParser().parse(new ByteArrayInputStream(markup), defaultSchemaLinks, metadata,
                 new ParseContext());

         // Expect no anchor text
         assertEquals("", defaultSchemaLinks.getLinks().get(0).getText());

         // We'll change the schema to allow tables inside anchors!
         Schema permissive = new HTMLSchema();
         permissive.elementType("a", HTMLSchema.M_ANY, 65535, 0);
         ParseContext customContext = new ParseContext();
         customContext.set(Schema.class, permissive);

         LinkContentHandler customSchemaLinks = new LinkContentHandler();
         new HtmlParser().parse(new ByteArrayInputStream(markup), customSchemaLinks, metadata,
                 customContext);

         // Expect anchor text
         assertEquals("\ttext\n\n", customSchemaLinks.getLinks().get(0).getText());
     }

     /**
      * Test case for TIKA-820:  Locator is unset for HTML parser.
      * Records the locator position reported while a known piece of text is being
      * delivered and checks it against the expected line and (approximate) column.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-820">TIKA-820</a>
      */
     @Test
     public void testLocator() throws Exception {
         // indexes into textPosition
         final int line = 0;
         final int col = 1;
         final int[] textPosition = new int[2];

         // Handler that ignores everything except the locator and the text of interest.
         ContentHandler positionRecorder = new ContentHandler() {
             private Locator locator;

             @Override
             public void setDocumentLocator(Locator locator) {
                 this.locator = locator;
             }

             @Override
             public void startDocument() throws SAXException {
             }

             @Override
             public void endDocument() throws SAXException {
             }

             @Override
             public void startPrefixMapping(String prefix, String uri) throws SAXException {
             }

             @Override
             public void endPrefixMapping(String prefix) throws SAXException {
             }

             @Override
             public void startElement(String uri, String localName, String qName,
                                      Attributes atts) throws SAXException {
             }

             @Override
             public void endElement(String uri, String localName, String qName)
                     throws SAXException {
             }

             @Override
             public void characters(char[] ch, int start, int length) throws SAXException {
                 String text = new String(ch, start, length);
                 if (text.equals("Test Indexation Html") && locator != null) {
                     textPosition[line] = locator.getLineNumber();
                     textPosition[col] = locator.getColumnNumber();
                 }
             }

             @Override
             public void ignorableWhitespace(char[] ch, int start, int length)
                     throws SAXException {
             }

             @Override
             public void processingInstruction(String target, String data)
                     throws SAXException {
             }

             @Override
             public void skippedEntity(String name) throws SAXException {
             }
         };

         // The caller owns the stream: Parser.parse() consumes but does not close it.
         try (InputStream stream = getResourceAsStream("/test-documents/testHTML.html")) {
             new HtmlParser().parse(stream, positionRecorder, new Metadata(),
                     new ParseContext());
         }

         // The text occurs at line 24 (if lines start at 0) or 25 (if lines start at 1).
         assertEquals(24, textPosition[line]);
         // The column reported seems fuzzy, just test it is close enough.
         assertTrue(Math.abs(textPosition[col] - 47) < 10);
     }


     /**
      * Test case for TIKA-1303: the HTML parser should use the first title tag to set
      * the title in the metadata and ignore any later title tags in the document.
      *
      * @see <a href="https://issues.apache.org/jira/browse/TIKA-1303">TIKA-1303</a>
      */
     @Test
     public void testFirstTitleValueisSetToMetadata() throws Exception {
         String markup = "<html><title>Simple Content</title><body><h1></h1>" +
                 "<title>TitleToIgnore</title></body></html>";

         Metadata parsed = new Metadata();
         InputStream input = new ByteArrayInputStream(markup.getBytes(UTF_8));
         new HtmlParser().parse(input, new BodyContentHandler(), parsed, new ParseContext());

         // Only the first title may end up in the metadata; the second one is ignored.
         String title = parsed.get(TikaCoreProperties.TITLE);
         assertEquals("Simple Content", title);
     }

     /**
      * TIKA-1519: a misleading http-equiv content-type must be recorded as the
      * content-type hint only; the reported content type stays text/html.
      */
     @Test
     public void testMisleadingMetaContentTypeTags() throws Exception {
         // documents[i] is expected to produce expectedHints[i]
         String[] documents = {
                 // bogus charset
                 "<html><head><meta http-equiv=\"content-type\" content=\"text/html;" +
                         " charset=UTF-ELEVEN\">" +
                         "</head><title>title</title><body>body</body></html>",
                 // wrong media type
                 "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\">" +
                         "</head><title>title</title><body>body</body></html>",
                 // two content values
                 "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\" " +
                         "content=\"application/ms-word\">" +
                         "</head><title>title</title><body>body</body></html>"
         };
         String[] expectedHints = {
                 "text/html; charset=UTF-ELEVEN",
                 "application/pdf",
                 "application/ms-word"
         };

         for (int i = 0; i < documents.length; i++) {
             Metadata metadata = new Metadata();
             InputStream input = new ByteArrayInputStream(documents[i].getBytes(UTF_8));
             new HtmlParser().parse(input, new BodyContentHandler(), metadata,
                     new ParseContext());

             assertEquals(expectedHints[i], metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
             assertEquals("text/html; charset=ISO-8859-1", metadata.get(Metadata.CONTENT_TYPE));
         }
     }

     /**
      * Parses two XHTML documents that differ only in the charset named by their
      * http-equiv tag (one acceptable, one bogus) and checks that the tag value is
      * kept as the content-type hint while the reported content type is XHTML.
      */
     @Test
     public void testXHTMLWithMisleading() throws Exception {
         // Everything up to, and everything after, the charset name is shared.
         final String beforeCharset = "<?xml version=\"1.0\" ?>" +
                 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" " +
                 "\"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" +
                 "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" + "<head>\n" +
                 "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=";
         final String afterCharset = "\" />\n" +
                 "<title>title</title></head><body>body</body></html>";

         // first an acceptable XHTML header with http-equiv tags, then a misleading one
         String[] charsets = {"iso-8859-1", "iso-NUMBER_SEVEN"};
         for (String charset : charsets) {
             String xhtml = beforeCharset + charset + afterCharset;
             Metadata metadata = new Metadata();
             AUTO_DETECT_PARSER.parse(new ByteArrayInputStream(xhtml.getBytes(UTF_8)),
                     new BodyContentHandler(), metadata, new ParseContext());

             assertEquals("text/html; charset=" + charset,
                     metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT));
             assertEquals("application/xhtml+xml; charset=ISO-8859-1",
                     metadata.get(Metadata.CONTENT_TYPE));
         }
     }

     /**
      * Checks that a script's {@code src} is reported whether the script element
      * sits in the body or in the head.
      */
     @Test
     public void testScriptSrc() throws Exception {
         String url = "http://domain.com/logic.js";
         String scriptTag = "<script src=\"" + url + "\"></script>";

         assertScriptLink("<html><body>" + scriptTag + "</body></html>", url);
         assertScriptLink("<html><head>" + scriptTag + "</head></html>", url);
     }

     /**
      * Parses {@code html} with the identity mapper and asserts that exactly one
      * {@code <script>} element with a {@code src} attribute was seen, equal to {@code url}.
      *
      * @param html the document to parse
      * @param url  the script source expected to be reported
      */
     private void assertScriptLink(String html, String url) throws Exception {
         final List<String> scriptSources = new ArrayList<>();
         ContentHandler collector = new DefaultHandler() {
             @Override
             public void startElement(String u, String l, String name, Attributes atts) {
                 if (!name.equals("script")) {
                     return;
                 }
                 String src = atts.getValue("", "src");
                 if (src != null) {
                     scriptSources.add(src);
                 }
             }
         };

         Metadata metadata = new Metadata();
         metadata.set(Metadata.CONTENT_TYPE, "text/html");

         // IdentityHtmlMapper is needed to extract <script> tags
         ParseContext identityContext = new ParseContext();
         identityContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);

         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
         new HtmlParser().parse(input, collector, metadata, identityContext);

         assertEquals(1, scriptSources.size());
         assertEquals(url, scriptSources.get(0));
     }

     /**
      * TIKA-1980: counts the elements reported for a document with a rich head
      * section and checks the totals for title, meta, link and script.
      */
     @Test
     public void testAllHeadElements() throws Exception {
         final Map<String, Integer> tagFrequencies = new HashMap<>();
         ContentHandler tagCounter = new DefaultHandler() {
             @Override
             public void startElement(String uri, String local, String name,
                                      Attributes attributes) throws SAXException {
                 Integer seenSoFar = tagFrequencies.get(name);
                 tagFrequencies.put(name, seenSoFar == null ? 1 : seenSoFar + 1);
             }
         };

         Metadata metadata = new Metadata();
         metadata.set(Metadata.CONTENT_TYPE, "text/html");

         // IdentityHtmlMapper is needed to extract <script> tags
         ParseContext identityContext = new ParseContext();
         identityContext.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);

         try (InputStream stream = getResourceAsStream("/test-documents/testHTML_head.html")) {
             new HtmlParser().parse(stream, tagCounter, metadata, identityContext);
         }

         assertEquals(1, (int) tagFrequencies.get("title"));
         assertEquals(9, (int) tagFrequencies.get("meta"));
         assertEquals(12, (int) tagFrequencies.get("link"));
         assertEquals(6, (int) tagFrequencies.get("script"));
     }

     /**
      * Feeds the auto-detect parser a UTF-8 document whose first charset declaration
      * (ISO-8859-1) is inside an HTML comment and whose real one says utf-8, then
      * checks that the non-ASCII body text comes out intact.
      */
     @Test
     public void testSkippingCommentsInEncodingDetection() throws Exception {
         // 10,000 spaces between the head and the body
         StringBuilder padding = new StringBuilder(10000);
         for (int i = 0; i < 10000; i++) {
             padding.append(' ');
         }

         String html = "<html><head>" +
                 "<!--<meta http-equiv=\"Content-Type\" " +
                 "content=\"text/html; charset=ISO-8859-1\"> -->\n" +
                 "   <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />" +
                 "</head>" + padding + "<body>" + "有什么需要我帮你的" + "</body></html>";

         InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
         XMLResult result = getXML(input, AUTO_DETECT_PARSER, new Metadata());
         assertContains("有什么需要我帮你的", result.xml);
     }

     /**
      * Expects the visible text, but not the script's "cool" string, in the XML
      * output for a document with a bad script. Disabled pending TIKA-1896.
      */
     @Test
     @Ignore("until we fix TIKA-1896")
     public void testBadScript() throws Exception {
         XMLResult parsed = getXML("testHTMLBadScript.html");
         assertContains("This is a test", parsed.xml);
         assertNotContained("cool", parsed.xml);
     }

     /**
      * Expects the visible text, but not the script's "cool" string, in the XML
      * output for a document with a well-formed script.
      */
     @Test
     public void testGoodScript() throws Exception {
         XMLResult parsed = getXML("testHTMLGoodScript.html");
         assertContains("This is a test", parsed.xml);
         assertNotContained("cool", parsed.xml);
     }

     /**
      * With script extraction switched on, the script must come back as a separate
      * MACRO entry and stay out of the container document's text.
      */
     @Test
     public void testExtractScript() throws Exception {
         HtmlParser scriptExtractingParser = new HtmlParser();
         scriptExtractingParser.setExtractScripts(true);

         //TIKA-2550 -- make absolutely sure that macros are still extracted
         //with the ToTextHandler
         List<Metadata> results = getRecursiveMetadata("testHTMLGoodScript.html",
                 scriptExtractingParser, BasicContentHandlerFactory.HANDLER_TYPE.TEXT);
         assertEquals(2, results.size());

         Metadata container = results.get(0);
         Metadata script = results.get(1);
         assertEquals("MACRO", script.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
         assertContains("cool", script.get(TikaCoreProperties.TIKA_CONTENT));
         assertNotContained("cool", container.get(TikaCoreProperties.TIKA_CONTENT));
     }

     /**
      * Same expectations as script extraction via the setter, but with the parser
      * built from the tika-config.xml test resource.
      */
     @Test
     public void testConfigExtractScript() throws Exception {
         try (InputStream configStream =
                      getResourceAsStream("/org/apache/tika/parser/html/tika-config.xml")) {
             assertNotNull(configStream);
             Parser configuredParser = new AutoDetectParser(new TikaConfig(configStream));

             List<Metadata> results =
                     getRecursiveMetadata("testHTMLGoodScript.html", configuredParser);
             assertEquals(2, results.size());

             Metadata container = results.get(0);
             Metadata script = results.get(1);
             assertEquals("MACRO", script.get(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE));
             assertContains("cool", script.get(TikaCoreProperties.TIKA_CONTENT));
             assertNotContained("cool", container.get(TikaCoreProperties.TIKA_CONTENT));
         }
     }


     /**
      * Runs the multi-threaded consistency check once for every
      * {@link EncodingDetector} the service loader provides.
      */
     @Test
     public void testMultiThreadingEncodingDetection() throws Exception {
         ServiceLoader serviceLoader = new ServiceLoader(AutoDetectReader.class.getClassLoader());
         List<EncodingDetector> available =
                 new ArrayList<>(serviceLoader.loadServiceProviders(EncodingDetector.class));

         for (int i = 0; i < available.size(); i++) {
             testDetector(available.get(i));
         }
     }

     /**
      * Runs the given detector concurrently over every {@code .txt} and {@code .html}
      * file directly under {@code /test-documents} and checks that each concurrent
      * detection agrees with the encoding detected single-threaded beforehand.
      *
      * @param detector the encoding detector to exercise
      * @throws Exception if reading a test document fails, or if a worker task fails
      *                   (rethrown by {@code Future.get()} as an ExecutionException)
      */
     private void testDetector(EncodingDetector detector) throws Exception {
         Path testDocs = Paths.get(getResourceAsUri("/test-documents"));
         List<Path> tmp = new ArrayList<>();
         Map<Path, String> encodings = new ConcurrentHashMap<>();
         File[] testDocArray = testDocs.toFile().listFiles();
         assertNotNull("no test docs??", testDocArray);

         // Record the reference encoding of each candidate file on this thread first.
         for (File file : testDocArray) {
             if (file.getName().endsWith(".txt") || file.getName().endsWith(".html")) {
                 String encoding = getEncoding(detector, file.toPath());
                 tmp.add(file.toPath());
                 encodings.put(file.toPath(), encoding);
             }
         }

         // ArrayBlockingQueue rejects a capacity of 0, so keep it at least 1.
         ArrayBlockingQueue<Path> paths = new ArrayBlockingQueue<>(Math.max(1, tmp.size()));
         paths.addAll(tmp);
         int numThreads = paths.size() + 1;
         ExecutorService ex = Executors.newFixedThreadPool(numThreads);
         try {
             CompletionService<String> completionService = new ExecutorCompletionService<>(ex);
             for (int i = 0; i < numThreads; i++) {
                 completionService.submit(new EncodingDetectorRunner(paths, encodings, detector));
             }

             for (int completed = 0; completed < numThreads; completed++) {
                 // take() only hands back finished tasks; get() will trigger an
                 // ExecutionException if an IOException or assertion failure
                 // occurred during call().
                 Future<String> future = completionService.take();
                 assertEquals(EncodingDetectorRunner.DONE, future.get());
             }
         } finally {
             // Always release the pool's threads, even when a worker failed.
             ex.shutdownNow();
         }
     }

     /**
      * Detects the charset of the file at {@code p} with the given detector.
      *
      * @param detector the detector to run
      * @param p        the file to inspect
      * @return the detected charset's string form, or the literal {@code "NULL"} when
      *         the detector returns no charset
      * @throws IOException if the file cannot be opened or read
      */
     public String getEncoding(EncodingDetector detector, Path p) throws IOException {
         try (InputStream stream = TikaInputStream.get(p)) {
             Charset detected = detector.detect(stream, new Metadata());
             return detected == null ? "NULL" : detected.toString();
         }
     }

     /**
      * Both charset sample documents must yield the same sample sentence in their
      * XML output.
      */
     @Test
     public void testCharsetsNotSupportedByIANA() throws Exception {
         String[] resources = {"testHTML_charset_utf8.html", "testHTML_charset_utf16le.html"};
         for (String resource : resources) {
             assertContains("This is a sample text", getXML(resource).xml);
         }
     }

     /**
      * TIKA-2759: a data: URI inside a script is skipped by default, but is
      * extracted as a second entry when the configured parser extracts scripts.
      */
     @Test
     public void testSkippingDataURIInScriptNode() throws Exception {
         final String resource = "testHTML_embedded_data_uri_js.html";

         //default behavior
         List<Metadata> defaultResults = getRecursiveMetadata(resource);
         assertEquals(1, defaultResults.size());
         assertNotContained("alert( 'Hello, world!' );",
                 defaultResults.get(0).get(TikaCoreProperties.TIKA_CONTENT));

         //make sure to include it if a user wants scripts to be extracted
         try (InputStream configStream =
                      getResourceAsStream("/org/apache/tika/parser/html/tika-config.xml")) {
             assertNotNull(configStream);
             Parser configuredParser = new AutoDetectParser(new TikaConfig(configStream));

             List<Metadata> configuredResults = getRecursiveMetadata(resource, configuredParser);
             assertEquals(2, configuredResults.size());
             assertContains("alert( 'Hello, world!' );",
                     configuredResults.get(1).get(TikaCoreProperties.TIKA_CONTENT));
         }
     }

     /**
      * Worker task that pulls paths off a shared queue, re-detects each file's
      * encoding with the supplied detector and asserts that it equals the value
      * recorded for that path in {@code encodings}. Returns {@link #DONE} once the
      * queue is empty or it has polled as many times as there are recorded encodings.
      */
     private class EncodingDetectorRunner implements Callable<String> {

         static final String DONE = "done";

         private final ArrayBlockingQueue<Path> paths;
         private final Map<Path, String> encodings;
         private final EncodingDetector detector;

         private EncodingDetectorRunner(ArrayBlockingQueue<Path> paths, Map<Path, String> encodings,
                                        EncodingDetector detector) {
             this.detector = detector;
             this.encodings = encodings;
             this.paths = paths;
         }

         @Override
         public String call() throws IOException {
             for (int polled = 0; polled < encodings.size(); polled++) {
                 Path next = paths.poll();
                 if (next == null) {
                     // nothing left in the queue
                     break;
                 }
                 String actual = getEncoding(detector, next);
                 String expected = encodings.get(next);
                 assertEquals("detector class=" + detector.getClass() + " : file=" + next.toString(),
                         expected, actual);
             }
             return DONE;
         }
     }
 }