core/src/test/java/org/apache/any23/Any23Test.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23;

 import org.apache.any23.extractor.ExtractorGroup;
 import org.apache.any23.extractor.rdf.NTriplesExtractorFactory;
 import org.apache.http.conn.ConnectTimeoutException;
 import org.junit.Assert;
 import org.apache.any23.configuration.Configuration;
 import org.apache.any23.configuration.DefaultConfiguration;
 import org.apache.any23.configuration.ModifiableConfiguration;
 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.extractor.ExtractionParameters;
 import org.apache.any23.extractor.Extractor;
 import org.apache.any23.extractor.microdata.MicrodataExtractor;
 import org.apache.any23.filter.IgnoreAccidentalRDFa;
 import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
 import org.apache.any23.http.DefaultHTTPClient;
 import org.apache.any23.http.DefaultHTTPClientConfiguration;
 import org.apache.any23.http.HTTPClient;
 import org.apache.any23.http.HTTPClientConfiguration;
 import org.apache.any23.source.DocumentSource;
 import org.apache.any23.source.HTTPDocumentSource;
 import org.apache.any23.source.StringDocumentSource;
 import org.apache.any23.util.FileUtils;
 import org.apache.any23.util.StreamUtils;
 import org.apache.any23.util.StringUtils;
 import org.apache.any23.vocab.DCTerms;
 import org.apache.any23.writer.CompositeTripleHandler;
 import org.apache.any23.writer.CountingTripleHandler;
 import org.apache.any23.writer.NTriplesWriter;
 import org.apache.any23.writer.RDFXMLWriter;
 import org.apache.any23.writer.ReportingTripleHandler;
 import org.apache.any23.writer.RepositoryWriter;
 import org.apache.any23.writer.TripleHandler;
 import org.apache.any23.writer.TripleHandlerException;
 import org.apache.commons.io.IOUtils;
 import org.junit.AssumptionViolatedException;
 import org.junit.Test;
 import org.eclipse.rdf4j.model.Statement;
 import org.eclipse.rdf4j.repository.Repository;
 import org.eclipse.rdf4j.repository.RepositoryConnection;
 import org.eclipse.rdf4j.repository.RepositoryResult;
 import org.eclipse.rdf4j.repository.sail.SailRepository;
 import org.eclipse.rdf4j.rio.RDFParseException;
 import org.eclipse.rdf4j.sail.memory.MemoryStore;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.net.URISyntaxException;
 import java.nio.charset.StandardCharsets;
 import java.util.Collections;
 import java.util.List;

 import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;

 /**
  * Test case for {@link Any23} facade.
  *
  * @author Davide Palmisano ( dpalmisano@gmail.com )
  * @author Michele Mostarda ( michele.mostarda@gmail.com )
  */
 @SuppressWarnings("unchecked")
 public class Any23Test extends Any23OnlineTestBase {

     // DCTerms vocabulary instance; its "title" property is the predicate
     // queried by assertEncodingDetection().
     private static final DCTerms vDCTERMS = DCTerms.getInstance();

     // Document IRI assigned to the in-memory sources built by assertDetection().
     private static final String PAGE_URL = "http://bob.com";

     // Logger used by the tests to dump extracted triples at debug level.
     private static final Logger logger = LoggerFactory
             .getLogger(Any23Test.class);

     /**
      * Runs {@code assertDetection} on a one-statement document while requesting
      * only the <code>rdf-turtle</code> extractor.
      *
      * @throws Exception if detection or extraction fails
      */
     @Test
     public void testTTLDetection() throws Exception {
         final String singleStatement = "<a> <b> <c> .";
         assertDetection(singleStatement, "rdf-turtle");
     }

     /**
      * Runs {@code assertDetection} on a statement whose object is a
      * parenthesised collection, requesting only the <code>rdf-turtle</code>
      * extractor.
      *
      * @throws Exception if detection or extraction fails
      */
     @Test
     public void testN3Detection1() throws Exception {
         final String collectionStatement = "<Bob><brothers>(<Jim><Mark>).";
         assertDetection(collectionStatement, "rdf-turtle");
     }

     /**
      * Runs {@code assertDetection} on a single absolute-IRI triple, requesting
      * only the <code>rdf-nt</code> extractor.
      *
      * @throws Exception if detection or extraction fails
      */
     @Test
     public void testN3Detection2() throws Exception {
         final String absoluteIriTriple =
                 "<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .";
         assertDetection(absoluteIriTriple, "rdf-nt");
     }

     /**
      * Runs {@code assertDetection} on a small HTML page without naming any
      * extractor, so the helper builds its {@link Any23} instance with a
      * <code>null</code> extractor list.
      *
      * @throws Exception if detection or extraction fails
      */
     @Test
     public void testHTMLBruteForceDetection() throws Exception {
         final String htmlPage = "<html><body><div class=\"vcard fn\">Joe</div></body></html>";
         assertDetection(htmlPage);
     }

     /**
      * Verifies the extraction performed by <i>Any23</i> when the charset
      * encoding of the input is passed explicitly (<code>UTF-8</code>).
      *
      * @throws Exception if there is an error reading the input
      */
     @Test
     public void testExplicitEncoding() throws Exception {
         final String declaredEncoding = "UTF-8";
         final String resourcePath = "/html/encoding-test.html";
         final String expectedTitlePart = "Knud M\u00F6ller";
         assertEncodingDetection(declaredEncoding, resourcePath, expectedTitlePart);
     }

     /**
      * Verifies the extraction performed by <i>Any23</i> when no charset
      * encoding is supplied, so the encoding has to be guessed (according to
      * the original note, by the
      * {@link org.apache.any23.encoding.TikaEncodingDetector} class).
      *
      * @throws Exception if there is an error reading the input
      */
     @Test
     public void testImplicitEncoding() throws Exception {
         // A null encoding requests auto detection.
         final String autoDetectedEncoding = null;
         final String resourcePath = "/html/encoding-test.html";
         final String expectedTitlePart = "Knud M\u00F6ller";
         assertEncodingDetection(autoDetectedEncoding, resourcePath, expectedTitlePart);
     }

     /**
      * Feeds a small RDF/XML document to {@code assertDetectionAndExtraction}
      * and expects at least one matching extractor.
      *
      * @throws Exception if detection or extraction fails
      */
     @Test
     public void testRDFXMLDetectionAndExtraction() throws Exception {
         final StringBuilder document = new StringBuilder();
         document.append("<?xml version='1.0'?> ");
         document.append("<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' ");
         document.append("xmlns:dc='http://purl.org/dc/elements/1.1/'>");
         document.append("<rdf:Description rdf:about='http://www.example.com'>");
         document.append("<dc:title>x</dc:title>");
         document.append("</rdf:Description>");
         document.append("</rdf:RDF>");
         assertDetectionAndExtraction(document.toString());
     }

     /**
      * Feeds a single triple with a literal object to
      * {@code assertDetectionAndExtraction} and expects at least one matching
      * extractor.
      *
      * @throws Exception if detection or extraction fails
      */
     @Test
     public void testNTriplesDetectionAndExtraction() throws Exception {
         assertDetectionAndExtraction(
                 "<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"n3 . appo\" .");
     }

     /**
      * Feeds a multi-line document using <code>@prefix</code> declarations and
      * a blank-node property list to {@code assertDetectionAndExtraction} and
      * expects at least one matching extractor.
      *
      * @throws Exception if detection or extraction fails
      */
     @Test
     public void testNturtleDetectionAndExtraction() throws Exception {
         final StringBuilder document = new StringBuilder();
         document.append("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .").append('\n');
         document.append("@prefix dc: <http://purl.org/dc/elements/1.1/> .").append('\n');
         document.append("@prefix ex: <http://example.org/stuff/1.0/> .").append('\n');
         document.append('\n');
         document.append("<http://www.w3.org/TR/rdf-syntax-grammar>").append('\n');
         document.append("  dc:title \"RDF/XML Syntax Specification (Revised)\" ;").append('\n');
         document.append("  ex:editor [").append('\n');
         document.append("    ex:fullname \"Dave Beckett\";").append('\n');
         document.append("    ex:homePage <http://purl.org/net/dajobe/>").append('\n');
         // The last line carries no trailing newline.
         document.append("  ] .");
         assertDetectionAndExtraction(document.toString());
     }

     /**
      * Exercises the first code snippet used in the <i>Developer Manual</i>:
      * an in-memory document is extracted and written out through an
      * {@link NTriplesWriter}. The numbered comments follow the steps of that
      * snippet.
      *
      * @throws Exception if there is an error reading the input
      */
     @Test
     public void testDemoCodeSnippet1() throws Exception {
         // Step 1: create the facade.
         final Any23 any23 = new Any23();

         // Step 2: the content to process.
         final StringBuilder body = new StringBuilder();
         body.append("@prefix foo: <http://example.org/ns#> .   ");
         body.append("@prefix : <http://other.example.org/ns#> .");
         body.append("foo:bar foo: : .                          ");
         body.append(":bar : foo:bar .                           ");
         final String content = body.toString();

         // Step 3: wrap the content. The second argument of
         // StringDocumentSource() must be a valid IRI.
         final DocumentSource document = new StringDocumentSource(content,
                 "http://host.com/service");

         // Steps 4 and 5: the buffer and the handler writing into it.
         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
         final TripleHandler writer = new NTriplesWriter(buffer);

         // Steps 6 and 7: extract, always closing the handler afterwards.
         try {
             any23.extract(document, writer);
         } finally {
             writer.close();
         }

         // Step 8: read back what was written.
         final String nt = buffer.toString("UTF-8");

         /*
          * <http://example.org/ns#bar> <http://example.org/ns#>
          * <http://other.example.org/ns#> . <http://other.example.org/ns#bar>
          * <http://other.example.org/ns#> <http://example.org/ns#bar> .
          */
         logger.debug("nt: " + nt);
         Assert.assertTrue(!nt.isEmpty());
     }

     /**
      * Exercises the second code snippet used in the <i>Developer Manual</i>:
      * a remote document is fetched through the facade's {@link HTTPClient}
      * and written out through an {@link NTriplesWriter}. Only runs when
      * {@code assumeOnlineAllowed()} lets it proceed.
      *
      * @throws Exception if there is an error reading the input
      */
     @Test
     public void testDemoCodeSnippet2() throws Exception {
         assumeOnlineAllowed();

         // Steps 1-3: create the facade, set the user agent, get its HTTP client.
         final Any23 any23 = new Any23();
         any23.setHTTPUserAgent("apache-any23-test-user-agent");
         final HTTPClient client = any23.getHTTPClient();

         // Step 4: the remote document.
         final DocumentSource document = new HTTPDocumentSource(client,
                 "http://dbpedia.org/resource/Trento");

         // Steps 5 and 6: the buffer and the handler writing into it.
         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
         final TripleHandler writer = new NTriplesWriter(buffer);

         // Steps 7 and 8: extract, always closing the handler afterwards.
         try {
             any23.extract(document, writer);
         } finally {
             writer.close();
         }

         // Step 9: read back what was written.
         final String n3 = buffer.toString("UTF-8");

         /*
          * <http://dbpedia.org/resource/Trent>
          * <http://dbpedia.org/ontology/wikiPageDisambiguates>
          * <http://dbpedia.org/resource/Trento> .
          * <http://dbpedia.org/resource/Andrea_Pozzo>
          * <http://dbpedia.org/ontology/birthPlace>
          * <http://dbpedia.org/resource/Trento> .
          * <http://dbpedia.org/resource/Union_for_Trentino>
          * <http://dbpedia.org/ontology/headquarter>
          * <http://dbpedia.org/resource/Trento> . [...]
          */
         logger.debug("n3: " + n3);
         Assert.assertTrue(!n3.isEmpty());

         final String expectedTriple =
                 "<http://dbpedia.org/resource/Trento> <http://dbpedia.org/property/mayor> \"Alessandro Andreatta\" .";
         Assert.assertTrue(n3.contains(expectedTriple));
     }

     /**
      * Checks the extraction behavior when the library is used
      * programmatically. This test is related to the issue #45, to verify the
      * different behaviors between Maven and Ant. The behavior was related to a
      * 2nd-level dependency introduced by Maven.
      *
      * <p>A local HTML resource is extracted through a handler chain
      * ({@link ReportingTripleHandler} over {@link IgnoreAccidentalRDFa} over
      * {@link NTriplesWriter}) and the number of newlines in the serialized
      * output is compared with the expected triple count.</p>
      *
      * @throws org.apache.any23.extractor.ExtractionException if there is an error running extraction logic
      * @throws IOException if there is an error reading the input
      * @throws URISyntaxException if there is an error defining input URI's
      */
     @Test
     public void testProgrammaticExtraction() throws ExtractionException,
             IOException, URISyntaxException {
         Any23 any23 = new Any23();
         any23.setHTTPUserAgent("Any23-Servlet");
         any23.setHTTPClient(new DefaultHTTPClient() {
             @Override
             protected int getConnectionTimeout() {
                 return 5000;
             }

             @Override
             protected int getSoTimeout() {
                 return 2000;
             }
         });
         ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
         TripleHandler handler = new NTriplesWriter(byteArrayOutputStream);
         TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
         ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);

         DocumentSource source = getDocumentSourceFromResource(
                 "/html/rdfa/ansa_2010-02-26_12645863.html",
                 "http://host.com/service");

         Assert.assertTrue(any23.extract(source, reporting)
                 .hasMatchingExtractors());
         try {
             handler.close();
         } catch (TripleHandlerException e) {
             // Fail the test but keep the original exception as the cause.
             throw new AssertionError(e.getMessage(), e);
         }

         // Decode with an explicit charset instead of the platform default.
         final String bufferContent = byteArrayOutputStream.toString("UTF-8");
         logger.debug(bufferContent);
         // assertEquals compares values; assertSame would compare the identity
         // of two autoboxed Integers and only pass thanks to the Integer cache.
         Assert.assertEquals("Unexpected number of triples.", 18,
                 StringUtils.countNL(bufferContent));

     }

     /**
      * This test checks if a URL that is supposed to be GZIPPED is correctly
      * opened and parsed with the {@link Any23} facade. Only runs when
      * {@code assumeOnlineAllowed()} lets it proceed; a connection timeout is
      * reported as a violated assumption (skipped test) rather than a failure.
      *
      * @throws org.apache.any23.extractor.ExtractionException if there is an error running extraction logic
      * @throws IOException if there is an error reading the input
      * @throws URISyntaxException if there is an error defining input URI's
      */
     @Test
     public void testGZippedContent() throws IOException, URISyntaxException,
             ExtractionException {
         assumeOnlineAllowed();
         final Any23 runner = new Any23();
         runner.setHTTPUserAgent("apache-any23-test-user-agent");
         DocumentSource source = new HTTPDocumentSource(runner.getHTTPClient(),
                 "https://dev.w3.org/html5/rdfa/");
         ByteArrayOutputStream out = new ByteArrayOutputStream();
         TripleHandler handler = new NTriplesWriter(out);
         try {
             runner.extract(source, handler);
         } catch (ConnectTimeoutException e) {
             // This page is down as of 2019.09.14
             logger.error("Connection to " + source.getDocumentIRI() + " timed out; skipping test", e);
             // Keep the timeout as the cause so the skip reason stays traceable.
             throw new AssumptionViolatedException(e.getMessage(), e);
         }
         // Close the handler before reading the buffer, as the other
         // NTriplesWriter based tests do, so the output is complete.
         try {
             handler.close();
         } catch (TripleHandlerException e) {
             throw new AssertionError(e.getMessage(), e);
         }
         String n3 = out.toString("UTF-8");
         logger.debug("N3 " + n3);
         Assert.assertTrue(n3.length() > 0);
     }

     /**
      * Extracts a local HTML resource with {@link ValidationMode#NONE} and
      * checks the number of triples counted by a {@link CountingTripleHandler}
      * that sits next to an {@link NTriplesWriter} in a
      * {@link CompositeTripleHandler}.
      *
      * @throws IOException if there is an error reading the input
      * @throws ExtractionException if there is an error running extraction logic
      * @throws TripleHandlerException if the composite handler cannot be closed
      */
     @Test
     public void testExtractionParameters() throws IOException,
             ExtractionException, TripleHandlerException {
         // not quite sure if following triples should be extracted
         // ?doc <http://www.w3.org/1999/xhtml/vocab#icon> <https://any23.googlecode.com/favicon.ico> .
         // ?doc <http://www.w3.org/1999/xhtml/vocab#stylesheet> <https://any23.googlecode.com/design/style.css>  .

         final int expectedTripleCount = 12;
         final Any23 any23 = new Any23();
         final DocumentSource document = getDocumentSourceFromResource(
                 "/org/apache/any23/validator/missing-og-namespace.html",
                 "http://www.test.com");

         final ByteArrayOutputStream serialized = new ByteArrayOutputStream();

         // One child counts the triples, the other serializes them for the log.
         final CountingTripleHandler counter = new CountingTripleHandler();
         final NTriplesWriter serializer = new NTriplesWriter(serialized);
         final CompositeTripleHandler fanOut = new CompositeTripleHandler();
         fanOut.addChild(counter);
         fanOut.addChild(serializer);

         try {
             any23.extract(
                     new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.NONE),
                     document,
                     fanOut);
         } finally {
             fanOut.close();
         }

         logger.debug(serialized.toString());
         Assert.assertEquals("Unexpected number of triples.", expectedTripleCount,
                 counter.getCount());

         // Disabled second pass (validate-and-fix), kept for reference:
 //        baos.reset();
 //        CountingTripleHandler cth2 = new CountingTripleHandler();
 //        NTriplesWriter ctw2 = new NTriplesWriter(baos);
 //        CompositeTripleHandler compositeTH2 = new CompositeTripleHandler();
 //        compositeTH2.addChild(cth2);
 //        compositeTH2.addChild(ctw2);
 //        runner.extract(
 //                new ExtractionParameters(DefaultConfiguration.singleton(),
 //                        ValidationMode.ValidateAndFix), source, compositeTH2);
 //        logger.debug(baos.toString());
 //        Assert.assertEquals("Unexpected number of triples.",
 //                EXPECTED_TRIPLES + 5, cth2.getCount());
     }

     /**
      * Extracts the same nested-microformats resource twice and compares the
      * triple counts: first with {@link ValidationMode#NONE} and the third
      * {@link ExtractionParameters} constructor argument set to
      * <code>true</code>, then with {@link ValidationMode#VALIDATE_AND_FIX} and
      * that argument set to <code>false</code>. The first run is expected to
      * yield three triples more than the second.
      * NOTE(review): the boolean is presumably the nesting flag the test name
      * refers to; confirm against {@link ExtractionParameters}.
      *
      * @throws IOException if there is an error reading the input
      * @throws ExtractionException if there is an error running extraction logic
      * @throws TripleHandlerException if a composite handler cannot be closed
      */
     @Test
     public void testExtractionParametersWithNestingDisabled()
             throws IOException, ExtractionException, TripleHandlerException {
         final int EXPECTED_TRIPLES = 20;
         Any23 runner = new Any23();
         DocumentSource source = getDocumentSourceFromResource(
                 "/microformats/nested-microformats-a1.html",
                 "http://www.test.com");

         ByteArrayOutputStream baos = new ByteArrayOutputStream();

         CountingTripleHandler cth1 = new CountingTripleHandler();
         RDFXMLWriter ctw1 = new RDFXMLWriter(baos);
         CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
         compositeTH1.addChild(cth1);
         compositeTH1.addChild(ctw1);
         // Close in finally, as testExtractionParameters does, so the handler
         // is released even when the extraction throws.
         try {
             runner.extract(
                     new ExtractionParameters(DefaultConfiguration.singleton(),
                             ValidationMode.NONE, true), source, compositeTH1);
         } finally {
             compositeTH1.close();
         }
         // Decode with an explicit charset instead of the platform default.
         logger.debug("Out1: " + baos.toString("UTF-8"));
         Assert.assertEquals("Unexpected number of triples.",
                 EXPECTED_TRIPLES + 3, cth1.getCount());

         baos.reset();
         CountingTripleHandler cth2 = new CountingTripleHandler();
         NTriplesWriter ctw2 = new NTriplesWriter(baos);
         CompositeTripleHandler compositeTH2 = new CompositeTripleHandler();
         compositeTH2.addChild(cth2);
         compositeTH2.addChild(ctw2);
         try {
             runner.extract(
                     new ExtractionParameters(DefaultConfiguration.singleton(),
                             ValidationMode.VALIDATE_AND_FIX, false), source,
                     compositeTH2);
         } finally {
             compositeTH2.close();
         }
         logger.debug("Out2: " + baos.toString("UTF-8"));
         Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES,
                 cth2.getCount());
     }

     /**
      * Extracts a local Turtle resource and, if an {@link ExtractionException}
      * is raised, checks that its cause is an {@link RDFParseException}.
      * NOTE(review): the test also passes when no exception is thrown at all;
      * confirm whether a failure is actually expected for this resource.
      *
      * @throws IOException if there is an error reading the input
      */
     @Test
     public void testExceptionPropagation() throws IOException {
         final Any23 facade = new Any23();
         final DocumentSource turtleDocument = getDocumentSourceFromResource(
                 "/application/turtle/geolinkeddata.ttl", "http://www.test.com");
         final CountingTripleHandler counter = new CountingTripleHandler();
         try {
             facade.extract(turtleDocument, counter);
         } catch (ExtractionException extractionFailure) {
             final Throwable rootCause = extractionFailure.getCause();
             Assert.assertTrue(rootCause instanceof RDFParseException);
         }

     }

     /**
      * Test correct management of general <i>XML</i> content: a document
      * declared as <code>application/xml</code> must match no extractor and
      * produce no triples.
      *
      * @throws org.apache.any23.extractor.ExtractionException if there is an error running extraction logic
      * @throws IOException if there is an error reading the input
      */
     @Test
     public void testXMLMimeTypeManagement() throws IOException,
             ExtractionException {
         final String documentIRI = "http://www.test.com/resource.xml";
         final String contentType = "application/xml";
         // The resource name is relative, so it is resolved against this class.
         final String xml = StreamUtils.asString(
                 this.getClass().getResourceAsStream("any23-xml-mimetype.xml"));
         final DocumentSource document = new StringDocumentSource(xml, documentIRI, contentType);

         final Any23 facade = new Any23();
         final CountingTripleHandler counter = new CountingTripleHandler(false);
         final ReportingTripleHandler reportingHandler = new ReportingTripleHandler(counter);

         final ExtractionReport outcome = facade.extract(document, reportingHandler);
         Assert.assertFalse(outcome.hasMatchingExtractors());
         Assert.assertEquals(0, counter.getCount());
     }

     /**
      * Test correct management of general <i>XML</i> content from <i>URL</i>
      * source: the remote XML document must match no extractor and produce no
      * triples. Only runs when {@code assumeOnlineAllowed()} lets it proceed.
      *
      * @throws org.apache.any23.extractor.ExtractionException if there is an error running extraction logic
      * @throws IOException if there is an error reading the input
      */
     @Test
     public void testXMLMimeTypeManagementViaURL() throws IOException,
             ExtractionException {
         assumeOnlineAllowed();

         final Any23 facade = new Any23();
         facade.setHTTPUserAgent("apache-any23-test-user-agent");

         // Initialize the facade's HTTP client with an "application/xml" configuration.
         final HTTPClient httpClient = facade.getHTTPClient();
         final HTTPClientConfiguration xmlConfiguration =
                 new DefaultHTTPClientConfiguration("application/xml");
         httpClient.init(xmlConfiguration);

         final CountingTripleHandler counter = new CountingTripleHandler(false);
         final ReportingTripleHandler reportingHandler = new ReportingTripleHandler(counter);

         final String remoteDocument =
                 "http://www.legislation.gov.uk/ukpga/2015/17/section/4/data.xml";
         final ExtractionReport outcome = facade.extract(remoteDocument, reportingHandler);
         Assert.assertFalse(outcome.hasMatchingExtractors());
         Assert.assertEquals(0, counter.getCount());
     }

     /**
      * Extracts <code>https://www.w3.org/</code> and expects at least one
      * matching extractor. Only runs when {@code assumeOnlineAllowed()} lets it
      * proceed.
      *
      * @throws IOException if there is an error reading the input
      * @throws ExtractionException if there is an error running extraction logic
      */
     @Test
     public void testBlankNodesViaURL() throws IOException, ExtractionException {
         assumeOnlineAllowed();

         final Any23 facade = new Any23();
         facade.setHTTPUserAgent("apache-any23-test-user-agent");

         final CountingTripleHandler counter = new CountingTripleHandler(false);
         final ReportingTripleHandler reportingHandler = new ReportingTripleHandler(counter);

         final String remoteDocument = "https://www.w3.org/";
         final ExtractionReport outcome = facade.extract(remoteDocument, reportingHandler);
         Assert.assertTrue(outcome.hasMatchingExtractors());
     }

     /**
      * Loads a basic microdata page from the classpath and checks that the
      * {@link MicrodataExtractor} is among the matching extractors.
      *
      * @throws Exception if the resource cannot be read or the extraction fails
      */
     @Test
     public void testMicrodataSupport() throws Exception {
         final String microdataPage = IOUtils.toString(
                 getClass().getResourceAsStream("/microdata/microdata-basic.html"),
                 StandardCharsets.UTF_8);
         assertExtractorActivation(microdataPage, MicrodataExtractor.class);
     }

     /**
      * Regression test for issue 186 (first sample): the extraction of the
      * XHTML resource must complete without raising an error. The output is
      * only logged, not asserted.
      *
      * @throws IOException if there is an error reading the input
      * @throws ExtractionException if there is an error running extraction logic
      */
     @Test
     public void testAbstractMethodErrorIssue186_1() throws IOException,
             ExtractionException {
         final Any23 facade = new Any23();
         final String xhtml = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-1.xhtml");
         final DocumentSource document = new StringDocumentSource(xhtml, "http://base.com");

         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
         final TripleHandler writer = new NTriplesWriter(buffer);
         facade.extract(document, writer);

         final String serialized = buffer.toString("UTF-8");
         logger.debug(serialized);
     }

     /**
      * Regression test for issue 186 (second sample): the extraction of the
      * XHTML resource must complete without raising an error. The output is
      * only logged, not asserted.
      *
      * @throws IOException if there is an error reading the input
      * @throws ExtractionException if there is an error running extraction logic
      */
     @Test
     public void testAbstractMethodErrorIssue186_2() throws IOException,
             ExtractionException {
         final Any23 facade = new Any23();
         final String xhtml = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-2.xhtml");
         final DocumentSource document = new StringDocumentSource(xhtml,
                 "http://richard.cyganiak.de/");

         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
         final TripleHandler writer = new NTriplesWriter(buffer);
         facade.extract(document, writer);

         final String serialized = buffer.toString("UTF-8");
         logger.debug(serialized);
     }

     /**
      * Regression test for issue 183: with the
      * <code>any23.extraction.metadata.timesize</code> property set to
      * <code>off</code> on a copied configuration, the serialized output must
      * not mention the sindice <code>date</code> and <code>size</code>
      * predicates.
      *
      * @throws Exception if the resource cannot be read or the extraction fails
      */
     @Test
     public void testModifiableConfiguration_issue183() throws Exception {
         final ModifiableConfiguration configuration = DefaultConfiguration.copy();
         configuration.setProperty("any23.extraction.metadata.timesize", "off");
         final Any23 facade = new Any23(configuration);

         final String turtle = FileUtils.readResourceContent("/rdf/rdf-issue183.ttl");
         final DocumentSource document = new StringDocumentSource(turtle, "http://base.com");

         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
         final TripleHandler writer = new NTriplesWriter(buffer);
         facade.extract(document, writer);
         writer.close();

         final String serialized = buffer.toString("UTF-8");
         logger.debug(serialized);

         final boolean mentionsDate = serialized.contains("http://vocab.sindice.net/date");
         Assert.assertFalse(
                 "Should not contain triple with http://vocab.sindice.net/date",
                 mentionsDate);
         final boolean mentionsSize = serialized.contains("http://vocab.sindice.net/size");
         Assert.assertFalse(
                 "Should not contain triple with http://vocab.sindice.net/size",
                 mentionsSize);
     }

     /**
      * Regression test for issue 415 (invalid input): with only the N-Triples
      * extractor registered, the text resource must be detected as
      * <code>text/plain</code>, report no extractor issues and match no
      * extractor.
      *
      * @throws Exception if the resource cannot be read or the extraction fails
      */
     @Test
     public void testIssue415InvalidNTriples() throws Exception {
         final NTriplesExtractorFactory ntFactory = new NTriplesExtractorFactory();
         final ExtractorGroup onlyNTriples = new ExtractorGroup(Collections.singleton(ntFactory));
         final Any23 facade = new Any23(onlyNTriples);

         final String text = IOUtils.resourceToString("/rdf/issue415.txt", StandardCharsets.UTF_8);
         final ExtractionReport outcome = facade.extract(
                 text,
                 "http://humanstxt.org/humans.txt",
                 new CompositeTripleHandler());

         Assert.assertEquals("text/plain", outcome.getDetectedMimeType());
         Assert.assertEquals(0, outcome.getExtractorIssues(ntFactory.getExtractorName()).size());
         Assert.assertEquals(0, outcome.getMatchingExtractors().size());
     }

     /**
      * Regression test for issue 415 (valid input): with only the N-Triples
      * extractor registered, the resource must be detected as
      * <code>application/n-triples</code>, report no extractor issues, match
      * exactly one extractor and yield exactly one triple.
      *
      * @throws Exception if the resource cannot be read or the extraction fails
      */
     @Test
     public void testIssue415ValidNTriples() throws Exception {
         final NTriplesExtractorFactory ntFactory = new NTriplesExtractorFactory();
         final ExtractorGroup onlyNTriples = new ExtractorGroup(Collections.singleton(ntFactory));
         final Any23 facade = new Any23(onlyNTriples);

         final CountingTripleHandler counter = new CountingTripleHandler();
         final String text = IOUtils.resourceToString("/rdf/issue415-valid.txt", StandardCharsets.UTF_8);
         final ExtractionReport outcome = facade.extract(
                 text,
                 "http://humanstxt.org/humans.txt",
                 counter);

         Assert.assertEquals("application/n-triples", outcome.getDetectedMimeType());
         Assert.assertEquals(0, outcome.getExtractorIssues(ntFactory.getExtractorName()).size());
         Assert.assertEquals(1, outcome.getMatchingExtractors().size());
         Assert.assertEquals(1, counter.getCount());
     }

     /**
      * Runs detection and extraction over the given input string, using
      * {@link ValidationMode#VALIDATE_AND_FIX} and an explicit
      * <code>UTF-8</code> encoding, and hands back the resulting
      * {@link ExtractionReport}. The triples go through a
      * {@link ReportingTripleHandler} wrapping {@link IgnoreAccidentalRDFa},
      * {@link IgnoreTitlesOfEmptyDocuments} and an {@link NTriplesWriter}.
      *
      * @param in
      *            input string.
      * @return a populated {@link org.apache.any23.ExtractionReport}
      * @throws Exception if there is an error detecting mime type and running extraction
      */
     private ExtractionReport detectAndExtract(String in) throws Exception {
         final Any23 facade = new Any23();
         final Configuration configuration = DefaultConfiguration.copy();

         // Build the handler chain from the innermost writer outwards.
         final ByteArrayOutputStream sink = new ByteArrayOutputStream();
         final NTriplesWriter ntWriter = new NTriplesWriter(sink);
         final IgnoreTitlesOfEmptyDocuments titleFilter = new IgnoreTitlesOfEmptyDocuments(ntWriter);
         final IgnoreAccidentalRDFa rdfaFilter = new IgnoreAccidentalRDFa(titleFilter);
         final ReportingTripleHandler outputHandler = new ReportingTripleHandler(rdfaFilter);

         final ExtractionParameters parameters =
                 new ExtractionParameters(configuration, ValidationMode.VALIDATE_AND_FIX, null, null);
         final StringDocumentSource document = new StringDocumentSource(in, "http://host.com/path");
         return facade.extract(parameters, document, outputHandler, "UTF-8");
     }

     /**
      * Asserts that at least one {@link Extractor} has been activated for the
      * given input data.
      *
      * @param in
      *            input data as string.
      * @throws Exception if there is an error detecting mime type and running extraction
      */
     private void assertDetectionAndExtraction(String in) throws Exception {
         final boolean anyExtractorMatched = detectAndExtract(in).hasMatchingExtractors();
         Assert.assertTrue(
                 "Detection and extraction failed, no matching extractors.",
                 anyExtractorMatched);
     }

     /**
      * Asserts that every one of the given {@link Extractor} classes shows up
      * among the matching extractors reported for the given input string.
      *
      * @param in
      *            input data as string.
      * @param expectedExtractors
      *            extractor classes that must have been activated.
      * @throws Exception if there is an error detecting mime type and running extraction
      */
     private void assertExtractorActivation(String in,
             @SuppressWarnings("rawtypes") Class<? extends Extractor>... expectedExtractors) throws Exception {
         final ExtractionReport report = detectAndExtract(in);
         for (int i = 0; i < expectedExtractors.length; i++) {
             @SuppressWarnings("rawtypes")
             final Class<? extends Extractor> wanted = expectedExtractors[i];
             final String failureMessage = String.format(
                     "Detection and extraction failed, expected extractor [%s] not found.",
                     wanted);
             final boolean activated = containsClass(report.getMatchingExtractors(), wanted);
             Assert.assertTrue(failureMessage, activated);
         }
     }

     /**
      * Asserts the correct encoding detection for a specified data. The
      * resource is extracted into an in-memory repository and every statement
      * having the DCTerms <code>title</code> predicate must contain the
      * expected content.
      *
      * @param encoding
      *            the expected specified encoding, if <code>null</code> will be
      *            auto detected.
      * @param input
      *            classpath location of the document to extract.
      * @param expectedContent
      *            text that every extracted title value must contain.
      * @throws Exception if the repository cannot be used or the extraction fails
      */
     private void assertEncodingDetection(String encoding, String input, String expectedContent)
     throws Exception {
         final DocumentSource fileDocumentSource = getDocumentSourceFromResource(input);
         final Any23 any23 = new Any23();
         final Repository store = new SailRepository(new MemoryStore());
         store.init();
         // Each resource gets its own finally block so that a failure while
         // releasing one of them cannot prevent the release of the others.
         try {
             final RepositoryConnection conn = store.getConnection();
             try {
                 final RepositoryWriter repositoryWriter = new RepositoryWriter(conn);
                 try {
                     Assert.assertTrue( any23.extract(fileDocumentSource, repositoryWriter, encoding).hasMatchingExtractors() );

                     final RepositoryResult<Statement> statements = conn.getStatements(null, vDCTERMS.title, null, false);
                     try {
                         while (statements.hasNext()) {
                             Statement statement = statements.next();
                             printStatement(statement);
                             Assert.assertTrue(statement.getObject().stringValue().contains(expectedContent));
                         }
                     } finally {
                         statements.close();
                     }
                 } finally {
                     // Close the writer while its connection is still open.
                     repositoryWriter.close();
                 }
             } finally {
                 conn.close();
             }
         } finally {
             // Release the in-memory store, which was previously left initialized.
             store.shutDown();
         }
     }

     /**
      * Extracts the given <i>content</i> with an {@link Any23} instance limited
      * to the named extractors and asserts that more than ten characters of
      * output were written. When no extractor name is given the instance is
      * built with a <code>null</code> name list and its MIME type detector is
      * left untouched.
      *
      * @param content
      *            document content to extract.
      * @param parsers
      *            names of the extractors to use, possibly none.
      * @throws Exception if the extraction or the serialization fails
      */
     private void assertDetection(String content, String... parsers)
             throws Exception {
         final boolean extractorsGiven = parsers.length != 0;
         final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
         final Any23 facade = new Any23(extractorsGiven ? parsers : null);
         if (extractorsGiven) {
             // Use all the provided extractors.
             facade.setMIMETypeDetector(null);
         }

         final NTriplesWriter writer = new NTriplesWriter(buffer);
         final StringDocumentSource document = new StringDocumentSource(content, PAGE_URL);
         facade.extract(document, writer);
         writer.close();

         final String serialized = buffer.toString("us-ascii");
         Assert.assertNotNull(serialized);
         Assert.assertTrue(serialized.length() > 10);
     }

     /**
      * Logs the subject, predicate and object of the given statement at debug
      * level, separated by tabs.
      *
      * @param statement
      *            the statement to log.
      */
     private void printStatement(Statement statement) {
         final String line = String.format("%s\t%s\t%s",
                 statement.getSubject(),
                 statement.getPredicate(),
                 statement.getObject());
         logger.debug(line);
     }

     /**
      * Tells whether the list holds an element whose runtime class is exactly
      * the given class (subclasses do not count).
      *
      * @param list
      *            elements to inspect.
      * @param clazz
      *            the class to look for.
      * @return <code>true</code> if such an element exists,
      *         <code>false</code> otherwise.
      */
     private boolean containsClass(List<?> list, Class<?> clazz) {
         boolean found = false;
         for (final Object element : list) {
             // Class does not override equals(), so identity is equivalent.
             if (element.getClass() == clazz) {
                 found = true;
                 break;
             }
         }
         return found;
     }

 }