blob: d851986d4c21da039d9658590e39d7d58f9f2ebf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23;
import org.apache.any23.extractor.ExtractorGroup;
import org.apache.any23.extractor.rdf.NTriplesExtractorFactory;
import org.apache.http.conn.ConnectTimeoutException;
import org.junit.Assert;
import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.configuration.ModifiableConfiguration;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.microdata.MicrodataExtractor;
import org.apache.any23.filter.IgnoreAccidentalRDFa;
import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
import org.apache.any23.http.DefaultHTTPClient;
import org.apache.any23.http.DefaultHTTPClientConfiguration;
import org.apache.any23.http.HTTPClient;
import org.apache.any23.http.HTTPClientConfiguration;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.util.FileUtils;
import org.apache.any23.util.StreamUtils;
import org.apache.any23.util.StringUtils;
import org.apache.any23.vocab.DCTerms;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.CountingTripleHandler;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.RDFXMLWriter;
import org.apache.any23.writer.ReportingTripleHandler;
import org.apache.any23.writer.RepositoryWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.junit.AssumptionViolatedException;
import org.junit.Test;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.repository.Repository;
import org.eclipse.rdf4j.repository.RepositoryConnection;
import org.eclipse.rdf4j.repository.RepositoryResult;
import org.eclipse.rdf4j.repository.sail.SailRepository;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.sail.memory.MemoryStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
/**
* Test case for {@link Any23} facade.
*
* @author Davide Palmisano ( dpalmisano@gmail.com )
* @author Michele Mostarda ( michele.mostarda@gmail.com )
*/
@SuppressWarnings("unchecked")
public class Any23Test extends Any23OnlineTestBase {
// DCTerms vocabulary instance; its title property is what assertEncodingDetection queries for.
private static final DCTerms vDCTERMS = DCTerms.getInstance();
// Document IRI given to the in-memory sources built by assertDetection.
private static final String PAGE_URL = "http://bob.com";
// Logger used by the tests to dump extraction output at debug level.
private static final Logger logger = LoggerFactory
.getLogger(Any23Test.class);
/**
 * Runs a minimal three-term statement through the explicitly requested
 * <code>rdf-turtle</code> extractor and expects some output.
 *
 * @throws Exception if the extraction fails
 */
@Test
public void testTTLDetection() throws Exception {
    final String turtle = "<a> <b> <c> .";
    final String extractorName = "rdf-turtle";
    assertDetection(turtle, extractorName);
}
/**
 * Runs a statement using the collection syntax through the explicitly
 * requested <code>rdf-turtle</code> extractor and expects some output.
 *
 * @throws Exception if the extraction fails
 */
@Test
public void testN3Detection1() throws Exception {
    final String withCollection = "<Bob><brothers>(<Jim><Mark>).";
    final String extractorName = "rdf-turtle";
    assertDetection(withCollection, extractorName);
}
/**
 * Runs a single absolute-IRI triple through the explicitly requested
 * <code>rdf-nt</code> extractor and expects some output.
 *
 * @throws Exception if the extraction fails
 */
@Test
public void testN3Detection2() throws Exception {
    final String triple =
            "<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .";
    final String extractorName = "rdf-nt";
    assertDetection(triple, extractorName);
}
/**
 * Feeds a tiny HTML page to {@link #assertDetection(String, String...)}
 * without naming any extractor and expects some output.
 *
 * @throws Exception if the extraction fails
 */
@Test
public void testHTMLBruteForceDetection() throws Exception {
    final String html = "<html><body><div class=\"vcard fn\">Joe</div></body></html>";
    assertDetection(html);
}
/**
 * Verifies the extraction performed by <i>Any23</i> when the charset
 * encoding of the input is specified explicitly.
 *
 * @throws Exception if there is an error reading the input
 */
@Test
public void testExplicitEncoding() throws Exception {
    final String encoding = "UTF-8";
    final String resource = "/html/encoding-test.html";
    final String expectedTitlePart = "Knud M\u00F6ller";
    assertEncodingDetection(encoding, resource, expectedTitlePart);
}
/**
 * Verifies the extraction performed by <i>Any23</i> when no charset
 * encoding is passed. The encoding is therefore guessed using the
 * {@link org.apache.any23.encoding.TikaEncodingDetector} class.
 *
 * @throws Exception if there is an error reading the input
 */
@Test
public void testImplicitEncoding() throws Exception {
    // A null encoding means: let the library auto detect it.
    final String encoding = null;
    final String resource = "/html/encoding-test.html";
    final String expectedTitlePart = "Knud M\u00F6ller";
    assertEncodingDetection(encoding, resource, expectedTitlePart);
}
/**
 * Checks that a small RDF/XML document activates at least one extractor.
 *
 * @throws Exception if detection or extraction fails
 */
@Test
public void testRDFXMLDetectionAndExtraction() throws Exception {
    final StringBuilder document = new StringBuilder();
    document.append("<?xml version='1.0'?> ");
    document.append("<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' ");
    document.append("xmlns:dc='http://purl.org/dc/elements/1.1/'>");
    document.append("<rdf:Description rdf:about='http://www.example.com'>");
    document.append("<dc:title>x</dc:title>");
    document.append("</rdf:Description>");
    document.append("</rdf:RDF>");
    assertDetectionAndExtraction(document.toString());
}
/**
 * Checks that a single triple with a literal object activates at least one
 * extractor.
 *
 * @throws Exception if detection or extraction fails
 */
@Test
public void testNTriplesDetectionAndExtraction() throws Exception {
    final String triple =
            "<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"n3 . appo\" .";
    assertDetectionAndExtraction(triple);
}
/**
 * Checks that a Turtle document with prefixes and a blank-node property
 * list activates at least one extractor.
 *
 * @throws Exception if detection or extraction fails
 */
@Test
public void testNturtleDetectionAndExtraction() throws Exception {
    final StringBuilder turtle = new StringBuilder();
    // Prefix declarations.
    turtle.append("@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n");
    turtle.append("@prefix dc: <http://purl.org/dc/elements/1.1/> .\n");
    turtle.append("@prefix ex: <http://example.org/stuff/1.0/> .\n");
    turtle.append("\n");
    // Subject with a literal title and a nested anonymous editor.
    turtle.append("<http://www.w3.org/TR/rdf-syntax-grammar>\n");
    turtle.append(" dc:title \"RDF/XML Syntax Specification (Revised)\" ;\n");
    turtle.append(" ex:editor [\n");
    turtle.append(" ex:fullname \"Dave Beckett\";\n");
    turtle.append(" ex:homePage <http://purl.org/net/dajobe/>\n");
    turtle.append(" ] .");
    assertDetectionAndExtraction(turtle.toString());
}
/**
 * Exercises the first code snippet shown in the <i>Developer Manual</i>:
 * an in-memory Turtle document is extracted and serialized as N-Triples.
 *
 * @throws Exception if there is an error reading the input
 */
@Test
public void testDemoCodeSnippet1() throws Exception {
    final Any23 any23 = new Any23();
    final String turtle = "@prefix foo: <http://example.org/ns#> . "
            + "@prefix : <http://other.example.org/ns#> ."
            + "foo:bar foo: : . "
            + ":bar : foo:bar . ";
    // The second argument of StringDocumentSource() must be a valid IRI.
    final DocumentSource input = new StringDocumentSource(turtle,
            "http://host.com/service");
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    final TripleHandler writer = new NTriplesWriter(buffer);
    try {
        any23.extract(input, writer);
    } finally {
        // Always close the writer, even when the extraction fails.
        writer.close();
    }
    final String nt = buffer.toString("UTF-8");
    // Output documented in the manual:
    //   <http://example.org/ns#bar> <http://example.org/ns#>
    //   <http://other.example.org/ns#> . <http://other.example.org/ns#bar>
    //   <http://other.example.org/ns#> <http://example.org/ns#bar> .
    logger.debug("nt: " + nt);
    Assert.assertFalse(nt.isEmpty());
}
/**
 * Exercises the second code snippet shown in the <i>Developer Manual</i>:
 * a remote document is fetched over HTTP and serialized as N-Triples.
 * Skipped unless online tests are allowed.
 *
 * @throws Exception if there is an error reading the input
 */
@Test
public void testDemoCodeSnippet2() throws Exception {
    assumeOnlineAllowed();
    final Any23 any23 = new Any23();
    // The user agent is set before the HTTP client is requested, as in the manual.
    any23.setHTTPUserAgent("apache-any23-test-user-agent");
    final HTTPClient client = any23.getHTTPClient();
    final DocumentSource remote = new HTTPDocumentSource(client,
            "http://dbpedia.org/resource/Trento");
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    final TripleHandler writer = new NTriplesWriter(buffer);
    try {
        any23.extract(remote, writer);
    } finally {
        writer.close();
    }
    final String n3 = buffer.toString("UTF-8");
    // Sample of the output documented in the manual:
    //   <http://dbpedia.org/resource/Trent>
    //   <http://dbpedia.org/ontology/wikiPageDisambiguates>
    //   <http://dbpedia.org/resource/Trento> .
    //   <http://dbpedia.org/resource/Andrea_Pozzo>
    //   <http://dbpedia.org/ontology/birthPlace>
    //   <http://dbpedia.org/resource/Trento> .
    //   <http://dbpedia.org/resource/Union_for_Trentino>
    //   <http://dbpedia.org/ontology/headquarter>
    //   <http://dbpedia.org/resource/Trento> . [...]
    logger.debug("n3: " + n3);
    Assert.assertFalse(n3.isEmpty());
    final boolean hasMayorTriple = n3.contains("<http://dbpedia.org/resource/Trento> <http://dbpedia.org/property/mayor> \"Alessandro Andreatta\" .");
    Assert.assertTrue(hasMayorTriple);
}
/**
 * This test checks the extraction behavior when the library is used
 * programmatically. This test is related to the issue #45, to verify the
 * different behaviors between Maven and Ant. The behavior was related to a
 * 2nd-level dependency introduced by Maven.
 *
 * @throws org.apache.any23.extractor.ExtractionException if there is an error running extraction logic
 * @throws IOException if there is an error reading the input
 * @throws URISyntaxException if there is an error defining input URI's
 */
@Test
public void testProgrammaticExtraction() throws ExtractionException,
        IOException, URISyntaxException {
    Any23 any23 = new Any23();
    any23.setHTTPUserAgent("Any23-Servlet");
    // HTTP client with custom connection / socket timeouts.
    any23.setHTTPClient(new DefaultHTTPClient() {
        @Override
        protected int getConnectionTimeout() {
            return 5000;
        }

        @Override
        protected int getSoTimeout() {
            return 2000;
        }
    });

    // Handler chain: reporting -> accidental-RDFa filter -> N-Triples writer.
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    TripleHandler handler = new NTriplesWriter(byteArrayOutputStream);
    TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
    ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);

    DocumentSource source = getDocumentSourceFromResource(
            "/html/rdfa/ansa_2010-02-26_12645863.html",
            "http://host.com/service");

    Assert.assertTrue(any23.extract(source, reporting)
            .hasMatchingExtractors());
    try {
        handler.close();
    } catch (TripleHandlerException e) {
        // Fail the test but keep the original exception as the cause.
        throw new AssertionError(e.getMessage(), e);
    }

    // Decode explicitly as UTF-8, like the other tests, instead of relying
    // on the platform default charset.
    final String bufferContent = byteArrayOutputStream.toString("UTF-8");
    logger.debug(bufferContent);
    // assertEquals compares values; assertSame would compare boxed Integer
    // identities and only works by accident for small cached values.
    Assert.assertEquals("Unexpected number of triples.", 18,
            StringUtils.countNL(bufferContent));
}
/**
 * This test checks if a URL that is supposed to be GZIPPED is correctly
 * opened and parsed with the {@link Any23} facade. Skipped unless online
 * tests are allowed, or when the connection to the remote page times out.
 *
 * @throws org.apache.any23.extractor.ExtractionException if there is an error running extraction logic
 * @throws IOException if there is an error reading the input
 * @throws URISyntaxException if there is an error defining input URI's
 */
@Test
public void testGZippedContent() throws IOException, URISyntaxException,
        ExtractionException {
    assumeOnlineAllowed();
    final Any23 runner = new Any23();
    runner.setHTTPUserAgent("apache-any23-test-user-agent");
    DocumentSource source = new HTTPDocumentSource(runner.getHTTPClient(),
            "https://dev.w3.org/html5/rdfa/");
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    TripleHandler handler = new NTriplesWriter(out);
    try {
        runner.extract(source, handler);
    } catch (ConnectTimeoutException e) {
        // This page is down as of 2019.09.14
        logger.error("Connection to " + source.getDocumentIRI() + " timed out; skipping test", e);
        // Keep the timeout as the cause so the skip reason stays traceable.
        throw new AssumptionViolatedException(e.getMessage(), e);
    }
    // Close the writer before reading the buffer, as the other snippet
    // tests do, so the content is complete when it is inspected.
    try {
        handler.close();
    } catch (TripleHandlerException e) {
        throw new AssertionError(e.getMessage(), e);
    }
    String n3 = out.toString("UTF-8");
    logger.debug("N3 " + n3);
    Assert.assertTrue(n3.length() > 0);
}
/**
 * Extracts a local HTML resource with validation switched off and checks
 * the number of triples reaching a counting handler.
 *
 * @throws IOException if there is an error reading the input
 * @throws ExtractionException if there is an error running extraction logic
 * @throws TripleHandlerException if the handler chain cannot be closed
 */
@Test
public void testExtractionParameters() throws IOException,
        ExtractionException, TripleHandlerException {
    // not quite sure if following triples should be extracted
    // ?doc <http://www.w3.org/1999/xhtml/vocab#icon> <https://any23.googlecode.com/favicon.ico> .
    // ?doc <http://www.w3.org/1999/xhtml/vocab#stylesheet> <https://any23.googlecode.com/design/style.css> .
    final int expectedTriples = 12;

    final Any23 any23 = new Any23();
    final DocumentSource page = getDocumentSourceFromResource(
            "/org/apache/any23/validator/missing-og-namespace.html",
            "http://www.test.com");

    // Every triple is both counted and serialized as N-Triples.
    final ByteArrayOutputStream serialized = new ByteArrayOutputStream();
    final CountingTripleHandler counter = new CountingTripleHandler();
    final NTriplesWriter writer = new NTriplesWriter(serialized);
    final CompositeTripleHandler fanOut = new CompositeTripleHandler();
    fanOut.addChild(counter);
    fanOut.addChild(writer);

    final ExtractionParameters noValidation = new ExtractionParameters(
            DefaultConfiguration.singleton(), ValidationMode.NONE);
    try {
        any23.extract(noValidation, page, fanOut);
    } finally {
        fanOut.close();
    }

    logger.debug(serialized.toString());
    Assert.assertEquals("Unexpected number of triples.", expectedTriples,
            counter.getCount());

    // TODO: a second pass used to be sketched here (currently disabled):
    // reset the buffer, build a fresh counting + N-Triples composite
    // handler, extract the same source with validate-and-fix enabled and
    // expect expectedTriples + 5 triples.
}
/**
 * Extracts the same nested-microformats page twice with different
 * {@link ExtractionParameters} and checks the resulting triple counts:
 * the first pass (no validation, flag <code>true</code>) is expected to
 * yield three triples more than the second pass (validate-and-fix, flag
 * <code>false</code>).
 *
 * @throws IOException if there is an error reading the input
 * @throws ExtractionException if there is an error running extraction logic
 * @throws TripleHandlerException if a handler chain cannot be closed
 */
@Test
public void testExtractionParametersWithNestingDisabled()
        throws IOException, ExtractionException, TripleHandlerException {
    final int EXPECTED_TRIPLES = 20;
    Any23 runner = new Any23();
    DocumentSource source = getDocumentSourceFromResource(
            "/microformats/nested-microformats-a1.html",
            "http://www.test.com");
    ByteArrayOutputStream baos = new ByteArrayOutputStream();

    // First pass: count + RDF/XML output.
    // NOTE(review): the third ExtractionParameters argument is presumably
    // the nested-extraction switch - confirm against ExtractionParameters.
    CountingTripleHandler cth1 = new CountingTripleHandler();
    RDFXMLWriter ctw1 = new RDFXMLWriter(baos);
    CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
    compositeTH1.addChild(cth1);
    compositeTH1.addChild(ctw1);
    try {
        runner.extract(
                new ExtractionParameters(DefaultConfiguration.singleton(),
                        ValidationMode.NONE, true), source, compositeTH1);
    } finally {
        // Close even when extraction fails, as testExtractionParameters does.
        compositeTH1.close();
    }
    logger.debug("Out1: " + baos.toString());
    Assert.assertEquals("Unexpected number of triples.",
            EXPECTED_TRIPLES + 3, cth1.getCount());

    // Second pass: reuse the buffer, count + N-Triples output.
    baos.reset();
    CountingTripleHandler cth2 = new CountingTripleHandler();
    NTriplesWriter ctw2 = new NTriplesWriter(baos);
    CompositeTripleHandler compositeTH2 = new CompositeTripleHandler();
    compositeTH2.addChild(cth2);
    compositeTH2.addChild(ctw2);
    try {
        runner.extract(
                new ExtractionParameters(DefaultConfiguration.singleton(),
                        ValidationMode.VALIDATE_AND_FIX, false), source,
                compositeTH2);
    } finally {
        compositeTH2.close();
    }
    logger.debug("Out2: " + baos.toString());
    Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES,
            cth2.getCount());
}
/**
 * Extracts a Turtle resource and, if an {@link ExtractionException} is
 * raised, checks that its cause is an {@link RDFParseException}.
 * NOTE(review): the test also passes when no exception is thrown at all -
 * confirm whether a failure is actually expected for this resource.
 *
 * @throws IOException if there is an error reading the input
 */
@Test
public void testExceptionPropagation() throws IOException {
    final Any23 any23 = new Any23();
    final DocumentSource turtle = getDocumentSourceFromResource(
            "/application/turtle/geolinkeddata.ttl", "http://www.test.com");
    final CountingTripleHandler counter = new CountingTripleHandler();
    try {
        any23.extract(turtle, counter);
    } catch (ExtractionException extractionFailure) {
        final Throwable cause = extractionFailure.getCause();
        Assert.assertTrue(cause instanceof RDFParseException);
    }
}
/**
 * Verifies the handling of generic <i>XML</i> content: no extractor is
 * expected to match and no triple is expected to be produced.
 *
 * @throws org.apache.any23.extractor.ExtractionException if there is an error running extraction logic
 * @throws IOException if there is an error reading the input
 */
@Test
public void testXMLMimeTypeManagement() throws IOException,
        ExtractionException {
    final String iri = "http://www.test.com/resource.xml";
    final String mimeType = "application/xml";
    final String xml = StreamUtils.asString(
            this.getClass().getResourceAsStream("any23-xml-mimetype.xml"));
    final DocumentSource document = new StringDocumentSource(xml, iri, mimeType);

    final Any23 any23 = new Any23();
    final CountingTripleHandler counter = new CountingTripleHandler(false);
    final ReportingTripleHandler reporter = new ReportingTripleHandler(counter);

    final ExtractionReport outcome = any23.extract(document, reporter);
    Assert.assertFalse(outcome.hasMatchingExtractors());
    Assert.assertEquals(0, counter.getCount());
}
/**
 * Verifies the handling of generic <i>XML</i> content fetched from a
 * <i>URL</i>: no extractor is expected to match and no triple is expected
 * to be produced. Skipped unless online tests are allowed.
 *
 * @throws org.apache.any23.extractor.ExtractionException if there is an error running extraction logic
 * @throws IOException if there is an error reading the input
 */
@Test
public void testXMLMimeTypeManagementViaURL() throws IOException,
        ExtractionException {
    assumeOnlineAllowed();
    final Any23 any23 = new Any23();
    any23.setHTTPUserAgent("apache-any23-test-user-agent");

    // Initialise the facade's HTTP client with an "application/xml" configuration.
    final HTTPClient httpClient = any23.getHTTPClient();
    final HTTPClientConfiguration xmlConfiguration =
            new DefaultHTTPClientConfiguration("application/xml");
    httpClient.init(xmlConfiguration);

    final CountingTripleHandler counter = new CountingTripleHandler(false);
    final ReportingTripleHandler reporter = new ReportingTripleHandler(counter);
    final ExtractionReport outcome = any23.extract(
            "http://www.legislation.gov.uk/ukpga/2015/17/section/4/data.xml", reporter);

    Assert.assertFalse(outcome.hasMatchingExtractors());
    Assert.assertEquals(0, counter.getCount());
}
/**
 * Extracts a live web page and checks that at least one extractor matches.
 * Skipped unless online tests are allowed.
 *
 * @throws IOException if there is an error reading the input
 * @throws ExtractionException if there is an error running extraction logic
 */
@Test
public void testBlankNodesViaURL() throws IOException, ExtractionException {
    assumeOnlineAllowed();
    final Any23 any23 = new Any23();
    any23.setHTTPUserAgent("apache-any23-test-user-agent");

    final CountingTripleHandler counter = new CountingTripleHandler(false);
    final ReportingTripleHandler reporter = new ReportingTripleHandler(counter);
    final ExtractionReport outcome = any23.extract("https://www.w3.org/", reporter);

    Assert.assertTrue(outcome.hasMatchingExtractors());
}
/**
 * Checks that an HTML page carrying microdata activates the
 * {@link MicrodataExtractor}.
 *
 * @throws Exception if the resource cannot be read or extraction fails
 */
@Test
public void testMicrodataSupport() throws Exception {
    final String page = IOUtils.toString(
            getClass().getResourceAsStream("/microdata/microdata-basic.html"),
            StandardCharsets.UTF_8);
    assertExtractorActivation(page, MicrodataExtractor.class);
}
/**
 * Regression check for issue 186 (first sample): extracting the XHTML
 * resource must complete without raising an error. The serialized output
 * is only logged.
 *
 * @throws IOException if there is an error reading the input
 * @throws ExtractionException if there is an error running extraction logic
 */
@Test
public void testAbstractMethodErrorIssue186_1() throws IOException,
        ExtractionException {
    final Any23 any23 = new Any23();
    final String xhtml = FileUtils
            .readResourceContent("/html/rdfa/rdfa-issue186-1.xhtml");
    final DocumentSource document = new StringDocumentSource(xhtml,
            "http://base.com");
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    final TripleHandler writer = new NTriplesWriter(buffer);
    any23.extract(document, writer);
    logger.debug(buffer.toString("UTF-8"));
}
/**
 * Regression check for issue 186 (second sample): extracting the XHTML
 * resource must complete without raising an error. The serialized output
 * is only logged.
 *
 * @throws IOException if there is an error reading the input
 * @throws ExtractionException if there is an error running extraction logic
 */
@Test
public void testAbstractMethodErrorIssue186_2() throws IOException,
        ExtractionException {
    final Any23 any23 = new Any23();
    final String xhtml = FileUtils
            .readResourceContent("/html/rdfa/rdfa-issue186-2.xhtml");
    final DocumentSource document = new StringDocumentSource(xhtml,
            "http://richard.cyganiak.de/");
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    final TripleHandler writer = new NTriplesWriter(buffer);
    any23.extract(document, writer);
    logger.debug(buffer.toString("UTF-8"));
}
/**
 * Regression check for issue 183: with the
 * <code>any23.extraction.metadata.timesize</code> property set to
 * <code>off</code> on a copied configuration, the output must not mention
 * the sindice date / size predicates.
 *
 * @throws Exception if the resource cannot be read or extraction fails
 */
@Test
public void testModifiableConfiguration_issue183() throws Exception {
    final ModifiableConfiguration configuration = DefaultConfiguration.copy();
    configuration.setProperty("any23.extraction.metadata.timesize", "off");
    final Any23 any23 = new Any23(configuration);

    final String turtle = FileUtils.readResourceContent("/rdf/rdf-issue183.ttl");
    final DocumentSource document = new StringDocumentSource(turtle, "http://base.com");
    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    final TripleHandler writer = new NTriplesWriter(buffer);
    any23.extract(document, writer);
    writer.close();

    final String n3 = buffer.toString("UTF-8");
    logger.debug(n3);

    final boolean mentionsDate = n3.contains("http://vocab.sindice.net/date");
    Assert.assertFalse(
            "Should not contain triple with http://vocab.sindice.net/date",
            mentionsDate);
    final boolean mentionsSize = n3.contains("http://vocab.sindice.net/size");
    Assert.assertFalse(
            "Should not contain triple with http://vocab.sindice.net/size",
            mentionsSize);
}
/**
 * Regression check for issue 415 (invalid input): with only the N-Triples
 * extractor registered, the text resource must be detected as
 * <code>text/plain</code>, with no extractor issues and no matching
 * extractor.
 *
 * @throws Exception if the resource cannot be read or extraction fails
 */
@Test
public void testIssue415InvalidNTriples() throws Exception {
    final NTriplesExtractorFactory ntFactory = new NTriplesExtractorFactory();
    final Any23 any23 = new Any23(new ExtractorGroup(Collections.singleton(ntFactory)));

    final String text = IOUtils.resourceToString("/rdf/issue415.txt", StandardCharsets.UTF_8);
    final ExtractionReport outcome = any23.extract(
            text,
            "http://humanstxt.org/humans.txt",
            new CompositeTripleHandler());

    Assert.assertEquals("text/plain", outcome.getDetectedMimeType());
    Assert.assertEquals(0, outcome.getExtractorIssues(ntFactory.getExtractorName()).size());
    Assert.assertEquals(0, outcome.getMatchingExtractors().size());
}
/**
 * Regression check for issue 415 (valid input): with only the N-Triples
 * extractor registered, the resource must be detected as
 * <code>application/n-triples</code>, with no extractor issues, exactly
 * one matching extractor and exactly one triple.
 *
 * @throws Exception if the resource cannot be read or extraction fails
 */
@Test
public void testIssue415ValidNTriples() throws Exception {
    final NTriplesExtractorFactory ntFactory = new NTriplesExtractorFactory();
    final Any23 any23 = new Any23(new ExtractorGroup(Collections.singleton(ntFactory)));
    final CountingTripleHandler counter = new CountingTripleHandler();

    final String text = IOUtils.resourceToString("/rdf/issue415-valid.txt", StandardCharsets.UTF_8);
    final ExtractionReport outcome = any23.extract(
            text,
            "http://humanstxt.org/humans.txt",
            counter);

    Assert.assertEquals("application/n-triples", outcome.getDetectedMimeType());
    Assert.assertEquals(0, outcome.getExtractorIssues(ntFactory.getExtractorName()).size());
    Assert.assertEquals(1, outcome.getMatchingExtractors().size());
    Assert.assertEquals(1, counter.getCount());
}
/**
 * Runs detection and extraction over the given input string, using a
 * copy of the default configuration with validate-and-fix enabled, and
 * returns the resulting {@link ExtractionReport}.
 *
 * @param in
 *            input string.
 * @return a populated {@link org.apache.any23.ExtractionReport}
 * @throws Exception if there is an error detecting mime type and running extraction
 */
private ExtractionReport detectAndExtract(String in) throws Exception {
    final Any23 facade = new Any23();
    final Configuration configuration = DefaultConfiguration.copy();

    // Handler chain: reporting -> accidental-RDFa filter ->
    // empty-document-title filter -> N-Triples writer on an in-memory buffer.
    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    final NTriplesWriter writer = new NTriplesWriter(sink);
    final IgnoreTitlesOfEmptyDocuments titleFilter = new IgnoreTitlesOfEmptyDocuments(writer);
    final IgnoreAccidentalRDFa rdfaFilter = new IgnoreAccidentalRDFa(titleFilter);
    final ReportingTripleHandler reporter = new ReportingTripleHandler(rdfaFilter);

    final DocumentSource document = new StringDocumentSource(in, "http://host.com/path");
    return facade.extract(
            new ExtractionParameters(configuration, ValidationMode.VALIDATE_AND_FIX, null, null),
            document, reporter, "UTF-8");
}
/**
 * Asserts that at least one {@link Extractor} has been activated for the
 * given input data.
 *
 * @param in
 *            input data as string.
 * @throws Exception if there is an error detecting mime type and running extraction
 */
private void assertDetectionAndExtraction(String in) throws Exception {
    final boolean anyExtractorMatched = detectAndExtract(in).hasMatchingExtractors();
    Assert.assertTrue(
            "Detection and extraction failed, no matching extractors.",
            anyExtractorMatched);
}
/**
 * Asserts that every one of the given {@link Extractor} classes appears
 * among the extractors that matched the given input string.
 *
 * @param in
 *            input data as string.
 * @param expectedExtractors
 *            extractor classes that must have been activated.
 * @throws Exception if there is an error detecting mime type and running extraction
 */
private void assertExtractorActivation(String in,
        @SuppressWarnings("rawtypes") Class<? extends Extractor>... expectedExtractors) throws Exception {
    final ExtractionReport report = detectAndExtract(in);
    for (@SuppressWarnings("rawtypes") Class<? extends Extractor> wanted : expectedExtractors) {
        final String failureMessage = String.format(
                "Detection and extraction failed, expected extractor [%s] not found.",
                wanted);
        final boolean activated = containsClass(report.getMatchingExtractors(), wanted);
        Assert.assertTrue(failureMessage, activated);
    }
}
/**
 * Asserts the correct encoding detection for the specified data: the
 * resource is extracted into an in-memory repository and every statement
 * with the DCTerms title predicate must contain the expected text.
 *
 * @param encoding
 *            the encoding to pass to the extraction; if <code>null</code>
 *            it will be auto detected.
 * @param input
 *            classpath resource to extract.
 * @param expectedContent
 *            text that every extracted title value must contain.
 * @throws Exception if the resource cannot be read, the extraction fails
 *             or the repository cannot be accessed
 */
private void assertEncodingDetection(String encoding, String input, String expectedContent)
        throws Exception {
    final DocumentSource fileDocumentSource = getDocumentSourceFromResource(input);
    final Any23 any23 = new Any23();
    RepositoryConnection conn = null;
    RepositoryWriter repositoryWriter = null;

    final Repository store = new SailRepository(new MemoryStore());
    store.init();
    try {
        conn = store.getConnection();
        repositoryWriter = new RepositoryWriter(conn);
        Assert.assertTrue(any23.extract(fileDocumentSource, repositoryWriter, encoding).hasMatchingExtractors());

        final RepositoryResult<Statement> statements = conn.getStatements(null, vDCTERMS.title, null, false);
        try {
            while (statements.hasNext()) {
                final Statement statement = statements.next();
                printStatement(statement);
                Assert.assertTrue(statement.getObject().stringValue().contains(expectedContent));
            }
        } finally {
            statements.close();
        }
    } finally {
        // Release everything in the original order (connection, writer) and
        // finally shut the repository down; each step runs even if the
        // previous one throws.
        try {
            if (conn != null) {
                conn.close();
            }
        } finally {
            try {
                if (repositoryWriter != null) {
                    repositoryWriter.close();
                }
            } finally {
                store.shutDown();
            }
        }
    }
}
/**
 * Extracts <i>content</i> with an {@link Any23} instance restricted to the
 * named extractors (or the default set when none is given) and asserts
 * that more than ten characters of N-Triples output are produced. When
 * extractors are named, the MIME type detector is removed.
 *
 * @param content
 *            the document to extract.
 * @param parsers
 *            names of the extractors to use; may be empty.
 * @throws Exception if the extraction fails
 */
private void assertDetection(String content, String... parsers)
        throws Exception {
    final boolean extractorsNamed = parsers.length != 0;
    final String[] extractorNames = extractorsNamed ? parsers : null;

    final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    final Any23 any23 = new Any23(extractorNames);
    if (extractorsNamed) {
        // Use all the provided extractors.
        any23.setMIMETypeDetector(null);
    }

    final NTriplesWriter writer = new NTriplesWriter(buffer);
    any23.extract(new StringDocumentSource(content, PAGE_URL), writer);
    writer.close();

    final String serialized = buffer.toString("us-ascii");
    Assert.assertNotNull(serialized);
    Assert.assertTrue(serialized.length() > 10);
}
/**
 * Logs the subject, predicate and object of a statement, tab separated,
 * at debug level.
 *
 * @param statement
 *            the statement to log.
 */
private void printStatement(Statement statement) {
    final String line = String.format("%s\t%s\t%s",
            statement.getSubject(),
            statement.getPredicate(),
            statement.getObject());
    logger.debug(line);
}
/**
 * Tells whether the list holds at least one element whose runtime class
 * is exactly the given class (subclasses do not count).
 *
 * @param list
 *            elements to inspect.
 * @param clazz
 *            the class to look for.
 * @return <code>true</code> if such an element exists.
 */
private boolean containsClass(List<?> list, Class<?> clazz) {
    boolean found = false;
    for (Object element : list) {
        if (element.getClass().equals(clazz)) {
            found = true;
            break;
        }
    }
    return found;
}
}