blob: 672fcbe5be6ea4b8d031e9f10c1100816b3d7c20 [file] [log] [blame]
/*
* Copyright 2008-2010 Digital Enterprise Research Institute (DERI)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.deri.any23;
import junit.framework.Assert;
import org.deri.any23.extractor.ExtractionException;
import org.deri.any23.filter.IgnoreAccidentalRDFa;
import org.deri.any23.filter.IgnoreTitlesOfEmptyDocuments;
import org.deri.any23.http.DefaultHTTPClient;
import org.deri.any23.http.HTTPClient;
import org.deri.any23.source.DocumentSource;
import org.deri.any23.source.FileDocumentSource;
import org.deri.any23.source.HTTPDocumentSource;
import org.deri.any23.source.StringDocumentSource;
import org.deri.any23.vocab.DCTERMS;
import org.deri.any23.writer.*;
import org.junit.Test;
import org.openrdf.model.Statement;
import org.openrdf.repository.RepositoryConnection;
import org.openrdf.repository.RepositoryException;
import org.openrdf.repository.RepositoryResult;
import org.openrdf.repository.sail.SailRepository;
import org.openrdf.sail.Sail;
import org.openrdf.sail.SailException;
import org.openrdf.sail.memory.MemoryStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.net.URI;
import java.net.URISyntaxException;
/**
* Test case for {@link org.deri.any23.Any23} facade.
* @author Davide Palmisano ( dpalmisano@gmail.com )
* @author Michele Mostarda ( michele.mostarda@gmail.com )
*/
public class Any23Test {
private static final Logger logger = LoggerFactory.getLogger(Any23Test.class);
private String url = "http://bob.com";
@Test
public void testTTLDetection() throws Exception {
assertDetection("<a> <b> <c> .", "rdf-turtle");
}
@Test
public void testN3Detection1() throws Exception {
assertDetection("<Bob><brothers>(<Jim><Mark>).", "rdf-turtle");
}
@Test
public void testN3Detection2() throws Exception {
assertDetection(
"<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .",
"rdf-nt"
);
}
@Test
public void testHTMLBruteForceDetection() throws Exception {
assertDetection("<html><body><div class=\"vcard fn\">Joe</div></body></html>");
}
/**
* This tests the behavior of <i>Any23</i> to execute the extraction explicitly specifying the charset
* encoding of the input.
*
* @throws ExtractionException
* @throws IOException
* @throws SailException
* @throws RepositoryException
*/
@Test
public void testExplicitEncoding()
throws ExtractionException, IOException, SailException, RepositoryException {
assertEncodingDetection(
"UTF-8",
new File("src/test/resources/html/encoding-test.html"),
"Knud M\u00F6ller"
);
}
/**
* This tests the behavior of <i>Any23</i> to perform the extraction without passing it any charset encoding.
* The encoding is therefore guessed using {@link org.deri.any23.encoding.TikaEncodingDetector} class.
*
* @throws ExtractionException
* @throws IOException
* @throws SailException
* @throws RepositoryException
*/
@Test
public void testImplicitEncoding() throws ExtractionException, IOException, SailException, RepositoryException {
assertEncodingDetection(
null, // The encoding will be auto detected.
new File("src/test/resources/html/encoding-test.html"),
"Knud M\u00F6ller"
);
}
@Test
public void testRDFXMLDetectionAndExtraction() throws IOException, ExtractionException {
String rdfXML =
"<?xml version='1.0'?> " +
"<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' xmlns:dc='http://purl.org/dc/elements/1.1/'>" +
"<rdf:Description rdf:about='http://www.example.com'>" +
"<dc:title>x</dc:title>" +
"</rdf:Description>" +
"</rdf:RDF>";
assertDetectionAndExtraction(rdfXML);
}
@Test
public void testNTriplesDetectionAndExtraction() throws IOException, ExtractionException {
String n3 = "<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"n3 . appo\" .";
assertDetectionAndExtraction(n3);
}
@Test
public void testNturtleDetectionAndExtraction() throws IOException, ExtractionException {
String nTurtle =
"@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n" +
"@prefix dc: <http://purl.org/dc/elements/1.1/> .\n" +
"@prefix ex: <http://example.org/stuff/1.0/> .\n" +
"\n" +
"<http://www.w3.org/TR/rdf-syntax-grammar>\n" +
" dc:title \"RDF/XML Syntax Specification (Revised)\" ;\n" +
" ex:editor [\n" +
" ex:fullname \"Dave Beckett\";\n" +
" ex:homePage <http://purl.org/net/dajobe/>\n" +
" ] .";
assertDetectionAndExtraction(nTurtle);
}
/**
* Tests out the first code snipped used in <i>Developer Manual</i>.
*
* @throws IOException
* @throws ExtractionException
*/
@Test
public void testDemoCodeSnippet1() throws IOException, ExtractionException {
/*1*/ Any23 runner = new Any23();
/*2*/ final String content = "@prefix foo: <http://example.org/ns#> . " +
"@prefix : <http://other.example.org/ns#> ." +
"foo:bar foo: : . " +
":bar : foo:bar . ";
// The second argument of StringDocumentSource() must be a valid URI.
/*3*/ DocumentSource source = new StringDocumentSource(content, "http://host.com/service");
/*4*/ ByteArrayOutputStream out = new ByteArrayOutputStream();
/*5*/ TripleHandler handler = new NTriplesWriter(out);
/*6*/ runner.extract(source, handler);
/*7*/ String n3 = out.toString("UTF-8");
/*
<http://example.org/ns#bar> <http://example.org/ns#> <http://other.example.org/ns#> .
<http://other.example.org/ns#bar> <http://other.example.org/ns#> <http://example.org/ns#bar> .
*/
System.out.println("n3: " + n3);
Assert.assertTrue(n3.length() > 0);
}
/**
* Tests out the second code snipped used in <i>Developer Manual</i>.
*
* @throws IOException
* @throws ExtractionException
*/
// Deactivated to avoid test dependency on external resources.
// @Test
public void testDemoCodeSnippet2()
throws IOException, ExtractionException, URISyntaxException, SailException, RepositoryException {
/*1*/ Any23 runner = new Any23();
/*2*/ runner.setHTTPUserAgent("test-user-agent");
/*3*/ HTTPClient httpClient = runner.getHTTPClient();
/*4*/ DocumentSource source = new HTTPDocumentSource(
httpClient,
"http://www.rentalinrome.com/semanticloft/semanticloft.htm"
);
/*5*/ ByteArrayOutputStream out = new ByteArrayOutputStream();
/*6*/ TripleHandler handler = new NTriplesWriter(out);
/*7*/ runner.extract(source, handler);
/*8*/ String n3 = out.toString("UTF-8");
/*
<http://www.rentalinrome.com/semanticloft/semanticloft.htm>
<http://purl.org/dc/terms/title>
"Semantic Loft (beta) - Trastevere apartments | Rental in Rome - rentalinrome.com" .
[...]
*/
System.out.println("N3 "+ n3);
Assert.assertTrue(n3.length() > 0);
}
/**
* This test checks the extraction behavior when the library is used programatically.
* This test is related to the issue #45, to verify the different behaviors between Maven and Ant.
* The behavior was related to a 2nd-level dependency introduced by Maven.
*
* @throws ExtractionException
* @throws IOException
* @throws URISyntaxException
*/
@Test
public void testProgrammaticExtraction() throws ExtractionException, IOException, URISyntaxException {
Any23 any23 = new Any23();
any23.setHTTPUserAgent("Any23-Servlet");
any23.setHTTPClient(new DefaultHTTPClient() {
@Override
protected int getConnectionTimeout() {
return 5000;
}
@Override
protected int getSoTimeout() {
return 2000;
}
});
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
TripleHandler handler = new RDFXMLWriter(byteArrayOutputStream);
TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);
DocumentSource source = new FileDocumentSource(
new File("src/test/resources/html/rdfa/ansa_2010-02-26_12645863.html"),
"http://host.com/service");
Assert.assertTrue(any23.extract(source, reporting));
handler.close();
String bufferContent = byteArrayOutputStream.toString();
System.out.println(bufferContent);
int i = 0;
int counter = 0;
while( i < bufferContent.length() ) {
i = bufferContent.indexOf("\n", i);
if(i == -1) {
break;
}
counter++;
i++;
}
Assert.assertSame("Unexpected number of triples.", 38, counter);
}
private void assertDetectionAndExtraction(String in) throws IOException, ExtractionException {
Any23 any23 = new Any23();
ByteArrayOutputStream out = new ByteArrayOutputStream();
ReportingTripleHandler outputHandler = new ReportingTripleHandler(
new IgnoreAccidentalRDFa(
new IgnoreTitlesOfEmptyDocuments(
new NTriplesWriter(out)
)
)
);
Assert.assertTrue(
"Detection and extraction failed.",
any23.extract(in, "http://host.com/path", outputHandler)
);
}
/**
* Asserts the correct encoding detection for a specified data.
*
* @param encoding the expected specified encoding, if <code>null</code> will be auto detected.
* @throws IOException
* @throws ExtractionException
* @throws RepositoryException
* @throws SailException
*/
private void assertEncodingDetection(String encoding, File input, String expectedContent)
throws IOException, ExtractionException, RepositoryException, SailException {
FileDocumentSource fileDocumentSource;
Any23 any23;
RepositoryConnection conn;
RepositoryWriter repositoryWriter;
fileDocumentSource = new FileDocumentSource(input);
any23 = new Any23();
Sail store = new MemoryStore();
store.initialize();
conn = new SailRepository(store).getConnection();
repositoryWriter = new RepositoryWriter(conn);
Assert.assertTrue(any23.extract(fileDocumentSource, repositoryWriter, encoding));
RepositoryResult<Statement> statements = conn.getStatements(null, DCTERMS.title, null, false);
try {
while (statements.hasNext()) {
Statement statement = statements.next();
printStatement(statement);
org.junit.Assert.assertTrue(statement.getObject().stringValue().contains(expectedContent));
}
} finally {
statements.close();
}
fileDocumentSource = null;
any23 = null;
conn.close();
repositoryWriter.close();
}
private void printStatement(Statement statement) {
logger.info(String.format("%s\t%s\t%s",
statement.getSubject(),
statement.getPredicate(),
statement.getObject()));
}
/**
* Will try to detect the <i>content</i> trying sequentially with all
* specified parser.
*
* @param content
* @param parsers
* @throws Exception
*/
private void assertDetection(String content, String... parsers) throws Exception {
ByteArrayOutputStream out = new ByteArrayOutputStream();
Any23 runner = new Any23(parsers.length == 0 ? null : parsers);
if (parsers.length != 0) {
runner.setMIMETypeDetector(null); // Use all the provided extractors.
}
runner.extract(new StringDocumentSource(content, url), new NTriplesWriter(out));
String result = out.toString("us-ascii");
Assert.assertNotNull(result);
Assert.assertTrue(result.length() > 10);
}
}