blob: a22f7dbbbc5bab5158637bdbb68352a00a689b7f [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor;
import org.apache.any23.AbstractAny23TestBase;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.configuration.ModifiableConfiguration;
import org.apache.any23.extractor.html.HTMLFixture;
import org.apache.any23.mime.TikaMIMETypeDetector;
import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
import org.apache.any23.vocab.ICAL;
import org.apache.any23.vocab.Review;
import org.apache.any23.vocab.SINDICE;
import org.apache.any23.vocab.VCard;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.RDFXMLWriter;
import org.apache.any23.writer.RepositoryWriter;
import org.apache.any23.writer.TripleHandlerException;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.repository.RepositoryConnection;
import org.eclipse.rdf4j.repository.RepositoryException;
import org.eclipse.rdf4j.repository.RepositoryResult;
import org.eclipse.rdf4j.repository.sail.SailRepository;
import org.eclipse.rdf4j.sail.Sail;
import org.eclipse.rdf4j.sail.SailException;
import org.eclipse.rdf4j.sail.memory.MemoryStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
/**
* Test case for {@link SingleDocumentExtraction}.
*
* @author Michele Mostarda (mostarda@fbk.eu)
* @author Davide Palmisano (palmisano@fbk.eu)
*/
// TODO #20 - Solve issue that hreview item and vcard item have the same BNode due they have the same XPath DOM.
public class SingleDocumentExtractionTest extends AbstractAny23TestBase {
private static final SINDICE vSINDICE = SINDICE.getInstance();
private static final ICAL vICAL = ICAL.getInstance();
private static final Review vREVIEW = Review.getInstance();
private static final VCard vVCARD = VCard.getInstance();
private static final Logger logger = LoggerFactory.getLogger(SingleDocumentExtractionTest.class);
private SingleDocumentExtraction singleDocumentExtraction;
private ExtractorGroup extractorGroup;
private Sail store;
private RepositoryConnection conn;
RepositoryWriter repositoryWriter;
ByteArrayOutputStream baos;
RDFXMLWriter rdfxmlWriter;
@Before
public void setUp() throws Exception {
super.setUp();
extractorGroup = ExtractorRegistryImpl.getInstance().getExtractorGroup();
store = new MemoryStore();
store.init();
conn = new SailRepository(store).getConnection();
}
@After
public void tearDown() throws SailException, RepositoryException, TripleHandlerException {
rdfxmlWriter.close();
repositoryWriter.close();
logger.debug(baos.toString());
singleDocumentExtraction = null;
extractorGroup = null;
conn.close();
conn = null;
store.shutDown();
store = null;
}
/**
* Tests the existence of the domain triples.
*
* @throws IOException if there is an error loading input data
* @throws ExtractionException if an exception is raised during extraction
* @throws RepositoryException if an error is encountered whilst loading content from a storage connection
*/
@Test
public void testMicroformatDomains() throws IOException, ExtractionException, RepositoryException {
singleDocumentExtraction = getInstance("/microformats/microformat-domains.html");
singleDocumentExtraction.run();
logStorageContent();
assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
}
/**
* Tests the nested microformat relationships. This test verifies the first supported approach
* for microformat nesting. Such approach foreseen to add a microformat HTML node within the
* property of a container microformat.
*
* For further details see
* {@link SingleDocumentExtraction}
* consolidateResources(java.util.List, java.util.List, org.apache.any23.writer.TripleHandler)}
*
* @throws IOException if there is an error loading input data
* @throws ExtractionException if an exception is raised during extraction
* @throws RepositoryException if an error is encountered whilst loading content from a storage connection
*/
@Test
public void testNestedMicroformats() throws IOException, ExtractionException, RepositoryException {
singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html");
singleDocumentExtraction.run();
logStorageContent();
assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
}
/**
* This test assess the absence of {@link SINDICE} <i>nesting</i> relationship,
* since {@link org.apache.any23.extractor.html.HCardExtractor} declared a native nesting
* with the {@link org.apache.any23.extractor.html.AdrExtractor}.
*
* @see org.apache.any23.extractor.html.annotations.Includes
* @throws IOException if there is an error loading input data
* @throws ExtractionException if an exception is raised during extraction
* @throws RepositoryException if an error is encountered whilst loading content from a storage connection
*/
@Test
public void testNestedVCardAdr() throws IOException, ExtractionException, RepositoryException {
singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html");
singleDocumentExtraction.run();
logStorageContent();
assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), (Value) null, 0);
assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null, 0);
}
/**
* Tests the nested microformat relationships. This test verifies the second supported approach
* for microformat nesting. Such approach foreseen to use the same node attributes to declare both
* a microformat container property and a nested microformat root class.
*
* For further details see
* {@link SingleDocumentExtraction}
* consolidateResources(java.util.List, java.util.List, org.apache.any23.writer.TripleHandler)}
*
* See also the <a href="http://www.google.com/support/webmasters/bin/answer.py?answer=146862">Nested Entities</a>
* article that is linked by the official microformats.org doc page.
*
* @throws IOException if there is an error loading input data
* @throws ExtractionException if an exception is raised during extraction
* @throws RepositoryException if an error is encountered whilst loading content from a storage connection
*/
@Test
public void testNestedMicroformatsInduced() throws IOException, ExtractionException, RepositoryException {
singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html");
singleDocumentExtraction.run();
logStorageContent();
assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
}
/**
* Tests the nested microformat relationships. This test verifies the behavior of the nested microformats
* when the nesting relationship is handled by the microformat extractor itself (like the HReview that is
* able to detect an inner VCard).
*
* @throws IOException if there is an error loading input data
* @throws ExtractionException if an exception is raised during extraction
* @throws RepositoryException if an error is encountered whilst loading content from a storage connection
*/
@Test
/* NOTE: The triple (bnode http://www.w3.org/2006/vcard/ns#url http://pizza.example.com) and
* (bnode http://vocab.sindice.net/nesting_original (structured) *) are printed out twice,
* once for every extractor. The RDFWriter doesn't remove the duplicates and some graph renderers
* show the triple property as double. Despite this the model contains it just once.
*/
public void testNestedMicroformatsManaged() throws IOException, ExtractionException, RepositoryException {
singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html");
singleDocumentExtraction.run();
logStorageContent();
assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 3);
assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING), (Value) null, 1);
assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
assertTripleCount(vVCARD.url, (Value) null, 1);
Value object = getTripleObject(null, vREVIEW.hasReview);
assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), object , 1);
assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL) , vREVIEW.hasReview, 1);
}
private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
baos = new ByteArrayOutputStream();
rdfxmlWriter = new RDFXMLWriter(baos);
repositoryWriter = new RepositoryWriter(conn);
final CompositeTripleHandler cth = new CompositeTripleHandler();
cth.addChild(rdfxmlWriter);
cth.addChild(repositoryWriter);
final ModifiableConfiguration configuration = DefaultConfiguration.copy();
configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
SingleDocumentExtraction instance = new SingleDocumentExtraction(
configuration,
new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"),
extractorGroup,
cth
);
instance.setMIMETypeDetector( new TikaMIMETypeDetector(new WhiteSpacesPurifier()) );
return instance;
}
/**
* Logs the storage content.
* @throws RepositoryException if an error is encountered whilst loading content from a storage connection
*/
private void logStorageContent() throws RepositoryException {
RepositoryResult<Statement> result = conn.getStatements(null, null, null, false);
while (result.hasNext()) {
Statement statement = result.next();
logger.debug( statement.toString() );
}
}
/**
* Asserts that the triple pattern is present within the storage exactly n times.
*
* @param predicate
* @param value
* @param occurrences
* @throws RepositoryException
*/
private void assertTripleCount(IRI predicate, Value value, int occurrences) throws RepositoryException {
RepositoryResult<Statement> statements = conn.getStatements(
null, predicate, value, false
);
int count = 0;
while (statements.hasNext()) {
statements.next();
count++;
}
Assert.assertEquals(
String.format("Cannot find triple (* %s %s) %d times", predicate, value, occurrences),
occurrences,
count
);
}
/**
* Asserts that the triple pattern is present within the storage exactly n times.
*
* @param predicate
* @param value
* @param occurrences
* @throws RepositoryException
*/
private void assertTripleCount(IRI predicate, String value, int occurrences) throws RepositoryException {
assertTripleCount(predicate, SimpleValueFactory.getInstance().createLiteral(value), occurrences);
}
/**
* Asserts that a triple exists exactly once.
*
* @param predicate
* @param value
* @throws RepositoryException
*/
private void assertTriple(IRI predicate, Value value) throws RepositoryException {
assertTripleCount(predicate, value, 1);
}
/**
* Asserts that a triple exists exactly once.
*
* @param predicate
* @param value
* @throws RepositoryException
*/
@SuppressWarnings("unused")
private void assertTriple(IRI predicate, String value) throws RepositoryException {
assertTriple(predicate, SimpleValueFactory.getInstance().createLiteral(value) );
}
/**
* Retrieves the triple object matching with the given pattern that is expected to be just one.
*
* @param sub the triple subject, <code>null</code> for any.
* @param prop the triple property, <code>null</code> for any.
* @return the object of the unique triple matching the given pattern.
* @throws RepositoryException if an error occurred during the search.
*/
private Value getTripleObject(Resource sub, IRI prop) throws RepositoryException {
RepositoryResult<Statement> statements = conn.getStatements(sub, prop, null, false);
Assert.assertTrue(statements.hasNext());
Statement statement = statements.next();
Value value = statement.getObject();
Assert.assertFalse( "Expected just one result.", statements.hasNext() );
statements.close();
return value;
}
}