blob: 2b14cb4aa08fa97dc9a29ff20c7bdc294542d767 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.htmlextractor;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.io.InputStream;
import java.util.Set;
import org.apache.clerezza.commons.rdf.Graph;
import org.apache.clerezza.commons.rdf.BlankNodeOrIRI;
import org.apache.clerezza.commons.rdf.Triple;
import org.apache.clerezza.commons.rdf.IRI;
import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ClerezzaRDFUtils;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ExtractorException;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractionRegistry;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlParser;
import org.apache.stanbol.enhancer.engines.htmlextractor.impl.InitializationException;
import org.junit.BeforeClass;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Tests for the {@link HtmlExtractor}. Each test loads an HTML document from the
 * test classpath, runs the extractor configured by {@code htmlextractors.xml}
 * over it and checks the triples added to a fresh {@link SimpleGraph}.
 *
 * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
 *
 */
public class TestHtmlExtractor {

    private static final Logger LOG = LoggerFactory.getLogger(TestHtmlExtractor.class);

    private static HtmlParser parser;
    private static HtmlExtractionRegistry registry;

    // define the Nepomuks NIE namespace locally here
    private static final String NIE_NS = "http://www.semanticdesktop.org/ontologies/2007/01/19/nie#";

    /**
     * Creates the extraction registry and the HTML parser shared by all tests.
     *
     * @throws IOException if the registry cannot be initialized; the original
     *         {@link InitializationException} is attached as the cause
     */
    @BeforeClass
    public static void oneTimeSetup() throws IOException {
        try {
            registry = new HtmlExtractionRegistry("htmlextractors.xml");
        } catch (InitializationException e) {
            // Pass the exception itself so the stack trace is neither lost in the
            // log nor stripped from the rethrown IOException.
            LOG.error("Registry Initialization Error: " + e.getMessage(), e);
            throw new IOException(e.getMessage(), e);
        }
        parser = new HtmlParser();
    }

    /**
     * This tests the RDFa extraction.
     *
     * @throws ExtractorException if there is an error during extraction
     * @throws IOException if there is an error when reading the document
     */
    @Test
    public void testRdfaExtraction() throws Exception {
        String testFile = "test-rdfa.html";
        Graph model = extractTriples(testFile);
        // show triples
        int tripleCounter = model.size();
        LOG.debug("RDFa triples: {}", tripleCounter);
        printTriples(model);
        assertEquals(8, tripleCounter);
        // No assertion follows: this only checks that connecting the graph does not fail.
        connectToDocument(model, testFile);
    }

    /**
     * This tests some Microformat extraction.
     *
     * @throws ExtractorException if there is an error during extraction
     * @throws IOException if there is an error when reading the document
     */
    @Test
    public void testMFExtraction() throws Exception {
        String testFile = "test-MF.html";
        Graph model = extractTriples(testFile);
        // show triples
        int tripleCounter = model.size();
        LOG.debug("Microformat triples: {}", tripleCounter);
        printTriples(model);
        assertEquals(127, tripleCounter);
        // No assertion follows: this only checks that connecting the graph does not fail.
        connectToDocument(model, testFile);
    }

    /**
     * This tests some extraction of microdata from an HTML-5 document.
     *
     * @throws Exception if loading the document or the extraction fails
     */
    @Test
    public void testMicrodataExtraction() throws Exception {
        String testFile = "test-microdata.html";
        Graph model = extractTriples(testFile);
        // show triples
        int tripleCounter = model.size();
        LOG.debug("Microdata triples: {}", tripleCounter);
        printTriples(model);
        assertEquals(91, tripleCounter);
        // No assertion follows: this only checks that connecting the graph does not fail.
        connectToDocument(model, testFile);
    }

    /**
     * This tests the merging of disconnected graphs under a single root.
     *
     * @throws Exception if loading the document or the extraction fails
     */
    @Test
    public void testRootExtraction() throws Exception {
        String testFile = "test-MultiRoot.html";
        Graph model = extractTriples(testFile);
        // show triples
        int tripleCounter = model.size();
        LOG.debug("Triples: {}", tripleCounter);
        printTriples(model);
        // The extracted graph is expected to have several roots before connecting ...
        Set<BlankNodeOrIRI> roots = ClerezzaRDFUtils.findRoots(model);
        assertTrue(roots.size() > 1);
        // ... and exactly one afterwards.
        connectToDocument(model, testFile);
        roots = ClerezzaRDFUtils.findRoots(model);
        assertEquals(1, roots.size());
    }

    /**
     * Loads {@code testFile} from the test classpath and runs a new
     * {@link HtmlExtractor} over it as {@code text/html}, using
     * {@code "file://" + testFile} as the document id.
     *
     * @param testFile name of the classpath resource to extract from
     * @return a new graph holding the triples written by the extractor
     * @throws Exception if the extractor cannot be created or the extraction fails
     */
    private Graph extractTriples(String testFile) throws Exception {
        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
        Graph model = new SimpleGraph();
        // try-with-resources closes the stream even when the extraction or the
        // assertion fails; a null resource is simply skipped on close.
        try (InputStream in = getResourceAsStream(testFile)) {
            assertNotNull("failed to load resource " + testFile, in);
            extractor.extract("file://" + testFile, in, null, "text/html", model);
        }
        return model;
    }

    /**
     * Calls {@link ClerezzaRDFUtils#makeConnected} on the model with the document
     * IRI {@code "file://" + testFile} and the NIE {@code contains} property.
     *
     * @param model the graph to connect
     * @param testFile name of the test resource the graph was extracted from
     * @throws Exception if connecting the graph fails
     */
    private void connectToDocument(Graph model, String testFile) throws Exception {
        ClerezzaRDFUtils.makeConnected(model, new IRI("file://" + testFile), new IRI(NIE_NS + "contains"));
    }

    /**
     * Opens a resource through this class' class loader.
     *
     * @param testResultFile name of the resource
     * @return the resource stream, or {@code null} if the resource is not found
     */
    private InputStream getResourceAsStream(String testResultFile) {
        return this.getClass().getClassLoader().getResourceAsStream(
            testResultFile);
    }

    /**
     * Logs every triple of the model at debug level.
     *
     * @param model the graph whose triples are logged
     */
    private void printTriples(Graph model) {
        for (Triple t : model) {
            // Parameterized form: the triple is only rendered when debug is enabled.
            LOG.debug("{}", t);
        }
    }
}