blob: 7cb09206cf77377d2b1b1a21e6c7da2b04a166b8 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.metaxa.core;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import org.apache.clerezza.rdf.core.BNode;
import org.apache.clerezza.rdf.core.NonLiteral;
import org.apache.clerezza.rdf.core.Resource;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.io.IOUtils;
import org.apache.stanbol.enhancer.engines.metaxa.MetaxaEngine;
import org.junit.BeforeClass;
import org.junit.Test;
import org.ontoware.aifbcommons.collection.ClosableIterator;
import org.ontoware.rdf2go.model.Model;
import org.ontoware.rdf2go.model.Statement;
import org.ontoware.rdf2go.model.node.BlankNode;
import org.ontoware.rdf2go.model.node.Variable;
import org.ontoware.rdf2go.model.node.impl.URIImpl;
import org.semanticdesktop.aperture.extractor.ExtractorException;
import org.semanticdesktop.aperture.vocabulary.NMO;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
/**
 * {@link TestMetaxaCore} is a test class for {@link MetaxaCore}.
 * <p>
 * Each extraction test loads a document from the test classpath, runs it through the
 * extractor and checks the result: either the extracted plain text and the number of
 * statements convertible to triples, or the presence of a specific property.
 *
 * @author Joerg Steffen, DFKI
 * @version $Id$
 */
public class TestMetaxaCore {

    /**
     * This contains the logger.
     */
    private static final Logger LOG = LoggerFactory.getLogger(TestMetaxaCore.class);

    /**
     * This contains the Metaxa extractor to test; created once for all tests.
     */
    private static MetaxaCore extractor;

    /**
     * This initializes the extractor from the {@code extractionregistry.xml} configuration.
     *
     * @throws IOException if the extractor cannot be set up
     */
    @BeforeClass
    public static void oneTimeSetUp() throws IOException {
        extractor = new MetaxaCore("extractionregistry.xml");
    }

    /**
     * This tests the pdf extraction.
     *
     * @throws Exception if there is an error during extraction (e.g. an
     *             {@link ExtractorException}) or when reading a resource
     */
    @Test
    public void testPdfExtraction() throws Exception {
        assertExtraction("test.pdf", "pdf-res.txt", "application/pdf", 11);
    }

    /**
     * This tests the html extraction.
     *
     * @throws Exception if there is an error during extraction (e.g. an
     *             {@link ExtractorException}) or when reading a resource
     */
    @Test
    public void testHtmlExtraction() throws Exception {
        assertExtraction("test.html", "html-res.txt", "text/html", 28);
    }

    /**
     * This tests the extraction from RDFa annotated html.
     *
     * @throws Exception if there is an error during extraction (e.g. an
     *             {@link ExtractorException}) or when reading a resource
     */
    @Test
    public void testRdfaExtraction() throws Exception {
        assertExtraction("test-rdfa.html", "rdfa-res.txt", "text/html", 10);
    }

    /**
     * This tests the extraction of a multipart mail: the resulting model must contain
     * at least one {@link NMO#plainTextMessageContent} statement.
     *
     * @throws Exception if there is an error during extraction (e.g. an
     *             {@link ExtractorException}) or when reading the document
     */
    @Test
    public void testMailExtraction() throws Exception {
        String testFile = "mail-multipart-test.eml";
        Model m = extract(testFile, "message/rfc822");
        assertTrue("no plain text message content extracted from " + testFile,
            m.contains(Variable.ANY, NMO.plainTextMessageContent, Variable.ANY));
    }

    /**
     * This extracts the given test document and compares the extracted text and the
     * number of convertible statements with the expected values.
     *
     * @param testFile name of the classpath resource to extract from
     * @param testResultFile name of the classpath resource with the expected text (UTF-8)
     * @param mimeType mime type passed to the extractor
     * @param expectedTriples expected number of statements that can be converted to triples
     *
     * @throws Exception if there is an error during extraction or when reading a resource
     */
    private void assertExtraction(String testFile, String testResultFile, String mimeType,
            int expectedTriples) throws Exception {
        // extract text from the document
        Model m = extract(testFile, mimeType);
        String text = MetaxaCore.getText(m);
        // get expected result
        String expectedText = readResource(testResultFile);
        // compare ignoring control characters
        assertEquals(cleanup(expectedText), cleanup(text));
        // show and count triples
        assertEquals(expectedTriples, printTriples(m));
    }

    /**
     * This runs the extractor on a classpath resource, using {@code "file://" + testFile}
     * as document id. The resource stream is always closed afterwards.
     *
     * @param testFile name of the classpath resource to extract from
     * @param mimeType mime type passed to the extractor
     *
     * @return the {@link Model} returned by the extractor
     *
     * @throws Exception if there is an error during extraction or when reading the document
     */
    private Model extract(String testFile, String mimeType) throws Exception {
        InputStream in = getResourceAsStream(testFile);
        assertNotNull("failed to load resource " + testFile, in);
        try {
            return extractor.extract(in, new URIImpl("file://" + testFile), mimeType);
        } finally {
            // NOTE(review): assumes the returned model no longer reads from the stream
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * This reads a classpath resource completely as UTF-8 text and closes its stream.
     *
     * @param resourceName name of the classpath resource
     *
     * @return a {@link String} with the resource content
     *
     * @throws IOException if there is an error when reading the resource
     */
    private String readResource(String resourceName) throws IOException {
        InputStream in = getResourceAsStream(resourceName);
        assertNotNull("failed to load resource " + resourceName, in);
        try {
            return IOUtils.toString(in, "utf-8");
        } finally {
            IOUtils.closeQuietly(in);
        }
    }

    /**
     * This logs (at debug level) the Stanbol Enhancer triples that would be created for
     * the metadata contained in the given model. A statement is counted only if subject,
     * predicate and object could all be converted to a non-{@code null} Clerezza resource;
     * other statements are logged as skipped.
     *
     * @param m a {@link Model}
     *
     * @return an {@code int} with the number of statements converted to triples
     */
    private int printTriples(Model m) {
        int tripleCounter = 0;
        // one map for the whole model, handed to every conversion call
        HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();
        ClosableIterator<Statement> it = m.iterator();
        try {
            while (it.hasNext()) {
                Statement oneStmt = it.next();
                NonLiteral subject =
                        (NonLiteral) MetaxaEngine.asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                UriRef predicate =
                        (UriRef) MetaxaEngine.asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                Resource object = MetaxaEngine.asClerezzaResource(oneStmt.getObject(), blankNodeMap);
                if (null != subject && null != predicate && null != object) {
                    Triple t = new TripleImpl(subject, predicate, object);
                    LOG.debug("adding {}", t);
                    tripleCounter++;
                } else {
                    LOG.debug("skipped {}", oneStmt);
                }
            }
        } finally {
            // release the iterator even if a conversion or cast fails
            it.close();
        }
        return tripleCounter;
    }

    /**
     * Cleanup strings for comparison, by removing all chars below the space character
     * (control chars such as tab, carriage return and line feed).
     *
     * @param txt a {@link String} with the text to clean
     *
     * @return a {@link String} with the result
     */
    private String cleanup(String txt) {
        final StringBuilder sb = new StringBuilder(txt.length());
        for (int i = 0; i < txt.length(); i++) {
            final char c = txt.charAt(i);
            if (c >= ' ') {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    /**
     * This opens a resource via the class loader of this test class.
     *
     * @param resourceName name of the classpath resource
     *
     * @return an {@link InputStream} for the resource, or {@code null} if it is not found
     */
    private InputStream getResourceAsStream(String resourceName) {
        return this.getClass().getClassLoader().getResourceAsStream(resourceName);
    }
}