enhancement-engines/metaxa/src/test/java/org/apache/stanbol/enhancer/engines/metaxa/core/TestMetaxaCore.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.stanbol.enhancer.engines.metaxa.core;

 import java.io.IOException;
 import java.io.InputStream;
 import java.util.HashMap;

 import org.apache.clerezza.rdf.core.BNode;
 import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.commons.io.IOUtils;
 import org.apache.stanbol.enhancer.engines.metaxa.MetaxaEngine;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.ontoware.aifbcommons.collection.ClosableIterator;
 import org.ontoware.rdf2go.model.Model;
 import org.ontoware.rdf2go.model.Statement;
 import org.ontoware.rdf2go.model.node.BlankNode;
 import org.ontoware.rdf2go.model.node.Variable;
 import org.ontoware.rdf2go.model.node.impl.URIImpl;
 import org.semanticdesktop.aperture.extractor.ExtractorException;
 import org.semanticdesktop.aperture.vocabulary.NMO;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;


 /**
  * {@link TestMetaxaCore} is a test class for {@link MetaxaCore}.
  *
  * @author Joerg Steffen, DFKI
  * @version $Id$
  */
 public class TestMetaxaCore {

     /**
      * This contains the logger.
      */
     private static final Logger LOG = LoggerFactory.getLogger(TestMetaxaCore.class);

     /**
      * This contains the Metaxa extractor to test.
      */
     private static MetaxaCore extractor;


     /**
      * This creates the shared {@link MetaxaCore} instance, configured with
      * {@code extractionregistry.xml}, once before any test runs.
      *
      * @throws IOException if thrown by the {@link MetaxaCore} constructor
      */
     @BeforeClass
     public static void oneTimeSetUp() throws IOException {
         extractor = new MetaxaCore("extractionregistry.xml");
     }

     /**
      * This tests the pdf extraction: the text extracted from {@code test.pdf} must match
      * {@code pdf-res.txt} and the model must yield 11 triples.
      *
      * @throws Exception if extraction or reading of a test resource fails
      */
     @Test
     public void testPdfExtraction() throws Exception {
         assertExtraction("test.pdf", "pdf-res.txt", "application/pdf", 11);
     }

     /**
      * This tests the html extraction: the text extracted from {@code test.html} must match
      * {@code html-res.txt} and the model must yield 28 triples.
      *
      * @throws Exception if extraction or reading of a test resource fails
      */
     @Test
     public void testHtmlExtraction() throws Exception {
         assertExtraction("test.html", "html-res.txt", "text/html", 28);
     }

     /**
      * This tests the extraction from RDFa annotated html: the text extracted from
      * {@code test-rdfa.html} must match {@code rdfa-res.txt} and the model must yield 10 triples.
      *
      * @throws Exception if extraction or reading of a test resource fails
      */
     @Test
     public void testRdfaExtraction() throws Exception {
         assertExtraction("test-rdfa.html", "rdfa-res.txt", "text/html", 10);
     }

     /**
      * This tests the mail extraction: the model extracted from
      * {@code mail-multipart-test.eml} must contain at least one statement with the predicate
      * {@link NMO#plainTextMessageContent}.
      *
      * @throws Exception if extraction or reading of the test resource fails
      */
     @Test
     public void testMailExtraction() throws Exception {
         Model m = extract("mail-multipart-test.eml", "message/rfc822");
         boolean textContained = m.contains(Variable.ANY, NMO.plainTextMessageContent, Variable.ANY);
         assertTrue("no plain text message content found in extracted model", textContained);
     }

     /**
      * This runs the checks shared by the text extraction tests: the extracted text must equal
      * the expected text (both passed through {@link #cleanup(String)}), and the number of
      * triples reported by {@link #printTriples(Model)} must equal the expected count.
      *
      * @param testFile a {@link String} with the classpath name of the document to extract
      * @param testResultFile a {@link String} with the classpath name of the expected text
      * @param mimeType a {@link String} with the mime type passed to the extractor
      * @param expectedTriples an {@code int} with the expected number of triples
      *
      * @throws ExtractorException if there is an error during extraction
      * @throws IOException if there is an error when reading the expected result
      */
     private void assertExtraction(String testFile, String testResultFile, String mimeType,
             int expectedTriples) throws Exception {

         Model m = extract(testFile, mimeType);
         String text = MetaxaCore.getText(m);

         // get expected result
         InputStream expectedIn = getResourceAsStream(testResultFile);
         assertNotNull("failed to load resource " + testResultFile, expectedIn);
         String expectedText;
         try {
             expectedText = IOUtils.toString(expectedIn, "utf-8");
         } finally {
             // the stream is fully read (or reading failed); release it in both cases
             IOUtils.closeQuietly(expectedIn);
         }

         // test
         assertEquals(cleanup(expectedText), cleanup(text));

         // show triples
         assertEquals(expectedTriples, printTriples(m));
     }

     /**
      * This loads the given test document from the classpath and runs the extractor on it,
      * using {@code "file://" + testFile} as the document URI.
      *
      * @param testFile a {@link String} with the classpath name of the document to extract
      * @param mimeType a {@link String} with the mime type passed to the extractor
      *
      * @return the {@link Model} returned by the extractor
      *
      * @throws Exception if thrown by the extractor
      */
     private Model extract(String testFile, String mimeType) throws Exception {
         InputStream in = getResourceAsStream(testFile);
         assertNotNull("failed to load resource " + testFile, in);
         try {
             return extractor.extract(in, new URIImpl("file://" + testFile), mimeType);
         } finally {
             // NOTE(review): assumes the extractor has finished reading the stream when
             // extract() returns - confirm against MetaxaCore
             IOUtils.closeQuietly(in);
         }
     }

     /**
      * This logs (at debug level) the Stanbol Enhancer triples that would be created for the
      * metadata contained in the given model. A statement is counted only if its subject,
      * predicate and object all convert to a non-null Clerezza resource; other statements are
      * logged as skipped.
      *
      * @param m a {@link Model}
      *
      * @return an {@code int} with the number of added triples
      */
     private int printTriples(Model m) {

         int tripleCounter = 0;

         HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();

         ClosableIterator<Statement> it = m.iterator();
         try {
             while (it.hasNext()) {
                 Statement oneStmt = it.next();

                 NonLiteral subject =
                         (NonLiteral) MetaxaEngine.asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                 UriRef predicate =
                         (UriRef) MetaxaEngine.asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
                 Resource object = MetaxaEngine.asClerezzaResource(oneStmt.getObject(), blankNodeMap);

                 if (null != subject
                         && null != predicate
                         && null != object) {
                     Triple t =
                             new TripleImpl(subject, predicate, object);
                     LOG.debug("adding {}", t);
                     tripleCounter++;
                 } else {
                     LOG.debug("skipped {}", oneStmt);
                 }
             }
         } finally {
             // close the iterator even if a conversion above throws
             it.close();
         }

         return tripleCounter;
     }

     /**
      * Cleanup strings for comparison, by removing every char below the space character
      * (for example line breaks and tabs).
      *
      * @param txt a {@link String} with the text to clean
      *
      * @return a {@link String} with the result
      */
     private String cleanup(String txt) {
         final StringBuilder sb = new StringBuilder(txt.length());
         for (int i = 0; i < txt.length(); i++) {
             final char c = txt.charAt(i);
             if (c >= ' ') {
                 sb.append(c);
             }
         }
         return sb.toString();
     }

     /**
      * This opens the named resource through the class loader of this test class.
      *
      * @param testResultFile a {@link String} with the classpath name of the resource
      *
      * @return an {@link InputStream} for the resource, or {@code null} if it is not found
      */
     private InputStream getResourceAsStream(String testResultFile) {
         return this.getClass().getClassLoader().getResourceAsStream(
                 testResultFile);
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.engines.metaxa.core;

	import java.io.IOException;
	import java.io.InputStream;
	import java.util.HashMap;

	import org.apache.clerezza.rdf.core.BNode;
	import org.apache.clerezza.rdf.core.NonLiteral;
	import org.apache.clerezza.rdf.core.Resource;
	import org.apache.clerezza.rdf.core.Triple;
	import org.apache.clerezza.rdf.core.UriRef;
	import org.apache.clerezza.rdf.core.impl.TripleImpl;
	import org.apache.commons.io.IOUtils;
	import org.apache.stanbol.enhancer.engines.metaxa.MetaxaEngine;
	import org.junit.BeforeClass;
	import org.junit.Test;
	import org.ontoware.aifbcommons.collection.ClosableIterator;
	import org.ontoware.rdf2go.model.Model;
	import org.ontoware.rdf2go.model.Statement;
	import org.ontoware.rdf2go.model.node.BlankNode;
	import org.ontoware.rdf2go.model.node.Variable;
	import org.ontoware.rdf2go.model.node.impl.URIImpl;
	import org.semanticdesktop.aperture.extractor.ExtractorException;
	import org.semanticdesktop.aperture.vocabulary.NMO;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import static org.junit.Assert.assertEquals;
	import static org.junit.Assert.assertNotNull;
	import static org.junit.Assert.assertTrue;


	/**
	* {@link TestMetaxaCore} is a test class for {@link MetaxaCore}.
	*
	* @author Joerg Steffen, DFKI
	* @version $Id$
	*/
	public class TestMetaxaCore {

		/**
		 * This contains the logger.
		 */
		private static final Logger LOG = LoggerFactory.getLogger(TestMetaxaCore.class);

		/**
		 * This contains the Metaxa extractor to test.
		 */
		private static MetaxaCore extractor;


		/**
		 * This creates the shared {@link MetaxaCore} instance, configured with
		 * {@code extractionregistry.xml}, once before any test runs.
		 *
		 * @throws IOException if thrown by the {@link MetaxaCore} constructor
		 */
		@BeforeClass
		public static void oneTimeSetUp() throws IOException {
			extractor = new MetaxaCore("extractionregistry.xml");
		}

		/**
		 * This tests the pdf extraction: the text extracted from {@code test.pdf} must match
		 * {@code pdf-res.txt} and the model must yield 11 triples.
		 *
		 * @throws Exception if extraction or reading of a test resource fails
		 */
		@Test
		public void testPdfExtraction() throws Exception {
			assertExtraction("test.pdf", "pdf-res.txt", "application/pdf", 11);
		}

		/**
		 * This tests the html extraction: the text extracted from {@code test.html} must match
		 * {@code html-res.txt} and the model must yield 28 triples.
		 *
		 * @throws Exception if extraction or reading of a test resource fails
		 */
		@Test
		public void testHtmlExtraction() throws Exception {
			assertExtraction("test.html", "html-res.txt", "text/html", 28);
		}

		/**
		 * This tests the extraction from RDFa annotated html: the text extracted from
		 * {@code test-rdfa.html} must match {@code rdfa-res.txt} and the model must yield 10 triples.
		 *
		 * @throws Exception if extraction or reading of a test resource fails
		 */
		@Test
		public void testRdfaExtraction() throws Exception {
			assertExtraction("test-rdfa.html", "rdfa-res.txt", "text/html", 10);
		}

		/**
		 * This tests the mail extraction: the model extracted from
		 * {@code mail-multipart-test.eml} must contain at least one statement with the predicate
		 * {@link NMO#plainTextMessageContent}.
		 *
		 * @throws Exception if extraction or reading of the test resource fails
		 */
		@Test
		public void testMailExtraction() throws Exception {
			Model m = extract("mail-multipart-test.eml", "message/rfc822");
			boolean textContained = m.contains(Variable.ANY, NMO.plainTextMessageContent, Variable.ANY);
			assertTrue("no plain text message content found in extracted model", textContained);
		}

		/**
		 * This runs the checks shared by the text extraction tests: the extracted text must equal
		 * the expected text (both passed through {@link #cleanup(String)}), and the number of
		 * triples reported by {@link #printTriples(Model)} must equal the expected count.
		 *
		 * @param testFile a {@link String} with the classpath name of the document to extract
		 * @param testResultFile a {@link String} with the classpath name of the expected text
		 * @param mimeType a {@link String} with the mime type passed to the extractor
		 * @param expectedTriples an {@code int} with the expected number of triples
		 *
		 * @throws ExtractorException if there is an error during extraction
		 * @throws IOException if there is an error when reading the expected result
		 */
		private void assertExtraction(String testFile, String testResultFile, String mimeType,
				int expectedTriples) throws Exception {

			Model m = extract(testFile, mimeType);
			String text = MetaxaCore.getText(m);

			// get expected result
			InputStream expectedIn = getResourceAsStream(testResultFile);
			assertNotNull("failed to load resource " + testResultFile, expectedIn);
			String expectedText;
			try {
				expectedText = IOUtils.toString(expectedIn, "utf-8");
			} finally {
				// the stream is fully read (or reading failed); release it in both cases
				IOUtils.closeQuietly(expectedIn);
			}

			// test
			assertEquals(cleanup(expectedText), cleanup(text));

			// show triples
			assertEquals(expectedTriples, printTriples(m));
		}

		/**
		 * This loads the given test document from the classpath and runs the extractor on it,
		 * using {@code "file://" + testFile} as the document URI.
		 *
		 * @param testFile a {@link String} with the classpath name of the document to extract
		 * @param mimeType a {@link String} with the mime type passed to the extractor
		 *
		 * @return the {@link Model} returned by the extractor
		 *
		 * @throws Exception if thrown by the extractor
		 */
		private Model extract(String testFile, String mimeType) throws Exception {
			InputStream in = getResourceAsStream(testFile);
			assertNotNull("failed to load resource " + testFile, in);
			try {
				return extractor.extract(in, new URIImpl("file://" + testFile), mimeType);
			} finally {
				// NOTE(review): assumes the extractor has finished reading the stream when
				// extract() returns - confirm against MetaxaCore
				IOUtils.closeQuietly(in);
			}
		}

		/**
		 * This logs (at debug level) the Stanbol Enhancer triples that would be created for the
		 * metadata contained in the given model. A statement is counted only if its subject,
		 * predicate and object all convert to a non-null Clerezza resource; other statements are
		 * logged as skipped.
		 *
		 * @param m a {@link Model}
		 *
		 * @return an {@code int} with the number of added triples
		 */
		private int printTriples(Model m) {

			int tripleCounter = 0;

			HashMap<BlankNode, BNode> blankNodeMap = new HashMap<BlankNode, BNode>();

			ClosableIterator<Statement> it = m.iterator();
			try {
				while (it.hasNext()) {
					Statement oneStmt = it.next();

					NonLiteral subject =
							(NonLiteral) MetaxaEngine.asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
					UriRef predicate =
							(UriRef) MetaxaEngine.asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
					Resource object = MetaxaEngine.asClerezzaResource(oneStmt.getObject(), blankNodeMap);

					if (null != subject
							&& null != predicate
							&& null != object) {
						Triple t =
								new TripleImpl(subject, predicate, object);
						LOG.debug("adding {}", t);
						tripleCounter++;
					} else {
						LOG.debug("skipped {}", oneStmt);
					}
				}
			} finally {
				// close the iterator even if a conversion above throws
				it.close();
			}

			return tripleCounter;
		}

		/**
		 * Cleanup strings for comparison, by removing every char below the space character
		 * (for example line breaks and tabs).
		 *
		 * @param txt a {@link String} with the text to clean
		 *
		 * @return a {@link String} with the result
		 */
		private String cleanup(String txt) {
			final StringBuilder sb = new StringBuilder(txt.length());
			for (int i = 0; i < txt.length(); i++) {
				final char c = txt.charAt(i);
				if (c >= ' ') {
					sb.append(c);
				}
			}
			return sb.toString();
		}

		/**
		 * This opens the named resource through the class loader of this test class.
		 *
		 * @param testResultFile a {@link String} with the classpath name of the resource
		 *
		 * @return an {@link InputStream} for the resource, or {@code null} if it is not found
		 */
		private InputStream getResourceAsStream(String testResultFile) {
			return this.getClass().getClassLoader().getResourceAsStream(
					testResultFile);
		}

	}