enhancement-engines/htmlextractor/src/test/java/org/apache/stanbol/enhancer/engines/htmlextractor/TestHtmlExtractor.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.stanbol.enhancer.engines.htmlextractor;

 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;

 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Set;

 import org.apache.clerezza.commons.rdf.Graph;
 import org.apache.clerezza.commons.rdf.BlankNodeOrIRI;
 import org.apache.clerezza.commons.rdf.Triple;
 import org.apache.clerezza.commons.rdf.IRI;
 import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;
 import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ClerezzaRDFUtils;
 import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ExtractorException;
 import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractionRegistry;
 import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor;
 import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlParser;
 import org.apache.stanbol.enhancer.engines.htmlextractor.impl.InitializationException;
 import org.junit.BeforeClass;
 import org.junit.Test;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Tests for the {@link HtmlExtractor}: each test runs the extractor on an HTML
  * document loaded through the class loader and checks the resulting RDF graph.
  *
  * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
  *
  */
 public class TestHtmlExtractor {
     private static final Logger LOG = LoggerFactory.getLogger(TestHtmlExtractor.class);

     private static HtmlParser parser;

     private static HtmlExtractionRegistry registry;

     // define the Nepomuks NIE namespace locally here
     private static final String NIE_NS = "http://www.semanticdesktop.org/ontologies/2007/01/19/nie#";

     /**
      * Creates the extraction registry and the HTML parser shared by all tests.
      *
      * @throws IOException if the registry cannot be initialized; the original
      *         {@link InitializationException} is attached as the cause
      */
     @BeforeClass
     public static void oneTimeSetup() throws IOException {
         try {
             registry = new HtmlExtractionRegistry("htmlextractors.xml");
         } catch (InitializationException e) {
             // Hand the exception to the logger and chain it as the cause so the
             // stack trace of the underlying failure is not lost.
             LOG.error("Registry Initialization Error: " + e.getMessage(), e);
             throw new IOException(e.getMessage(), e);
         }
         parser = new HtmlParser();
     }

     /**
      * This tests the RDFa extraction.
      *
      * @throws Exception if extraction fails (e.g. with an
      *         {@link ExtractorException}) or the document cannot be read
      */
     @Test
     public void testRdfaExtraction() throws Exception {
         String testFile = "test-rdfa.html";
         Graph model = extractTriples(testFile, "RDFa triples: {}");
         assertEquals(8, model.size());
         ClerezzaRDFUtils.makeConnected(model, new IRI(documentUri(testFile)), new IRI(NIE_NS + "contains"));
     }

     /**
      * This tests some Microformat extraction.
      *
      * @throws Exception if extraction fails (e.g. with an
      *         {@link ExtractorException}) or the document cannot be read
      */
     @Test
     public void testMFExtraction() throws Exception {
         String testFile = "test-MF.html";
         Graph model = extractTriples(testFile, "Microformat triples: {}");
         assertEquals(127, model.size());
         ClerezzaRDFUtils.makeConnected(model, new IRI(documentUri(testFile)), new IRI(NIE_NS + "contains"));
     }

     /**
      * This tests some extraction of microdata from an HTML-5 document.
      *
      * @throws Exception if extraction fails or the document cannot be read
      */
     @Test
     public void testMicrodataExtraction() throws Exception {
         String testFile = "test-microdata.html";
         Graph model = extractTriples(testFile, "Microdata triples: {}");
         assertEquals(91, model.size());
         ClerezzaRDFUtils.makeConnected(model, new IRI(documentUri(testFile)), new IRI(NIE_NS + "contains"));
     }

     /**
      * This tests the merging of disconnected graphs under a single root.
      *
      * @throws Exception if extraction fails or the document cannot be read
      */
     @Test
     public void testRootExtraction() throws Exception {
         String testFile = "test-MultiRoot.html";
         Graph model = extractTriples(testFile, "Triples: {}");

         // the extracted graph is expected to start out with several roots ...
         Set<BlankNodeOrIRI> roots = ClerezzaRDFUtils.findRoots(model);
         assertTrue(roots.size() > 1);

         // ... and to have exactly one after connecting it to the document node
         ClerezzaRDFUtils.makeConnected(model, new IRI(documentUri(testFile)), new IRI(NIE_NS + "contains"));
         roots = ClerezzaRDFUtils.findRoots(model);
         assertEquals(1, roots.size());
     }

     /**
      * Runs the extractor on a test resource and returns the graph it filled.
      * The triple count and the triples themselves are logged at debug level.
      *
      * @param testFile name of the HTML resource to load through the class loader
      * @param countLogFormat SLF4J message pattern with one <code>{}</code>
      *        placeholder that receives the number of extracted triples
      * @return the graph holding the extracted triples
      * @throws Exception if extraction fails or the document cannot be read
      */
     private Graph extractTriples(String testFile, String countLogFormat) throws Exception {
         HtmlExtractor extractor = new HtmlExtractor(registry, parser);
         Graph model = new SimpleGraph();

         InputStream in = getResourceAsStream(testFile);
         assertNotNull("failed to load resource " + testFile, in);
         try {
             extractor.extract(documentUri(testFile), in, null, "text/html", model);
         } finally {
             // release the resource stream even when extraction fails
             in.close();
         }

         // show triples
         LOG.debug(countLogFormat, model.size());
         printTriples(model);
         return model;
     }

     /**
      * Builds the document URI string used for a test resource.
      *
      * @param testFile name of the test resource
      * @return the resource name prefixed with <code>file://</code>
      */
     private static String documentUri(String testFile) {
         return "file://" + testFile;
     }

     /**
      * Opens a resource through the class loader of this test class.
      *
      * @param testResultFile name of the resource
      * @return the stream as returned by the class loader; the tests treat
      *         <code>null</code> as "resource missing"
      */
     private InputStream getResourceAsStream(String testResultFile) {
         return this.getClass().getClassLoader().getResourceAsStream(
                 testResultFile);
     }

     /**
      * Logs every triple of the given graph at debug level.
      *
      * @param model the graph whose triples are logged
      */
     private void printTriples(Graph model) {
         // skip the iteration (and the string conversions) when nothing is logged
         if (!LOG.isDebugEnabled()) {
             return;
         }
         for (Triple t : model) {
             LOG.debug("{}", t);
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.stanbol.enhancer.engines.htmlextractor;

	import static org.junit.Assert.assertEquals;
	import static org.junit.Assert.assertNotNull;
	import static org.junit.Assert.assertTrue;

	import java.io.IOException;
	import java.io.InputStream;
	import java.util.Set;

	import org.apache.clerezza.commons.rdf.Graph;
	import org.apache.clerezza.commons.rdf.BlankNodeOrIRI;
	import org.apache.clerezza.commons.rdf.Triple;
	import org.apache.clerezza.commons.rdf.IRI;
	import org.apache.clerezza.commons.rdf.impl.utils.simple.SimpleGraph;
	import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ClerezzaRDFUtils;
	import org.apache.stanbol.enhancer.engines.htmlextractor.impl.ExtractorException;
	import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractionRegistry;
	import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlExtractor;
	import org.apache.stanbol.enhancer.engines.htmlextractor.impl.HtmlParser;
	import org.apache.stanbol.enhancer.engines.htmlextractor.impl.InitializationException;
	import org.junit.BeforeClass;
	import org.junit.Test;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	 * Tests for the {@link HtmlExtractor}: each test runs the extractor on an HTML
	 * document loaded through the class loader and checks the resulting RDF graph.
	 *
	 * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
	 *
	 */
	public class TestHtmlExtractor {
	    private static final Logger LOG = LoggerFactory.getLogger(TestHtmlExtractor.class);

	    private static HtmlParser parser;

	    private static HtmlExtractionRegistry registry;

	    // define the Nepomuks NIE namespace locally here
	    private static final String NIE_NS = "http://www.semanticdesktop.org/ontologies/2007/01/19/nie#";

	    /**
	     * Creates the extraction registry and the HTML parser shared by all tests.
	     *
	     * @throws IOException if the registry cannot be initialized; the original
	     *         {@link InitializationException} is attached as the cause
	     */
	    @BeforeClass
	    public static void oneTimeSetup() throws IOException {
	        try {
	            registry = new HtmlExtractionRegistry("htmlextractors.xml");
	        } catch (InitializationException e) {
	            // Hand the exception to the logger and chain it as the cause so the
	            // stack trace of the underlying failure is not lost.
	            LOG.error("Registry Initialization Error: " + e.getMessage(), e);
	            throw new IOException(e.getMessage(), e);
	        }
	        parser = new HtmlParser();
	    }

	    /**
	     * This tests the RDFa extraction.
	     *
	     * @throws Exception if extraction fails (e.g. with an
	     *         {@link ExtractorException}) or the document cannot be read
	     */
	    @Test
	    public void testRdfaExtraction() throws Exception {
	        String testFile = "test-rdfa.html";
	        Graph model = extractTriples(testFile, "RDFa triples: {}");
	        assertEquals(8, model.size());
	        ClerezzaRDFUtils.makeConnected(model, new IRI(documentUri(testFile)), new IRI(NIE_NS + "contains"));
	    }

	    /**
	     * This tests some Microformat extraction.
	     *
	     * @throws Exception if extraction fails (e.g. with an
	     *         {@link ExtractorException}) or the document cannot be read
	     */
	    @Test
	    public void testMFExtraction() throws Exception {
	        String testFile = "test-MF.html";
	        Graph model = extractTriples(testFile, "Microformat triples: {}");
	        assertEquals(127, model.size());
	        ClerezzaRDFUtils.makeConnected(model, new IRI(documentUri(testFile)), new IRI(NIE_NS + "contains"));
	    }

	    /**
	     * This tests some extraction of microdata from an HTML-5 document.
	     *
	     * @throws Exception if extraction fails or the document cannot be read
	     */
	    @Test
	    public void testMicrodataExtraction() throws Exception {
	        String testFile = "test-microdata.html";
	        Graph model = extractTriples(testFile, "Microdata triples: {}");
	        assertEquals(91, model.size());
	        ClerezzaRDFUtils.makeConnected(model, new IRI(documentUri(testFile)), new IRI(NIE_NS + "contains"));
	    }

	    /**
	     * This tests the merging of disconnected graphs under a single root.
	     *
	     * @throws Exception if extraction fails or the document cannot be read
	     */
	    @Test
	    public void testRootExtraction() throws Exception {
	        String testFile = "test-MultiRoot.html";
	        Graph model = extractTriples(testFile, "Triples: {}");

	        // the extracted graph is expected to start out with several roots ...
	        Set<BlankNodeOrIRI> roots = ClerezzaRDFUtils.findRoots(model);
	        assertTrue(roots.size() > 1);

	        // ... and to have exactly one after connecting it to the document node
	        ClerezzaRDFUtils.makeConnected(model, new IRI(documentUri(testFile)), new IRI(NIE_NS + "contains"));
	        roots = ClerezzaRDFUtils.findRoots(model);
	        assertEquals(1, roots.size());
	    }

	    /**
	     * Runs the extractor on a test resource and returns the graph it filled.
	     * The triple count and the triples themselves are logged at debug level.
	     *
	     * @param testFile name of the HTML resource to load through the class loader
	     * @param countLogFormat SLF4J message pattern with one <code>{}</code>
	     *        placeholder that receives the number of extracted triples
	     * @return the graph holding the extracted triples
	     * @throws Exception if extraction fails or the document cannot be read
	     */
	    private Graph extractTriples(String testFile, String countLogFormat) throws Exception {
	        HtmlExtractor extractor = new HtmlExtractor(registry, parser);
	        Graph model = new SimpleGraph();

	        InputStream in = getResourceAsStream(testFile);
	        assertNotNull("failed to load resource " + testFile, in);
	        try {
	            extractor.extract(documentUri(testFile), in, null, "text/html", model);
	        } finally {
	            // release the resource stream even when extraction fails
	            in.close();
	        }

	        // show triples
	        LOG.debug(countLogFormat, model.size());
	        printTriples(model);
	        return model;
	    }

	    /**
	     * Builds the document URI string used for a test resource.
	     *
	     * @param testFile name of the test resource
	     * @return the resource name prefixed with <code>file://</code>
	     */
	    private static String documentUri(String testFile) {
	        return "file://" + testFile;
	    }

	    /**
	     * Opens a resource through the class loader of this test class.
	     *
	     * @param testResultFile name of the resource
	     * @return the stream as returned by the class loader; the tests treat
	     *         <code>null</code> as "resource missing"
	     */
	    private InputStream getResourceAsStream(String testResultFile) {
	        return this.getClass().getClassLoader().getResourceAsStream(
	                testResultFile);
	    }

	    /**
	     * Logs every triple of the given graph at debug level.
	     *
	     * @param model the graph whose triples are logged
	     */
	    private void printTriples(Graph model) {
	        // skip the iteration (and the string conversions) when nothing is logged
	        if (!LOG.isDebugEnabled()) {
	            return;
	        }
	        for (Triple t : model) {
	            LOG.debug("{}", t);
	        }
	    }
	}