enhancement-engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/core/html/HtmlTextExtractUtil.java - stanbol - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.stanbol.enhancer.engines.metaxa.core.html;

 import java.io.IOException;
 import java.io.InputStream;
 import java.util.ArrayList;
 import java.util.List;

 import javax.xml.transform.TransformerFactory;

 import org.ontoware.aifbcommons.collection.ClosableIterator;
 import org.ontoware.rdf2go.model.Model;
 import org.ontoware.rdf2go.model.Statement;
 import org.ontoware.rdf2go.model.Syntax;
 import org.ontoware.rdf2go.model.node.URI;
 import org.ontoware.rdf2go.model.node.Variable;
 import org.ontoware.rdf2go.util.RDFTool;
 import org.semanticdesktop.aperture.extractor.ExtractorException;
 import org.semanticdesktop.aperture.rdf.RDFContainer;
 import org.semanticdesktop.aperture.vocabulary.NCO;
 import org.semanticdesktop.aperture.vocabulary.NIE;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;

 /**
  * Utility class that provides core HTML text and metadata extraction independent of the configuration of Metaxa's main HTML extractor
  *
  * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
  *
  */

 public class HtmlTextExtractUtil {
     private static final Logger LOG = LoggerFactory.getLogger(HtmlTextExtractUtil.class);

     /** Parser used to turn the HTML input stream into a DOM; shared by all instances. */
     private static final HtmlParser htmlParser = new HtmlParser();

     /** XSLT-based extractor shared by all instances; created by the first constructor call. */
     private static XsltExtractor htmlExtractor;

     /**
      * Creates the utility and, on first use, sets up the shared XSLT extractor
      * from the <code>xslt/htmlmetadata.xsl</code> stylesheet with RDF/XML as its syntax.
      *
      * @throws InitializationException if the shared extractor cannot be set up
      */
     public HtmlTextExtractUtil() throws InitializationException {
         // The extractor is static shared state that is initialised lazily, so the
         // check-then-create sequence is guarded by the class lock.
         synchronized (HtmlTextExtractUtil.class) {
             if (htmlExtractor == null) {
                 TransformerFactory transFac = TransformerFactory.newInstance();
                 transFac.setURIResolver(new BundleURIResolver());
                 XsltExtractor extractor = new XsltExtractor("any", "xslt/htmlmetadata.xsl", transFac);
                 extractor.setSyntax(Syntax.RdfXml);
                 // Publish only after configuration is complete, so the shared field
                 // never refers to an extractor whose syntax has not been set.
                 htmlExtractor = extractor;
             }
         }
     }

     /**
      * Looks up a <code>nie:title</code> statement in the given model.
      *
      * @param meta the model holding the extracted metadata
      * @return the string form of the statement's object, or <code>null</code> if no such statement is found
      */
     public String getTitle(Model meta) {
         return objectAsString(RDFTool.findStatement(meta, Variable.ANY, NIE.title, Variable.ANY));
     }

     /**
      * Looks up an <code>nco:creator</code> statement and then an <code>nco:fullname</code>
      * statement that has the same subject as the creator statement.
      *
      * @param meta the model holding the extracted metadata
      * @return the string form of the full name, or <code>null</code> if either statement is missing
      */
     public String getAuthor(Model meta) {
         Statement creator = RDFTool.findStatement(meta, Variable.ANY, NCO.creator, Variable.ANY);
         if (creator == null) {
             return null;
         }
         // NOTE(review): the full name is looked up on the SUBJECT of the creator statement,
         // not on its object - confirm this matches the RDF produced by the stylesheet.
         return objectAsString(RDFTool.findStatement(meta, creator.getSubject(), NCO.fullname, Variable.ANY));
     }

     /**
      * Looks up a <code>nie:description</code> statement in the given model.
      *
      * @param meta the model holding the extracted metadata
      * @return the string form of the statement's object, or <code>null</code> if no such statement is found
      */
     public String getDescription(Model meta) {
         return objectAsString(RDFTool.findStatement(meta, Variable.ANY, NIE.description, Variable.ANY));
     }

     /**
      * Collects the objects of all <code>nie:keyword</code> statements in the given model.
      *
      * @param meta the model holding the extracted metadata
      * @return the string forms of all keyword objects; empty (never <code>null</code>) if there are none
      */
     public List<String> getKeywords(Model meta) {
         List<String> kws = new ArrayList<String>();
         ClosableIterator<Statement> it = meta.findStatements(Variable.ANY, NIE.keyword, Variable.ANY);
         try {
             while (it.hasNext()) {
                 kws.add(it.next().getObject().toString());
             }
         } finally {
             // Release the iterator even when iteration fails part-way through.
             it.close();
         }
         return kws;
     }

     /**
      * Looks up a <code>nie:plainTextContent</code> statement in the given model.
      *
      * @param meta the model holding the extracted metadata
      * @return the string form of the statement's object, or <code>null</code> if no such statement is found
      */
     public String getText(Model meta) {
         return objectAsString(RDFTool.findStatement(meta, Variable.ANY, NIE.plainTextContent, Variable.ANY));
     }

     /**
      * Parses the HTML input into a DOM and runs the shared XSLT extractor on it,
      * adding the outcome to <code>result</code>.
      *
      * @param id identifier of the document; its string form is handed to the extractor
      * @param charset the character encoding of the input, or <code>null</code> to have it detected
      * @param input the HTML content
      * @param result the container that receives the extracted data
      * @throws ExtractorException if charset detection fails with an I/O error
      *         (the I/O error is attached as the cause), or if extraction itself fails
      */
     public void extract(URI id, String charset, InputStream input, RDFContainer result) throws ExtractorException {
         String encoding = charset;
         if (charset == null) {
             try {
                 // NOTE(review): detection reads from the same stream that is parsed below;
                 // presumably the recognizer leaves the stream position intact - confirm.
                 encoding = CharsetRecognizer.detect(input, "html", null);
             } catch (IOException e) {
                 // Keep the original exception in both the log and the rethrown exception
                 // so the root cause is not lost.
                 LOG.error("Charset detection problem: " + e.getMessage(), e);
                 throw new ExtractorException("Charset detection problem: " + e.getMessage(), e);
             }
         }
         Document doc = htmlParser.getDOM(input, encoding);
         htmlExtractor.extract(id.toString(), doc, null, result);
     }

     /** Returns the string form of the statement's object, or <code>null</code> for a <code>null</code> statement. */
     private static String objectAsString(Statement stmt) {
         return stmt == null ? null : stmt.getObject().toString();
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.stanbol.enhancer.engines.metaxa.core.html;

	import java.io.IOException;
	import java.io.InputStream;
	import java.util.ArrayList;
	import java.util.List;

	import javax.xml.transform.TransformerFactory;

	import org.ontoware.aifbcommons.collection.ClosableIterator;
	import org.ontoware.rdf2go.model.Model;
	import org.ontoware.rdf2go.model.Statement;
	import org.ontoware.rdf2go.model.Syntax;
	import org.ontoware.rdf2go.model.node.URI;
	import org.ontoware.rdf2go.model.node.Variable;
	import org.ontoware.rdf2go.util.RDFTool;
	import org.semanticdesktop.aperture.extractor.ExtractorException;
	import org.semanticdesktop.aperture.rdf.RDFContainer;
	import org.semanticdesktop.aperture.vocabulary.NCO;
	import org.semanticdesktop.aperture.vocabulary.NIE;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.w3c.dom.Document;

	/**
	* Utility class that provides core HTML text and metadata extraction independent of the configuration of Metaxa's main HTML extractor
	*
	* @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
	*
	*/

	public class HtmlTextExtractUtil {
		private static final Logger LOG = LoggerFactory.getLogger(HtmlTextExtractUtil.class);

		/** Parser used to turn the HTML input stream into a DOM; shared by all instances. */
		private static final HtmlParser htmlParser = new HtmlParser();

		/** XSLT-based extractor shared by all instances; created by the first constructor call. */
		private static XsltExtractor htmlExtractor;

		/**
		 * Creates the utility and, on first use, sets up the shared XSLT extractor
		 * from the <code>xslt/htmlmetadata.xsl</code> stylesheet with RDF/XML as its syntax.
		 *
		 * @throws InitializationException if the shared extractor cannot be set up
		 */
		public HtmlTextExtractUtil() throws InitializationException {
			// The extractor is static shared state that is initialised lazily, so the
			// check-then-create sequence is guarded by the class lock.
			synchronized (HtmlTextExtractUtil.class) {
				if (htmlExtractor == null) {
					TransformerFactory transFac = TransformerFactory.newInstance();
					transFac.setURIResolver(new BundleURIResolver());
					XsltExtractor extractor = new XsltExtractor("any", "xslt/htmlmetadata.xsl", transFac);
					extractor.setSyntax(Syntax.RdfXml);
					// Publish only after configuration is complete, so the shared field
					// never refers to an extractor whose syntax has not been set.
					htmlExtractor = extractor;
				}
			}
		}

		/**
		 * Looks up a <code>nie:title</code> statement in the given model.
		 *
		 * @param meta the model holding the extracted metadata
		 * @return the string form of the statement's object, or <code>null</code> if no such statement is found
		 */
		public String getTitle(Model meta) {
			return objectAsString(RDFTool.findStatement(meta, Variable.ANY, NIE.title, Variable.ANY));
		}

		/**
		 * Looks up an <code>nco:creator</code> statement and then an <code>nco:fullname</code>
		 * statement that has the same subject as the creator statement.
		 *
		 * @param meta the model holding the extracted metadata
		 * @return the string form of the full name, or <code>null</code> if either statement is missing
		 */
		public String getAuthor(Model meta) {
			Statement creator = RDFTool.findStatement(meta, Variable.ANY, NCO.creator, Variable.ANY);
			if (creator == null) {
				return null;
			}
			// NOTE(review): the full name is looked up on the SUBJECT of the creator statement,
			// not on its object - confirm this matches the RDF produced by the stylesheet.
			return objectAsString(RDFTool.findStatement(meta, creator.getSubject(), NCO.fullname, Variable.ANY));
		}

		/**
		 * Looks up a <code>nie:description</code> statement in the given model.
		 *
		 * @param meta the model holding the extracted metadata
		 * @return the string form of the statement's object, or <code>null</code> if no such statement is found
		 */
		public String getDescription(Model meta) {
			return objectAsString(RDFTool.findStatement(meta, Variable.ANY, NIE.description, Variable.ANY));
		}

		/**
		 * Collects the objects of all <code>nie:keyword</code> statements in the given model.
		 *
		 * @param meta the model holding the extracted metadata
		 * @return the string forms of all keyword objects; empty (never <code>null</code>) if there are none
		 */
		public List<String> getKeywords(Model meta) {
			List<String> kws = new ArrayList<String>();
			ClosableIterator<Statement> it = meta.findStatements(Variable.ANY, NIE.keyword, Variable.ANY);
			try {
				while (it.hasNext()) {
					kws.add(it.next().getObject().toString());
				}
			} finally {
				// Release the iterator even when iteration fails part-way through.
				it.close();
			}
			return kws;
		}

		/**
		 * Looks up a <code>nie:plainTextContent</code> statement in the given model.
		 *
		 * @param meta the model holding the extracted metadata
		 * @return the string form of the statement's object, or <code>null</code> if no such statement is found
		 */
		public String getText(Model meta) {
			return objectAsString(RDFTool.findStatement(meta, Variable.ANY, NIE.plainTextContent, Variable.ANY));
		}

		/**
		 * Parses the HTML input into a DOM and runs the shared XSLT extractor on it,
		 * adding the outcome to <code>result</code>.
		 *
		 * @param id identifier of the document; its string form is handed to the extractor
		 * @param charset the character encoding of the input, or <code>null</code> to have it detected
		 * @param input the HTML content
		 * @param result the container that receives the extracted data
		 * @throws ExtractorException if charset detection fails with an I/O error
		 *         (the I/O error is attached as the cause), or if extraction itself fails
		 */
		public void extract(URI id, String charset, InputStream input, RDFContainer result) throws ExtractorException {
			String encoding = charset;
			if (charset == null) {
				try {
					// NOTE(review): detection reads from the same stream that is parsed below;
					// presumably the recognizer leaves the stream position intact - confirm.
					encoding = CharsetRecognizer.detect(input, "html", null);
				} catch (IOException e) {
					// Keep the original exception in both the log and the rethrown exception
					// so the root cause is not lost.
					LOG.error("Charset detection problem: " + e.getMessage(), e);
					throw new ExtractorException("Charset detection problem: " + e.getMessage(), e);
				}
			}
			Document doc = htmlParser.getDOM(input, encoding);
			htmlExtractor.extract(id.toString(), doc, null, result);
		}

		/** Returns the string form of the statement's object, or <code>null</code> for a <code>null</code> statement. */
		private static String objectAsString(Statement stmt) {
			return stmt == null ? null : stmt.getObject().toString();
		}

	}