blob: bfcc641a21bfd50ca2417b3bcaf9c9477b0879c4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.metaxa.core.html;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import javax.xml.transform.TransformerFactory;
import org.ontoware.aifbcommons.collection.ClosableIterator;
import org.ontoware.rdf2go.model.Model;
import org.ontoware.rdf2go.model.Statement;
import org.ontoware.rdf2go.model.Syntax;
import org.ontoware.rdf2go.model.node.URI;
import org.ontoware.rdf2go.model.node.Variable;
import org.ontoware.rdf2go.util.RDFTool;
import org.semanticdesktop.aperture.extractor.ExtractorException;
import org.semanticdesktop.aperture.rdf.RDFContainer;
import org.semanticdesktop.aperture.vocabulary.NCO;
import org.semanticdesktop.aperture.vocabulary.NIE;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
/**
* Utility class that provides core HTML text and metadata extraction independent of the configuration of Metaxa's main HTML extractor
*
* @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
*
*/
public class HtmlTextExtractUtil {
private static final Logger LOG = LoggerFactory.getLogger(HtmlTextExtractUtil.class);
private static HtmlParser htmlParser = new HtmlParser();
private static XsltExtractor htmlExtractor;
public HtmlTextExtractUtil() throws InitializationException {
if (HtmlTextExtractUtil.htmlExtractor == null) {
TransformerFactory transFac = TransformerFactory.newInstance();
transFac.setURIResolver(new BundleURIResolver());
HtmlTextExtractUtil.htmlExtractor = new XsltExtractor("any", "xslt/htmlmetadata.xsl", transFac);
HtmlTextExtractUtil.htmlExtractor.setSyntax(Syntax.RdfXml);
}
}
public String getTitle(Model meta) {
Statement stmt = RDFTool.findStatement(meta, Variable.ANY, NIE.title, Variable.ANY);
if (stmt != null) {
return stmt.getObject().toString();
}
return null;
}
public String getAuthor(Model meta) {
Statement stmt = RDFTool.findStatement(meta, Variable.ANY, NCO.creator, Variable.ANY);
if (stmt != null) {
stmt = RDFTool.findStatement(meta, stmt.getSubject(), NCO.fullname, Variable.ANY);
if (stmt != null) {
return stmt.getObject().toString();
}
}
return null;
}
public String getDescription(Model meta) {
Statement stmt = RDFTool.findStatement(meta, Variable.ANY, NIE.description, Variable.ANY);
if (stmt != null) {
return stmt.getObject().toString();
}
return null;
}
public List<String> getKeywords(Model meta) {
List<String> kws = new ArrayList<String>();
ClosableIterator<Statement> it = meta.findStatements(Variable.ANY, NIE.keyword, Variable.ANY);
while (it.hasNext()) {
kws.add(it.next().getObject().toString());
}
it.close();
return kws;
}
public String getText(Model meta) {
Statement stmt = RDFTool.findStatement(meta, Variable.ANY, NIE.plainTextContent, Variable.ANY);
if (stmt != null) {
return stmt.getObject().toString();
}
return null;
}
public void extract(URI id, String charset, InputStream input, RDFContainer result) throws ExtractorException {
String encoding = charset;
if (charset == null) {
try {
encoding = CharsetRecognizer.detect(input, "html", null);
} catch (IOException e) {
LOG.error("Charset detection problem: " + e.getMessage());
throw new ExtractorException("Charset detection problem: " + e.getMessage());
}
}
Document doc = htmlParser.getDOM(input, encoding);
htmlExtractor.extract(id.toString(), doc, null, result);
}
}