blob: ccc89cf266679b7dcefbf854de1bbf1716d23ae9 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.metaxa.core.html;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.ontoware.rdf2go.model.Syntax;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
/**
 * Registry of {@link HtmlExtractionComponent}s that is populated from an XML
 * configuration document.
 * <p>
 * The configuration is expected to contain <code>extractor</code> elements
 * below a <code>htmlextractors</code> root element. Each extractor carries an
 * <code>id</code> attribute and a <code>source</code> child element whose
 * <code>type</code> attribute is either <code>xslt</code> or
 * <code>java</code>:
 * <ul>
 * <li><code>xslt</code>: the text of the <code>source</code> element is passed
 * to {@link XsltExtractor} as file name. The optional <code>syntax</code>
 * attribute (<code>rdfxml</code>, <code>turtle</code>, <code>ntriple</code> or
 * <code>n3</code>, case-insensitive, default <code>rdfxml</code>) selects the
 * RDF syntax and the optional <code>uri</code> attribute is handed to
 * {@link XsltExtractor#setUriParameter}.</li>
 * <li><code>java</code>: the text of the <code>source</code> element is the
 * name of a class that has a no-argument constructor and implements
 * {@link HtmlExtractionComponent}.</li>
 * </ul>
 * Every extractor that is registered is also added to the set of active
 * extractors.
 *
 * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
 */
public class HtmlExtractionRegistry {

    /**
     * This contains the logger.
     */
    private static final Logger LOG =
        LoggerFactory.getLogger(HtmlExtractionRegistry.class);

    /** Registered extraction components, keyed by their configured id. */
    private HashMap<String, HtmlExtractionComponent> registry;

    /** Ids of the extraction components that are marked as active. */
    private HashSet<String> activeExtractors;

    /**
     * Creates an empty registry without any registered or active extractors.
     */
    public HtmlExtractionRegistry() {
        registry = new HashMap<String, HtmlExtractionComponent>();
        activeExtractors = new HashSet<String>();
    }

    /**
     * Creates a registry from a configuration that is loaded as a resource
     * through the class loader of this class.
     *
     * @param configFileName
     *            resource name of the XML configuration
     * @throws InitializationException
     *             if the resource cannot be found or the configuration cannot
     *             be processed
     */
    public HtmlExtractionRegistry(String configFileName)
            throws InitializationException {
        this();
        InputStream config =
            getClass().getClassLoader().getResourceAsStream(configFileName);
        if (config == null) {
            throw new InitializationException("File not found: " + configFileName);
        }
        try {
            initialize(config);
        } finally {
            // This constructor opened the stream, so it is responsible for
            // releasing it again, whether or not initialization succeeded.
            try {
                config.close();
            } catch (IOException e) {
                LOG.warn("Unable to close configuration resource: "
                    + configFileName, e);
            }
        }
    }

    /**
     * Creates a registry from a configuration that is read from the given
     * stream. The stream is not closed explicitly by this constructor.
     *
     * @param config
     *            stream providing the XML configuration
     * @throws InitializationException
     *             if the configuration cannot be processed
     */
    public HtmlExtractionRegistry(InputStream config) throws InitializationException {
        this();
        initialize(config);
    }

    /**
     * Parses the XML configuration and registers and activates every
     * extractor described in it. Extractor elements without a
     * <code>source</code> child are skipped; a <code>source</code> with an
     * unsupported <code>type</code> is only reported with a warning.
     *
     * @param configFileStream
     *            stream providing the XML configuration
     * @throws InitializationException
     *             if the document cannot be read or parsed, a required
     *             attribute is missing, an unknown RDF syntax is configured,
     *             or a configured Java class cannot be loaded, instantiated or
     *             is not an {@link HtmlExtractionComponent}
     */
    public void initialize(InputStream configFileStream)
            throws InitializationException {
        try {
            XPath xPath = XPathFactory.newInstance().newXPath();
            DocumentBuilder parser =
                DocumentBuilderFactory.newInstance().newDocumentBuilder();
            Document document = parser.parse(new InputSource(configFileStream));
            NodeList nodes = (NodeList) xPath.evaluate(
                "/htmlextractors/extractor", document, XPathConstants.NODESET);
            if (nodes == null) {
                return;
            }
            // One transformer factory is shared by all XSLT extractors.
            TransformerFactory transFac = TransformerFactory.newInstance();
            transFac.setURIResolver(new BundleURIResolver());
            for (int j = 0, iCnt = nodes.getLength(); j < iCnt; j++) {
                Node extractorNode = nodes.item(j);
                String id = requiredAttribute(xPath, extractorNode, "id",
                    "extractor element");
                Node srcNode =
                    (Node) xPath.evaluate("source", extractorNode, XPathConstants.NODE);
                if (srcNode == null) {
                    continue;
                }
                String srcType = requiredAttribute(xPath, srcNode, "type",
                    "source element of extractor " + id);
                if (srcType.equals("xslt")) {
                    Syntax rdfSyntax = Syntax.RdfXml;
                    Node syntaxNode =
                        (Node) xPath.evaluate("@syntax", srcNode, XPathConstants.NODE);
                    if (syntaxNode != null) {
                        rdfSyntax = resolveSyntax(syntaxNode.getNodeValue(), id);
                    }
                    // TODO: do something about disjunctions of
                    // Extractors? Assume, only RDFa or Microformats are
                    // used?
                    String fileName = DOMUtils.getText(srcNode);
                    XsltExtractor xsltExtractor =
                        new XsltExtractor(id, fileName, transFac);
                    xsltExtractor.setSyntax(rdfSyntax);
                    // name of URI/URL parameter of the script (default
                    // "uri")
                    Node uriNode =
                        (Node) xPath.evaluate("@uri", srcNode, XPathConstants.NODE);
                    if (uriNode != null) {
                        xsltExtractor.setUriParameter(uriNode.getNodeValue());
                    }
                    registry.put(id, xsltExtractor);
                    activeExtractors.add(id);
                }
                else if (srcType.equals("java")) {
                    // The class name is the text content of the element.
                    // getNodeValue() is always null for element nodes, so the
                    // text has to be collected from the element instead.
                    String clsName = DOMUtils.getText(srcNode);
                    if (clsName == null || clsName.trim().length() == 0) {
                        throw new InitializationException(
                            "No class name given for " + id + " extractor");
                    }
                    registry.put(id, instantiateComponent(clsName.trim()));
                    activeExtractors.add(id);
                }
                else {
                    LOG.warn("No valid type for extractor found: " + id);
                }
                LOG.info("Extractor for: " + id);
            }
        // Every failure below is reported to the caller as an
        // InitializationException that keeps the original exception as cause.
        } catch (FileNotFoundException e) {
            throw new InitializationException(e.getMessage(), e);
        } catch (XPathExpressionException e) {
            throw new InitializationException(e.getMessage(), e);
        } catch (DOMException e) {
            throw new InitializationException(e.getMessage(), e);
        } catch (ParserConfigurationException e) {
            throw new InitializationException(e.getMessage(), e);
        } catch (SAXException e) {
            throw new InitializationException(e.getMessage(), e);
        } catch (IOException e) {
            throw new InitializationException(e.getMessage(), e);
        } catch (ClassNotFoundException e) {
            throw new InitializationException(e.getMessage(), e);
        } catch (InstantiationException e) {
            throw new InitializationException(e.getMessage(), e);
        } catch (IllegalAccessException e) {
            throw new InitializationException(e.getMessage(), e);
        }
    }

    /**
     * Reads the value of an attribute that must be present on a node.
     *
     * @param xPath
     *            XPath evaluator to use
     * @param context
     *            node carrying the attribute
     * @param name
     *            attribute name (without the leading <code>@</code>)
     * @param description
     *            description of the node used in the error message
     * @return the attribute value
     * @throws XPathExpressionException
     *             if the attribute expression cannot be evaluated
     * @throws InitializationException
     *             if the attribute is missing
     */
    private static String requiredAttribute(XPath xPath, Node context,
            String name, String description)
            throws XPathExpressionException, InitializationException {
        Node attribute =
            (Node) xPath.evaluate("@" + name, context, XPathConstants.NODE);
        if (attribute == null) {
            throw new InitializationException("Missing required attribute '"
                + name + "' on " + description);
        }
        return attribute.getNodeValue();
    }

    /**
     * Maps the value of a <code>syntax</code> attribute to an RDF syntax.
     *
     * @param rdfFormat
     *            configured syntax name, compared case-insensitively
     * @param id
     *            id of the extractor, used in the error message
     * @return the matching syntax
     * @throws InitializationException
     *             if the syntax name is not one of <code>rdfxml</code>,
     *             <code>turtle</code>, <code>ntriple</code> or <code>n3</code>
     */
    private static Syntax resolveSyntax(String rdfFormat, String id)
            throws InitializationException {
        if (rdfFormat.equalsIgnoreCase("turtle")) {
            return Syntax.Turtle;
        }
        if (rdfFormat.equalsIgnoreCase("ntriple")) {
            return Syntax.Ntriples;
        }
        if (rdfFormat.equalsIgnoreCase("n3")) {
            return XsltExtractor.N3;
        }
        if (rdfFormat.equalsIgnoreCase("rdfxml")) {
            return Syntax.RdfXml;
        }
        throw new InitializationException(
            "Unknown RDF Syntax: " + rdfFormat + " for " + id + " extractor");
    }

    /**
     * Loads the named class and creates an instance through its no-argument
     * constructor.
     *
     * @param clsName
     *            fully qualified class name
     * @return the new extraction component
     * @throws ClassNotFoundException
     *             if the class cannot be located
     * @throws InstantiationException
     *             if the class cannot be instantiated
     * @throws IllegalAccessException
     *             if the class or its constructor is not accessible
     * @throws InitializationException
     *             if the class does not implement
     *             {@link HtmlExtractionComponent}
     */
    private static HtmlExtractionComponent instantiateComponent(String clsName)
            throws ClassNotFoundException, InstantiationException,
            IllegalAccessException, InitializationException {
        Object extractor = Class.forName(clsName).newInstance();
        if (!(extractor instanceof HtmlExtractionComponent)) {
            throw new InitializationException(
                clsName + " is not an HtmlExtractionComponent");
        }
        return (HtmlExtractionComponent) extractor;
    }

    /**
     * Returns the map of registered extractors. The internal map itself is
     * returned, not a copy.
     *
     * @return the registered extractors keyed by id
     */
    public HashMap<String, HtmlExtractionComponent> getRegistry() {
        return registry;
    }

    /**
     * Replaces the map of registered extractors.
     *
     * @param registry
     *            the new map of extractors keyed by id
     */
    public void setRegistry(HashMap<String, HtmlExtractionComponent> registry) {
        this.registry = registry;
    }

    /**
     * Returns the ids of the active extractors. The internal set itself is
     * returned, not a copy.
     *
     * @return the ids of the active extractors
     */
    public Set<String> getActiveExtractors() {
        return activeExtractors;
    }

    /**
     * Replaces the set of active extractor ids.
     *
     * @param activeExtractors
     *            the new set of active extractor ids
     */
    public void setActiveExtractors(HashSet<String> activeExtractors) {
        this.activeExtractors = activeExtractors;
    }

    /**
     * Loads the configuration resource named by the first argument and prints
     * the number and the ids of the active extractors to standard error.
     *
     * @param args
     *            command line arguments; the first one is the resource name
     *            of the configuration
     * @throws Exception
     *             if the registry cannot be initialized
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.err.println(
                "Usage: HtmlExtractionRegistry <configuration resource name>");
            return;
        }
        HtmlExtractionRegistry inst = new HtmlExtractionRegistry(args[0]);
        System.err.println("Active Components: " + inst.activeExtractors.size());
        for (String s : inst.activeExtractors) {
            System.err.println(s);
        }
    }

    /**
     * Not implemented: the method body is empty and a call has no effect.
     *
     * @param id
     *            id of the extractor
     * @param resourceName
     *            resource name of the extractor
     * @param type
     *            type of the extractor
     * @throws InitializationException
     *             declared, but never thrown by the current empty body
     */
    public void add(String id, String resourceName, String type)
            throws InitializationException {
    }

    /**
     * Not implemented: the method body is empty and a call has no effect.
     *
     * @param id
     *            id of the extractor
     */
    public void remove(String id) {
    }

    /**
     * Not implemented: the method body is empty and a call has no effect.
     *
     * @param id
     *            id of the extractor
     */
    public void activate(String id) {
    }

    /**
     * Not implemented: the method body is empty and a call has no effect.
     *
     * @param id
     *            id of the extractor
     */
    public void deactivate(String id) {
    }
}