blob: 6fc58fec3b82c1795e11b1909a8f7c4aa40a35fe [file] [log] [blame]
package org.apache.stanbol.enhancer.engines.htmlextractor.impl;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import javax.xml.transform.Source;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.apache.clerezza.rdf.core.Graph;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.serializedform.Parser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
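/**
 * {@link HtmlExtractionComponent} that applies an XSLT stylesheet to an HTML
 * DOM document and parses the transformation output as RDF.
 *
 * @author <a href="mailto:kasper@dfki.de">Walter Kasper</a>
 */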
public class XsltExtractor implements HtmlExtractionComponent {

    /**
     * Logger used to dump the raw transformation output at debug level.
     */
    private static final Logger LOG =
        LoggerFactory.getLogger(XsltExtractor.class);

    /** Name of the stylesheet parameter that receives the document id. */
    private String uriParameter = "uri";

    /** Compiled stylesheet; created by {@link #initialize(TransformerFactory)}. */
    private Transformer transformer;

    /** Identifier of this extractor. */
    private String id;

    /** Location of the XSLT stylesheet. */
    private URI source;

    /** Format name handed to the RDF parser for the transformation output. */
    private String syntax = "application/rdf+xml";

    /**
     * Creates an unconfigured extractor. The id and source must be set and
     * {@link #initialize(TransformerFactory)} called before
     * {@link #extract(String, Document, Map, MGraph)} can be used.
     */
    public XsltExtractor() {
    }

    /**
     * Creates an extractor for a stylesheet found on the class path and
     * initializes it immediately.
     *
     * @param id the identifier of this extractor
     * @param fileName class path resource name of the XSLT stylesheet
     * @param factory the factory used to compile the stylesheet; if
     *        {@code null} a default factory is created
     * @throws InitializationException if the resource cannot be found, its
     *         location is not a valid URI, or the stylesheet cannot be compiled
     */
    public XsltExtractor(String id, String fileName, TransformerFactory factory)
            throws InitializationException {
        this.id = id;
        if (fileName == null) {
            throw new InitializationException("Missing XSLT resource name");
        }
        // getResource() returns null for an unknown resource; report that as an
        // initialization failure instead of failing with a NullPointerException.
        java.net.URL resource = getClass().getClassLoader().getResource(fileName);
        if (resource == null) {
            throw new InitializationException(
                "XSLT resource not found on class path: " + fileName);
        }
        try {
            this.source = resource.toURI();
        } catch (URISyntaxException e) {
            throw new InitializationException(e.getMessage(), e);
        }
        initialize(factory);
    }

    /**
     * @return the name of the stylesheet parameter that receives the document id
     */
    public String getUriParameter() {
        return uriParameter;
    }

    /**
     * @param uriParameter the name of the stylesheet parameter that receives
     *        the document id
     */
    public void setUriParameter(String uriParameter) {
        this.uriParameter = uriParameter;
    }

    /**
     * @return the transformer, or {@code null} if none has been created or set
     */
    public Transformer getTransformer() {
        return transformer;
    }

    /**
     * @param transformer the transformer to use for extraction
     */
    public void setTransformer(Transformer transformer) {
        this.transformer = transformer;
    }

    /**
     * @return the identifier of this extractor
     */
    public String getId() {
        return id;
    }

    /**
     * @param id the identifier of this extractor
     */
    public void setId(String id) {
        this.id = id;
    }

    /**
     * @return the location of the XSLT stylesheet
     */
    public URI getSource() {
        return source;
    }

    /**
     * @param source the location of the XSLT stylesheet
     */
    public void setSource(URI source) {
        this.source = source;
    }

    /**
     * @return the syntax
     */
    public String getSyntax() {
        return syntax;
    }

    /**
     * @param syntax the syntax to set
     */
    public void setSyntax(String syntax) {
        this.syntax = syntax;
    }

    /**
     * Transforms the document with the stylesheet, parses the output using the
     * configured syntax and adds the resulting triples to {@code result}.
     * The supplied parameter map is not modified.
     *
     * @param id the document id, passed to the stylesheet under the name
     *        returned by {@link #getUriParameter()}
     * @param doc the DOM document to transform
     * @param params additional stylesheet parameters, may be {@code null}
     * @param result the graph that receives the extracted triples
     * @throws ExtractorException if the transformation fails or its output
     *         cannot be read
     * @throws IllegalStateException if no transformer is available
     */
    public synchronized void extract(String id, Document doc, Map<String, Object> params,
            MGraph result)
            throws ExtractorException {
        checkInitialized();
        // Work on a private copy so the caller's map is neither modified nor
        // required to be modifiable.
        Map<String, Object> effectiveParams = new HashMap<String, Object>();
        if (params != null) {
            effectiveParams.putAll(params);
        }
        effectiveParams.put(this.uriParameter, id);
        initTransformerParameters(effectiveParams);

        Source domSource = new DOMSource(doc);
        ByteArrayOutputStream buffer = new ByteArrayOutputStream(8192);
        StreamResult output = new StreamResult(buffer);
        try {
            this.transformer.transform(domSource, output);
            if (LOG.isDebugEnabled()) {
                String rdf = buffer.toString("UTF-8");
                LOG.debug(rdf);
            }
            InputStream reader = new ByteArrayInputStream(buffer.toByteArray());
            Parser rdfParser = Parser.getInstance();
            Graph graph = rdfParser.parse(reader, this.syntax);
            result.addAll(graph);
        } catch (TransformerException e) {
            throw new ExtractorException(e.getMessage(), e);
        } catch (IOException e) {
            throw new ExtractorException(e.getMessage(), e);
        }
    }

    /**
     * Compiles the stylesheet at {@link #getSource()} into the transformer
     * used by this extractor.
     *
     * @param factory the factory used to compile the stylesheet; if
     *        {@code null} a new factory with a {@link BundleURIResolver} is
     *        created
     * @throws InitializationException if source or id is missing or the
     *         stylesheet cannot be compiled
     */
    public void initialize(TransformerFactory factory)
            throws InitializationException {
        if (source == null || id == null) {
            throw new InitializationException("Missing source or id");
        }
        if (factory == null) {
            factory = TransformerFactory.newInstance();
            factory.setURIResolver(new BundleURIResolver());
        }
        // The String constructor already records the value as the system id.
        StreamSource xsltSource = new StreamSource(source.toString());
        try {
            transformer = factory.newTransformer(xsltSource);
        } catch (TransformerConfigurationException e) {
            throw new InitializationException(e.getMessage(), e);
        }
    }

    /**
     * Replaces all parameters of the transformer with the given ones.
     *
     * @param params the parameters to set; if {@code null} the existing
     *        parameters are only cleared
     * @throws IllegalStateException if no transformer is available
     */
    public void initTransformerParameters(Map<String, Object> params) {
        checkInitialized();
        transformer.clearParameters();
        if (params != null) {
            for (Map.Entry<String, Object> param : params.entrySet()) {
                transformer.setParameter(param.getKey(), param.getValue());
            }
        }
    }

    /**
     * Fails with a descriptive message when no transformer has been created
     * or set yet.
     */
    private void checkInitialized() {
        if (transformer == null) {
            throw new IllegalStateException(
                "XsltExtractor '" + id + "' has no transformer; call initialize() first");
        }
    }
}