blob: fc17b4fa0427f4cdbe63d37230ac7ed5bf7e4754 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.html;
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.rdf.RDFParserFactory;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.rio.RDFParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
/**
 * Extractor for <i>Turtle/N3</i> content embedded within <i>HTML</i>
 * <i>script</i> tags.
 *
 * See specification draft <a href="http://esw.w3.org/N3inHTML">here</a>.
 *
 * @author Michele Mostarda (mostarda@fbk.eu)
 */
public class TurtleHTMLExtractor implements Extractor.TagSoupDOMExtractor {

    /**
     * XPath queries selecting the <i>script</i> elements to parse, listed in
     * the order in which they are evaluated and processed.
     */
    private static final String[] SCRIPT_XPATHS = {
        ".//SCRIPT[contains(@type,'text/turtle')]",
        ".//SCRIPT[contains(@type,'text/n3')]",
        ".//SCRIPT[contains(@type,'text/plain')]"
    };

    /**
     * Looks up the <i>script</i> elements matching each of the supported type
     * queries and feeds their text content to a Turtle parser.
     *
     * The parser is created lazily (only when at least one matching node is
     * found) and is scoped to this invocation, so it is always built from the
     * context and result handed to this very call rather than from those of an
     * earlier one.
     *
     * @param extractionParameters the extraction parameters (not read here).
     * @param extractionContext the context providing the document IRI.
     * @param in the DOM of the HTML document to inspect.
     * @param out the extraction result receiving triples and issues.
     */
    @Override
    public void run(
            ExtractionParameters extractionParameters,
            ExtractionContext extractionContext,
            Document in,
            ExtractionResult out
    ) throws IOException, ExtractionException {
        final HTMLDocument htmlDocument = new HTMLDocument(in);
        final IRI documentIRI = extractionContext.getDocumentIRI();

        RDFParser turtleParser = null;
        for (String xpath : SCRIPT_XPATHS) {
            final List<Node> scriptNodes = htmlDocument.findAll(xpath);
            if (scriptNodes.isEmpty()) {
                continue;
            }
            if (turtleParser == null) {
                turtleParser = RDFParserFactory.getInstance()
                        .getTurtleParserInstance(true, false, extractionContext, out);
            }
            processScriptNodes(turtleParser, documentIRI, out, scriptNodes);
        }
    }

    @Override
    public ExtractorDescription getDescription() {
        return TurtleHTMLExtractorFactory.getDescriptionInstance();
    }

    /**
     * Processes a list of <i>html script</i> nodes retrieving the N3 / Turtle content.
     *
     * @param turtleParser the parser used to digest the content of every node.
     * @param documentIRI the IRI of the original HTML document.
     * @param er the extraction result used to report issues.
     * @param ns the list of script nodes.
     */
    private void processScriptNodes(RDFParser turtleParser, IRI documentIRI, ExtractionResult er, List<Node> ns) {
        for (Node n : ns) {
            processScriptNode(turtleParser, documentIRI, n, er);
        }
    }

    /**
     * Processes a single <i>html script</i> node.
     *
     * The base IRI handed to the parser is the document IRI, suffixed with
     * <code>#id</code> when the node carries an <i>id</i> attribute. Failures
     * are reported on the extraction result instead of being propagated, so
     * the remaining nodes are still processed.
     *
     * @param turtleParser the parser used to digest node content.
     * @param documentIRI the IRI of the original HTML document.
     * @param n the script node.
     * @param er the extraction result used to report issues.
     */
    private void processScriptNode(RDFParser turtleParser, IRI documentIRI, Node n, ExtractionResult er) {
        final Node idAttribute = n.getAttributes().getNamedItem("id");
        final String fragment = idAttribute == null ? "" : "#" + idAttribute.getTextContent();
        final String graphName = documentIRI.stringValue() + fragment;
        try {
            turtleParser.parse(new StringReader(n.getTextContent()), graphName);
        } catch (RDFParseException rdfpe) {
            // Keep the parser's own explanation: the node path alone does not say what is wrong.
            er.notifyIssue(
                    IssueReport.IssueLevel.ERROR,
                    String.format(Locale.ROOT,
                            "An error occurred while parsing turtle content within script node: %s (%s)",
                            Arrays.toString(DomUtils.getXPathListForNode(n)),
                            rdfpe.getMessage()
                    ),
                    rdfpe.getLineNumber(), rdfpe.getColumnNumber()
            );
        } catch (Exception e) {
            // Best effort: report and go on, but do not drop the cause of the failure.
            er.notifyIssue(
                    IssueReport.IssueLevel.ERROR,
                    String.format(Locale.ROOT, "An error occurred while processing RDF data. Cause: %s", e),
                    -1, -1
            );
        }
    }
}