blob: d3fd414b8d8937e575c530d68a09a1c883a0c1ab [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.html;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.rdf.JSONLDExtractor;
import org.apache.any23.extractor.rdf.JSONLDExtractorFactory;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.SINDICE;
import org.apache.commons.io.IOUtils;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* This extractor processes the HTML script tags used to embed blocks of data in documents. JSON-LD content can be
* embedded in HTML by placing it in a script element whose type attribute is set to <code>application/ld+json</code>,
* according to the <a href="http://www.w3.org/TR/json-ld/#embedding-json-ld-in-html-documents" >JSON-LD specification</a>.
*
*/
public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor {

    /** Value of the script <code>type</code> attribute that marks embedded JSON-LD content. */
    private static final String JSONLD_MEDIA_TYPE = "application/ld+json";

    private static final SINDICE vSINDICE = SINDICE.getInstance();

    // Per-run state. These fields are overwritten on every call to run(...).
    private IRI profile;
    private final Map<String, IRI> prefixes = new HashMap<>();
    private String documentLang;
    private JSONLDExtractor extractor;

    /**
     * Looks up every <i>SCRIPT</i> element of the given document and hands the text content of those declared as
     * <code>application/ld+json</code> to a {@link JSONLDExtractor}, which writes into <code>out</code>.
     *
     * @param extractionParameters
     *            the parameters forwarded to the JSON-LD extractor.
     * @param extractionContext
     *            the context forwarded to the JSON-LD extractor.
     * @param in
     *            the DOM of the document to process.
     * @param out
     *            the sink receiving the extraction result.
     *
     * @throws IOException
     *             if raised by the JSON-LD extractor.
     * @throws ExtractionException
     *             if raised by the JSON-LD extractor.
     */
    @Override
    public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
            ExtractionResult out) throws IOException, ExtractionException {
        // Drop prefixes collected during a previous run so they cannot leak into this document.
        prefixes.clear();
        profile = extractProfile(in);
        documentLang = getDocumentLanguage(in);
        extractLinkDefinedPrefixes(in);

        String baseProfile = vSINDICE.NS;
        if (profile != null) {
            baseProfile = profile.toString();
        }

        // The JSON-LD statements are emitted while scanning; the returned script descriptors are
        // not written to the output.
        extractJSONLDScript(in, baseProfile, extractionParameters, extractionContext, out);
    }

    /**
     * Returns the {@link Document} language if declared, <code>null</code> otherwise.
     *
     * @param in
     *            an instance of {@link Document}.
     *
     * @return the language declared, could be <code>null</code>.
     */
    private String getDocumentLanguage(Document in) {
        String lang = DomUtils.find(in, "string(/HTML/@lang)");
        if ("".equals(lang)) {
            return null;
        }
        return lang;
    }

    /**
     * Returns the profile declared on the <i>HTML</i> element as an {@link IRI}.
     *
     * @param in
     *            an instance of {@link Document}.
     *
     * @return the declared profile, or <code>null</code> if none is declared or the declared value cannot be
     *         turned into an IRI.
     */
    private IRI extractProfile(Document in) {
        String declaredProfile = DomUtils.find(in, "string(/HTML/@profile)");
        if ("".equals(declaredProfile)) {
            return null;
        }
        try {
            return SimpleValueFactory.getInstance().createIRI(declaredProfile);
        } catch (IllegalArgumentException ignored) {
            // A malformed profile attribute must not abort the extraction of the embedded JSON-LD;
            // behave as if no profile had been declared.
            return null;
        }
    }

    /**
     * It extracts prefixes defined in the <i>LINK</i> meta tags. Every <i>LINK</i> carrying both a <code>rel</code>
     * and an absolute <code>href</code> registers <code>rel</code> as a prefix for <code>href</code>.
     *
     * @param in
     *            an instance of {@link Document}.
     */
    private void extractLinkDefinedPrefixes(Document in) {
        List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
        for (Node linkNode : linkNodes) {
            NamedNodeMap attributes = linkNode.getAttributes();
            Node relNode = attributes.getNamedItem("rel");
            String rel = relNode == null ? null : relNode.getTextContent();
            Node hrefNode = attributes.getNamedItem("href");
            String href = hrefNode == null ? null : hrefNode.getTextContent();
            if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
                prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
            }
        }
    }

    /**
     * Scans all <i>SCRIPT</i> elements of the document. The text content of every script whose <code>type</code>
     * attribute equals <code>application/ld+json</code> (ignoring case) is run through a {@link JSONLDExtractor}
     * exactly once. Scripts that also carry <code>name</code> and <code>content</code> attributes are collected
     * in the returned set.
     *
     * @param in
     *            an instance of {@link Document}.
     * @param baseProfile
     *            namespace prepended to a script name that does not use a known prefix.
     * @param extractionParameters
     *            the parameters forwarded to the JSON-LD extractor.
     * @param extractionContext
     *            the context forwarded to the JSON-LD extractor.
     * @param out
     *            the sink receiving the extraction result.
     *
     * @return the collected script descriptors, never <code>null</code>.
     *
     * @throws IOException
     *             if raised by the JSON-LD extractor.
     * @throws ExtractionException
     *             if raised by the JSON-LD extractor.
     */
    private Set<JSONLDScript> extractJSONLDScript(Document in, String baseProfile,
            ExtractionParameters extractionParameters, ExtractionContext extractionContext, ExtractionResult out)
            throws IOException, ExtractionException {
        List<Node> scriptNodes = DomUtils.findAll(in, "//SCRIPT");
        Set<JSONLDScript> result = new HashSet<>();
        extractor = new JSONLDExtractorFactory().createExtractor();
        for (Node scriptNode : scriptNodes) {
            NamedNodeMap attributes = scriptNode.getAttributes();

            // Only the type attribute decides whether the script holds JSON-LD. Looking at a single
            // attribute also guarantees that a script is never fed to the extractor more than once.
            Node typeAttribute = attributes.getNamedItem("type");
            if (typeAttribute != null && JSONLD_MEDIA_TYPE.equalsIgnoreCase(typeAttribute.getTextContent())) {
                extractor.run(extractionParameters, extractionContext,
                        IOUtils.toInputStream(scriptNode.getTextContent(), StandardCharsets.UTF_8), out);
            }

            Node nameAttribute = attributes.getNamedItem("name");
            Node contentAttribute = attributes.getNamedItem("content");
            if (nameAttribute == null || contentAttribute == null) {
                continue;
            }
            String name = nameAttribute.getTextContent();
            String content = contentAttribute.getTextContent();
            String xpath = DomUtils.getXPathForNode(scriptNode);
            IRI nameAsIRI = getPrefixIfExists(name);
            if (nameAsIRI == null) {
                nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name);
            }
            result.add(new JSONLDScript(xpath, nameAsIRI, content));
        }
        return result;
    }

    /**
     * Resolves a name of the form <code>prefix.local</code> against the prefixes collected from the <i>LINK</i>
     * tags.
     *
     * @param name
     *            the name to resolve.
     *
     * @return the IRI built from the prefix namespace followed by the local part, or <code>null</code> if the name
     *         does not consist of exactly two dot-separated parts or its prefix is unknown.
     */
    private IRI getPrefixIfExists(String name) {
        String[] parts = name.split("\\.");
        // The length check must come first: splitting may yield fewer than two parts.
        if (parts.length != 2) {
            return null;
        }
        IRI namespace = prefixes.get(parts[0]);
        if (namespace == null) {
            return null;
        }
        return SimpleValueFactory.getInstance().createIRI(namespace + parts[1]);
    }

    /**
     * {@inheritDoc}
     */
    @Override
    public ExtractorDescription getDescription() {
        return EmbeddedJSONLDExtractorFactory.getDescriptionInstance();
    }

    /**
     * Descriptor of a <i>SCRIPT</i> element, identified solely by the XPath of the element.
     */
    private static class JSONLDScript {

        private final String xpath;

        /**
         * @param xpath
         *            the XPath locating the script element; the identity of this descriptor.
         * @param name
         *            the resolved name of the script; currently not retained.
         * @param content
         *            the value of the <code>content</code> attribute; currently not retained.
         */
        public JSONLDScript(String xpath, IRI name, String content) {
            this.xpath = xpath;
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            // instanceof is false for null, so no separate null check is needed.
            if (!(o instanceof JSONLDScript)) {
                return false;
            }
            JSONLDScript other = (JSONLDScript) o;
            return xpath == null ? other.xpath == null : xpath.equals(other.xpath);
        }

        @Override
        public int hashCode() {
            return xpath != null ? xpath.hashCode() : 0;
        }
    }
}