blob: 4028ac1e926dce8188aabb268507f8cdc66549f6 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.rdfa;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Stack;
import javax.xml.transform.TransformerException;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.extractor.html.DomUtils;
import org.apache.any23.rdf.RDFUtils;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Literal;
import org.eclipse.rdf4j.model.Resource;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
* This parser is able to extract <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa 1.0</a> and
* <a href="http://www.w3.org/TR/rdfa-core/">RDFa 1.1</a> statements from any <i>(X)HTML</i> document.
*
* @deprecated since 2.3 the {@link org.eclipse.rdf4j.rio.Rio} implementations are used to parse RDFa. Look at
* {@link org.apache.any23.extractor.rdf.RDFParserFactory#getRDFa10Parser} and
* {@link org.apache.any23.extractor.rdf.RDFParserFactory#getRDFa11Parser}.
*
* @author Michele Mostarda (mostarda@fbk.eu)
*/
@Deprecated
public class RDFa11Parser {
// Logger; within this class it is only used for debug-level diagnostics.
private static final Logger logger = LoggerFactory.getLogger(RDFa11Parser.class);
// Separator looked up by isCURIE / isCURIEBNode between the prefix and the reference of a CURIE.
public static final String CURIE_SEPARATOR = ":";
// Character terminating a prefix name, both in xmlns:<prefix> attributes and in @prefix declarations.
public static final char IRI_PREFIX_SEPARATOR = ':';
// Marker whose presence makes isAbsoluteIRI treat a string as an absolute IRI.
public static final String IRI_SCHEMA_SEPARATOR = "://";
// Leading path separator stripped by resolveNamespacedIRI before resolution.
public static final String IRI_PATH_SEPARATOR = "/";
// Element names (compared ignoring case) whose subject defaults to the context base.
public static final String HEAD_TAG = "HEAD";
public static final String BODY_TAG = "BODY";
// Names of the attributes read by the parser.
public static final String XMLNS_ATTRIBUTE = "xmlns";
public static final String XML_LANG_ATTRIBUTE = "xml:lang";
public static final String REL_ATTRIBUTE = "rel";
public static final String REV_ATTRIBUTE = "rev";
public static final String ABOUT_ATTRIBUTE = "about";
public static final String RESOURCE_ATTRIBUTE = "resource";
public static final String SRC_ATTRIBUTE = "src";
public static final String HREF_ATTRIBUTE = "href";
public static final String TYPE_ATTRIBUTE = "type";
// Value of the type attribute that makes isRelativeNode ignore rel/rev on a node.
public static final String ATTRIBUTE_CSS = "text/css";
// Attributes probed, in this order, by establishNewSubject; the first one present wins.
public static final String[] SUBJECT_ATTRIBUTES = { ABOUT_ATTRIBUTE, SRC_ATTRIBUTE, RESOURCE_ATTRIBUTE,
HREF_ATTRIBUTE };
public static final String PREFIX_ATTRIBUTE = "prefix";
public static final String TYPEOF_ATTRIBUTE = "typeof";
public static final String PROPERTY_ATTRIBUTE = "property";
public static final String DATATYPE_ATTRIBUTE = "datatype";
public static final String CONTENT_ATTRIBUTE = "content";
public static final String VOCAB_ATTRIBUTE = "vocab";
// TODO: introduce support for RDFa profiles. (http://www.w3.org/TR/rdfa-core/#s_profiles)
public static final String PROFILE_ATTRIBUTE = "profile";
// Datatype attribute value that selects XML literal serialization of a node.
public static final String XML_LITERAL_DATATYPE = "rdf:XMLLiteral";
// Namespace the document element is expected to declare as its default xmlns.
public static final String XMLNS_DEFAULT = "http://www.w3.org/1999/xhtml";
// Sink for warnings; set at the start of processDocument and nulled by reset().
private IssueReport issueReport;
// Base URL used to resolve non-absolute IRIs; set by processDocument and nulled by reset().
private URL documentBase;
// Prefix-to-IRI mappings pushed by updateIRIMapping as nodes are visited.
private final Stack<IRIMapping> IRIMappingStack = new Stack<>();
// @vocab declarations pushed by updateVocabulary as nodes are visited.
private final Stack<Vocabulary> vocabularyStack = new Stack<>();
// Triples created from rel/rev that still lack their second resource.
private final List<IncompleteTriple> listOfIncompleteTriples = new ArrayList<>();
// Evaluation contexts; processNode reads the top one and pushes the context for the next level.
private final Stack<EvaluationContext> evaluationContextStack = new Stack<>();
/**
 * Creates a parser whose stacks and incomplete-triple list start out empty; per-document state is
 * installed by {@link #processDocument(URL, Document, ExtractionResult)}.
 */
public RDFa11Parser() {
// default constructor
}
/**
 * Determines the base URL of a document.
 * <p>
 * The <code>href</code> of a <code>BASE</code> element is looked up first with the plain HTML path and then,
 * only if that yields the empty string, with the XHTML path. When a value is found it is resolved against
 * <code>documentURL</code>, so that both absolute values and relative ones (for example
 * <code>&lt;base href="/app/"&gt;</code>) are accepted. When no value is found the document URL itself is
 * returned.
 *
 * @param documentURL
 *            URL the document was retrieved from; also the context for resolving a relative base.
 * @param document
 *            DOM document to inspect.
 *
 * @return the declared base resolved against <code>documentURL</code>, or <code>documentURL</code> if the
 *         document declares none.
 *
 * @throws MalformedURLException
 *             if the declared base cannot be turned into a URL.
 */
protected static URL getDocumentBase(URL documentURL, Document document) throws MalformedURLException {
    // Non XHTML documents.
    String base = DomUtils.find(document, "/HTML/HEAD/BASE/@href");
    if ("".equals(base)) {
        // XHTML documents.
        base = DomUtils.find(document, "//*/h:head/h:base[position()=1]/@href");
    }
    if ("".equals(base)) {
        return documentURL;
    }
    // An absolute spec ignores the context; a relative one is resolved against the document URL.
    return new URL(documentURL, base);
}
/**
 * Splits a prefix declaration into <code>prefixID:prefixURL</code> sections, tolerating blanks between the
 * <code>prefixID:</code> part and its URL.
 * <p>
 * The declaration is split on single whitespace characters. A token ending with the prefix separator is
 * glued to the next non-empty token (or kept alone if none follows); any other non-empty token is emitted
 * unchanged.
 *
 * @param prefixesDeclaration
 *            input prefix
 *
 * @return list of extracted prefixes.
 */
protected static String[] extractPrefixSections(String prefixesDeclaration) {
    final String[] tokens = prefixesDeclaration.split("\\s");
    final List<String> sections = new ArrayList<>();
    for (int idx = 0; idx < tokens.length; idx++) {
        final String token = tokens[idx];
        if (token.isEmpty()) {
            continue;
        }
        if (token.charAt(token.length() - 1) != IRI_PREFIX_SEPARATOR) {
            sections.add(token);
            continue;
        }
        // Token is "prefix:"; look ahead for the first non-blank token holding the URL.
        int next = idx + 1;
        while (next < tokens.length && tokens[next].isEmpty()) {
            next++;
        }
        sections.add(next < tokens.length ? token + tokens[next] : token);
        // The loop increment steps past the token consumed as URL.
        idx = next;
    }
    return sections.toArray(new String[0]);
}
/**
 * Tells whether a string is considered an absolute IRI, i.e. whether it contains the
 * {@link #IRI_SCHEMA_SEPARATOR} sequence anywhere.
 *
 * @param iri
 *            string to test, must not be <code>null</code>.
 *
 * @return <code>true</code> if the schema separator occurs in <code>iri</code>.
 */
protected static boolean isAbsoluteIRI(String iri) {
    return iri.indexOf(IRI_SCHEMA_SEPARATOR) != -1;
}
/**
 * Tells whether a string has the bracketed CURIE shape <code>'[' PREFIX ':' VALUE ']'</code>: first
 * character <code>[</code>, last character <code>]</code>, and exactly one separator which is not at index 0.
 *
 * @param curie
 *            candidate string.
 *
 * @return <code>true</code> if the string matches the shape above, <code>false</code> otherwise (including
 *         blank strings).
 *
 * @throws NullPointerException
 *             if <code>curie</code> is <code>null</code>.
 */
protected static boolean isCURIE(String curie) {
    if (curie == null) {
        throw new NullPointerException("curie string cannot be null.");
    }
    if (curie.trim().isEmpty()) {
        return false;
    }
    final boolean bracketed = curie.charAt(0) == '[' && curie.charAt(curie.length() - 1) == ']';
    if (!bracketed) {
        return false;
    }
    final int firstSeparator = curie.indexOf(CURIE_SEPARATOR);
    if (firstSeparator <= 0) {
        return false;
    }
    // Exactly one separator: the first occurrence is also the last one.
    return curie.lastIndexOf(CURIE_SEPARATOR) == firstSeparator;
}
/**
 * Tells whether a string is a bracketed CURIE whose prefix is <code>_</code>, i.e. a blank node reference
 * such as <code>[_:a]</code>.
 * <p>
 * The prefix is checked directly on the string rather than through <code>split</code>, so that a degenerate
 * CURIE such as <code>[:]</code> yields <code>false</code> instead of failing on an empty split result.
 *
 * @param curie
 *            candidate string.
 *
 * @return <code>true</code> if <code>curie</code> is a CURIE with prefix <code>_</code>.
 *
 * @throws NullPointerException
 *             if <code>curie</code> is <code>null</code>.
 */
protected static boolean isCURIEBNode(String curie) {
    if (!isCURIE(curie)) {
        return false;
    }
    // isCURIE guarantees the leading '[' and a single separator inside the brackets.
    return curie.startsWith("[_" + CURIE_SEPARATOR);
}
/**
 * Tells whether a node carries a <code>rel</code> or <code>rev</code> attribute that must be processed.
 * Nodes whose <code>type</code> attribute equals {@link #ATTRIBUTE_CSS} are never considered.
 *
 * @param node
 *            node to inspect.
 *
 * @return <code>true</code> if the node is not CSS-typed and has <code>rel</code> or <code>rev</code>.
 */
protected static boolean isRelativeNode(Node node) {
    final boolean cssTyped = ATTRIBUTE_CSS.equals(DomUtils.readAttribute(node, TYPE_ATTRIBUTE));
    return !cssTyped
            && (DomUtils.hasAttribute(node, REL_ATTRIBUTE) || DomUtils.hasAttribute(node, REV_ATTRIBUTE));
}
/**
 * Builds the plain literal for a node, RDFa1.0[5.5.9.2].
 * <p>
 * The lexical value is, in order of preference: the <code>content</code> attribute; the empty string when
 * the node has no children; the trimmed text content of the node.
 *
 * @param node
 *            node to read.
 * @param currentLanguage
 *            language passed along to the literal factory, may be <code>null</code>.
 *
 * @return the literal, or <code>null</code> if the node reports a <code>null</code> text content.
 */
protected static Literal getAsPlainLiteral(Node node, String currentLanguage) {
    final String contentAttribute = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
    final String lexicalValue;
    if (contentAttribute != null) {
        lexicalValue = contentAttribute;
    } else if (!node.hasChildNodes()) {
        lexicalValue = "";
    } else {
        final String text = node.getTextContent();
        if (text == null) {
            return null;
        }
        lexicalValue = text.trim();
    }
    return RDFUtils.literal(lexicalValue, currentLanguage);
}
/**
 * Builds an XML literal for a node whose <code>datatype</code> attribute is {@link #XML_LITERAL_DATATYPE}.
 * <p>
 * The attribute value is trimmed before comparison, matching the check performed by
 * <code>getAsTypedLiteral</code>; otherwise a padded value would be rejected by both methods and silently
 * degrade to a plain literal.
 *
 * @param node
 *            node to serialize.
 *
 * @return the XML literal, or <code>null</code> if the node does not declare the XML literal datatype.
 *
 * @throws IOException
 *             if the node serialization fails.
 * @throws TransformerException
 *             if the node serialization fails.
 */
protected static Literal getAsXMLLiteral(Node node) throws IOException, TransformerException {
    final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
    if (datatype == null || !XML_LITERAL_DATATYPE.equals(datatype.trim())) {
        return null;
    }
    final String xmlSerializedNode = DomUtils.serializeToXML(node, false);
    return RDFUtils.literal(xmlSerializedNode, RDF.XMLLITERAL);
}
/**
 * Tells whether the document element declares {@link #XMLNS_DEFAULT} as its <code>xmlns</code> attribute.
 *
 * @param document
 *            document to inspect.
 *
 * @return <code>true</code> only if the attribute is non-empty and equal to {@link #XMLNS_DEFAULT}.
 */
protected static boolean isXMLNSDeclared(Document document) {
    final String declaredNamespace = document.getDocumentElement().getAttribute(XMLNS_ATTRIBUTE);
    return !declaredNamespace.isEmpty() && XMLNS_DEFAULT.equals(declaredNamespace);
}
/**
 * Runs the <a href="http://www.w3.org/TR/rdfa-syntax/#s_model">RDFa Syntax - Processing Model</a> over a
 * document.
 * <p>
 * A missing or unexpected default namespace is reported as an issue but does not stop processing. The parser
 * state is always cleared through {@link #reset()} before this method returns, whether it completes or throws.
 *
 * @param documentURL
 *            {@link java.net.URL} of the document to process
 * @param document
 *            the {@link org.w3c.dom.Document} to visit
 * @param extractionResult
 *            the {@link org.apache.any23.extractor.ExtractionResult} receiving triples and issues
 *
 * @throws RDFa11ParserException
 *             if the document base URL is invalid
 */
public void processDocument(URL documentURL, Document document, ExtractionResult extractionResult)
        throws RDFa11ParserException {
    try {
        this.issueReport = extractionResult;

        // Check RDFa1.0[4.1.3] : default XMLNS declaration.
        final boolean defaultNamespaceDeclared = isXMLNSDeclared(document);
        if (!defaultNamespaceDeclared) {
            final String warning = String.format(Locale.ROOT,
                    "The default %s namespace is expected to be declared and equal to '%s' .", XMLNS_ATTRIBUTE,
                    XMLNS_DEFAULT);
            reportError(document.getDocumentElement(), warning);
        }

        try {
            documentBase = getDocumentBase(documentURL, document);
        } catch (MalformedURLException murle) {
            throw new RDFa11ParserException("Invalid document base URL.", murle);
        }

        // RDFa1.0[5.5.1]
        final EvaluationContext rootContext = new EvaluationContext(documentBase);
        pushContext(document, rootContext);
        depthFirstNode(document, extractionResult);

        assert listOfIncompleteTriples
                .isEmpty() : "The list of incomplete triples is expected to be empty at the end of processing.";
    } finally {
        reset();
    }
}
/**
 * Resets the parser to the original state.
 * <p>
 * Drops the issue report and document base and empties every per-document collection, including the
 * vocabulary stack, so that <em>@vocab</em> declarations of one document cannot influence the next one
 * processed by the same instance.
 */
public void reset() {
    issueReport = null;
    documentBase = null;
    IRIMappingStack.clear();
    vocabularyStack.clear();
    listOfIncompleteTriples.clear();
    evaluationContextStack.clear();
}
/**
 * Pushes the vocabulary declared through <em>@vocab</em> on the current node, if any.
 * <p>
 * A value that cannot be converted to an IRI is reported as an issue and otherwise ignored.
 *
 * @param currentNode
 *            the current node.
 */
protected void updateVocabulary(Node currentNode) {
    final String declaredVocabulary = DomUtils.readAttribute(currentNode, VOCAB_ATTRIBUTE, null);
    if (declaredVocabulary != null) {
        try {
            pushVocabulary(currentNode, RDFUtils.iri(declaredVocabulary));
        } catch (Exception e) {
            final String message = String.format(Locale.ROOT, "Invalid vocabulary [%s], must be a IRI.",
                    declaredVocabulary);
            reportError(currentNode, message);
        }
    }
}
/**
 * Collects the prefix mappings declared on a node, from <code>xmlns:&lt;prefix&gt;</code> attributes first
 * and from the <em>@prefix</em> attribute afterwards, and pushes them as one entry on the mapping stack.
 * Nothing is pushed when the node has no attributes or declares no mapping.
 *
 * @param node
 *            input node.
 */
protected void updateIRIMapping(Node node) {
    final NamedNodeMap attributes = node.getAttributes();
    if (attributes == null) {
        return;
    }

    final String xmlnsPrefix = XMLNS_ATTRIBUTE + IRI_PREFIX_SEPARATOR;
    final List<PrefixMap> declaredMappings = new ArrayList<>();
    for (int idx = 0; idx < attributes.getLength(); idx++) {
        final Node attribute = attributes.item(idx);
        final String attributeName = attribute.getNodeName();
        if (!attributeName.startsWith(xmlnsPrefix)) {
            continue;
        }
        final String prefix = attributeName.substring(xmlnsPrefix.length());
        declaredMappings.add(new PrefixMap(prefix, resolveIRI(attribute.getNodeValue())));
    }

    extractPrefixes(node, declaredMappings);

    if (!declaredMappings.isEmpty()) {
        pushMappings(node, declaredMappings);
    }
}
/**
 * Returns a IRI mapping for a given prefix.
 * <p>
 * The mapping stack is searched from the most recently pushed entry down to the oldest one, so a prefix
 * redeclared on a more recently visited node takes precedence over an earlier declaration of the same
 * prefix.
 *
 * @param prefix
 *            input prefix.
 *
 * @return IRI mapping, or <code>null</code> if no entry on the stack declares the prefix.
 */
protected IRI getMapping(String prefix) {
    // A for-each over java.util.Stack runs bottom to top; walk by index to start from the top.
    for (int level = IRIMappingStack.size() - 1; level >= 0; level--) {
        final IRI mapping = IRIMappingStack.get(level).map.get(prefix);
        if (mapping != null) {
            return mapping;
        }
    }
    return null;
}
/**
 * Resolves a <em>whitespace</em> separated list of <i>CURIE</i> or <i>URI</i>.
 * <p>
 * The list is trimmed and split on runs of whitespace, so leading, trailing or repeated blanks never produce
 * empty entries (an empty entry would otherwise resolve to the document base). Entries that do not resolve
 * to an IRI are reported as issues and left out of the result.
 *
 * @param n
 *            current node.
 * @param curieOrIRIList
 *            list of CURIE/URI, may be <code>null</code>.
 * @param termAllowed
 *            whether an entry without prefix may be resolved as a term of the current vocabulary.
 *
 * @return list of resolved URIs, empty if the input is <code>null</code> or blank.
 *
 * @throws URISyntaxException
 *             if there is an error processing CURIE or URL
 */
protected IRI[] resolveCIRIeOrIRIList(Node n, String curieOrIRIList, boolean termAllowed)
        throws URISyntaxException {
    if (curieOrIRIList == null) {
        return new IRI[0];
    }
    final String trimmedList = curieOrIRIList.trim();
    if (trimmedList.isEmpty()) {
        return new IRI[0];
    }

    final List<IRI> result = new ArrayList<>();
    for (String curieOrIRIListPart : trimmedList.split("\\s+")) {
        final Resource resolved = resolveCURIEOrIRI(curieOrIRIListPart, termAllowed);
        if (resolved instanceof IRI) {
            result.add((IRI) resolved);
        } else {
            reportError(n, String.format(Locale.ROOT, "Invalid CURIE '%s' : expected IRI, found BNode.",
                    curieOrIRIListPart));
        }
    }
    return result.toArray(new IRI[0]);
}
/**
 * Turns a IRI string into a IRI; a string that is not absolute according to
 * {@link #isAbsoluteIRI(String)} is resolved against the external form of the document base.
 *
 * @param iriStr
 *            (partial) IRI string to be resolved.
 *
 * @return the resolved IRI.
 */
protected IRI resolveIRI(String iriStr) {
    if (isAbsoluteIRI(iriStr)) {
        return RDFUtils.iri(iriStr);
    }
    return RDFUtils.iri(this.documentBase.toExternalForm(), iriStr);
}
/**
 * Resolves a single <i>CURIE</i> or <i>IRI</i> string.
 * <p>
 * A bracketed CURIE is unwrapped and must declare a known prefix; an absolute IRI is taken as is; anything
 * else is resolved as a namespaced IRI, optionally allowing a vocabulary term.
 *
 * @param curieOrIRI
 *            individual of CURIE/URI to resolve
 * @param termAllowed
 *            if <code>true</code> the resolution can be a term.
 *
 * @return the resolved resource.
 */
protected Resource resolveCURIEOrIRI(String curieOrIRI, boolean termAllowed) {
    if (isCURIE(curieOrIRI)) {
        final String unwrapped = curieOrIRI.substring(1, curieOrIRI.length() - 1);
        return resolveNamespacedIRI(unwrapped, ResolutionPolicy.NSRequired);
    }
    if (isAbsoluteIRI(curieOrIRI)) {
        return resolveIRI(curieOrIRI);
    }
    final ResolutionPolicy policy;
    if (termAllowed) {
        policy = ResolutionPolicy.TermAllowed;
    } else {
        policy = ResolutionPolicy.NSNotRequired;
    }
    return resolveNamespacedIRI(curieOrIRI, policy);
}
/**
 * Pushes a context onto the evaluation context stack, associating it with the given generating node.
 *
 * @param current
 *            node recorded in the context as the one that generated it.
 * @param ec
 *            context to push; its <code>node</code> field is overwritten with <code>current</code>.
 */
private void pushContext(Node current, EvaluationContext ec) {
ec.node = current;
evaluationContextStack.push(ec);
}
/**
 * Returns the evaluation context on top of the stack without removing it.
 *
 * @return the peek evaluation context.
 */
private EvaluationContext getContext() {
return evaluationContextStack.peek();
}
/**
 * Pops the top evaluation context when <code>DomUtils.isAncestorOf(topNode, current)</code> holds, where
 * <code>topNode</code> is the node recorded in that context; otherwise the stack is left untouched.
 *
 * @param current
 *            current node.
 */
private void popContext(Node current) {
    final EvaluationContext top = evaluationContextStack.peek();
    if (!DomUtils.isAncestorOf(top.node, current)) {
        return;
    }
    evaluationContextStack.pop();
}
/**
 * Pushes a new vocabulary definition on top of the vocabulary stack.
 *
 * @param currentNode
 *            node providing the vocabulary.
 * @param vocab
 *            the vocabulary IRI.
 */
private void pushVocabulary(Node currentNode, IRI vocab) {
vocabularyStack.push(new Vocabulary(currentNode, vocab));
}
/**
 * Returns the IRI of the vocabulary on top of the vocabulary stack.
 *
 * @return the current peek vocabulary, or <code>null</code> if no vocabulary has been pushed.
 */
private IRI getVocabulary() {
    return vocabularyStack.isEmpty() ? null : vocabularyStack.peek().prefix;
}
/**
 * Pops the top vocabulary definition when <code>DomUtils.isAncestorOf(current, originatingNode)</code>
 * holds for the node that declared it. Does nothing on an empty stack.
 *
 * @param current
 *            node being left by the visit.
 */
private void popVocabulary(Node current) {
    if (!vocabularyStack.isEmpty()
            && DomUtils.isAncestorOf(current, vocabularyStack.peek().originatingNode)) {
        vocabularyStack.pop();
    }
}
/**
 * Removes from the list of incomplete triples every entry whose originating node satisfies
 * <code>DomUtils.isAncestorOf(current, originatingNode, true)</code>.
 *
 * @param current
 *            node whose pending triples must be discarded.
 */
private void purgeIncompleteTriples(Node current) {
    // Walk backwards so that removing by index does not shift the entries still to be checked.
    for (int idx = listOfIncompleteTriples.size() - 1; idx >= 0; idx--) {
        final Node origin = listOfIncompleteTriples.get(idx).originatingNode;
        if (DomUtils.isAncestorOf(current, origin, true)) {
            listOfIncompleteTriples.remove(idx);
        }
    }
}
/**
 * Notifies the issue report of a problem found on a node, at <code>WARNING</code> level.
 * <p>
 * The message is prefixed with the XPath of the node. The two location values passed along are the first
 * two entries returned by <code>DomUtils.getNodeLocation</code>, or <code>-1</code> each when no location
 * is available.
 *
 * @param n
 *            originating node.
 * @param msg
 *            human readable message.
 */
private void reportError(Node n, String msg) {
    final String nodePath = DomUtils.getXPathForNode(n);
    final String errorMsg = String.format(Locale.ROOT, "Error while processing node [%s] : '%s'", nodePath, msg);

    final int[] location = DomUtils.getNodeLocation(n);
    final int firstCoordinate;
    final int secondCoordinate;
    if (location == null) {
        firstCoordinate = -1;
        secondCoordinate = -1;
    } else {
        firstCoordinate = location[0];
        secondCoordinate = location[1];
    }
    this.issueReport.notifyIssue(IssueReport.IssueLevel.WARNING, errorMsg, firstCoordinate, secondCoordinate);
}
/**
 * Performs a <i>depth-first</i> tree visit on the given root node: the node is processed, then its
 * children are visited, then the incomplete triples tied to the node are purged.
 * <p>
 * Any exception raised while processing the node is logged at debug level and reported as an issue; the
 * visit of the children goes on regardless.
 *
 * @param node
 *            root node.
 * @param extractionResult
 *            sink for the produced triples.
 */
private void depthFirstNode(Node node, ExtractionResult extractionResult) {
    try {
        processNode(node, extractionResult);
    } catch (Exception e) {
        if (logger.isDebugEnabled()) {
            logger.debug("Error while processing node.", e);
        }
        reportError(node, e.getMessage());
    }

    final NodeList children = node.getChildNodes();
    depthFirstChildren(children, extractionResult);

    purgeIncompleteTriples(node);
}
/**
 * Performs a <i>depth-first</i> visit of every node in a children list. After each child has been visited
 * the mapping, vocabulary and context stacks are given the chance to pop, in that order.
 *
 * @param nodeList
 *            children to visit.
 * @param extractionResult
 *            sink for the produced triples.
 */
private void depthFirstChildren(NodeList nodeList, ExtractionResult extractionResult) {
    int position = 0;
    while (position < nodeList.getLength()) {
        final Node child = nodeList.item(position);
        depthFirstNode(child, extractionResult);
        popMappings(child);
        popVocabulary(child);
        popContext(child);
        position++;
    }
}
/**
 * Writes a triple on the extraction result. Each component is checked with an <code>assert</code>, so the
 * checks are only effective when assertions are enabled.
 *
 * @param s
 *            subject, expected to be non-null.
 * @param p
 *            predicate, expected to be non-null.
 * @param o
 *            object, expected to be non-null.
 * @param extractionResult
 *            sink receiving the triple.
 */
private void writeTriple(Resource s, IRI p, Value o, ExtractionResult extractionResult) {
assert s != null : "subject is null.";
assert p != null : "predicate is null.";
assert o != null : "object is null.";
extractionResult.writeTriple(s, p, o);
}
/**
 * Processes the current node on the extraction algorithm. All the steps of this algorithm are annotated with the
 * specification and section which describes it. The annotation is at form
 * <em>RDFa&lt;spec-version&gt;[&lt;section&gt;]</em>
 * <p>
 * The method works on the evaluation context currently on top of the stack. Whatever happens in the body
 * (normal completion, one of the early returns, or an exception), the <code>finally</code> section pushes the
 * context for the next level as long as the current context still has <code>recourse</code> set.
 *
 * @param currentElement
 *            node to process; only document and element nodes are evaluated.
 * @param extractionResult
 *            sink for the produced triples.
 *
 * @throws Exception
 *             any failure raised by the individual steps; it is rethrown unchanged.
 */
// TODO: add references to the RDFa 1.1 algorithm.
private void processNode(Node currentElement, ExtractionResult extractionResult) throws Exception {
final EvaluationContext currentEvaluationContext = getContext();
try {
// Other node types carry no RDFa attributes; the finally section still runs for them.
if (currentElement.getNodeType() != Node.DOCUMENT_NODE && currentElement.getNodeType() != Node.ELEMENT_NODE)
return;
// RDFa1.1[7.5.3]
updateVocabulary(currentElement);
// RDFa1.0[5.5.2] / RDFa1.1[7.5.4]
// Node currentElement = node;
updateIRIMapping(currentElement);
// RDFa1.0[5.5.3] / RDFa1.1[7.5.5]
updateLanguage(currentElement, currentEvaluationContext);
if (!isRelativeNode(currentElement)) {
// RDFa1.0[5.5.4] / RDFa1.1[7.5.6]
establishNewSubject(currentElement, currentEvaluationContext);
} else {
// RDFa1.0[5.5.5] / RDFa1.1[7.5.7]
establishNewSubjectCurrentObjectResource(currentElement, currentEvaluationContext);
}
/*
* if(currentEvaluationContext.newSubject == null) { currentEvaluationContext.newSubject =
* resolveIRI(documentBase.toExternalForm()); } assert currentEvaluationContext.newSubject != null :
* "newSubject must be not null.";
*/
// Without a subject no triple can be produced for this node.
if (currentEvaluationContext.newSubject == null)
return;
if (logger.isDebugEnabled())
logger.debug("newSubject: " + currentEvaluationContext.newSubject);
// RDFa1.0[5.5.6] / RDFa1.1[7.5.8]
final IRI[] types = getTypes(currentElement);
for (IRI type : types) {
writeTriple(currentEvaluationContext.newSubject, RDF.TYPE, type, extractionResult);
}
// RDFa1.0[5.5.7] / RDFa1.1[7.5.9]
final IRI[] rels = getRels(currentElement);
final IRI[] revs = getRevs(currentElement);
if (currentEvaluationContext.currentObjectResource != null) {
// The object is known: rel triples go subject -> object, rev triples object -> subject.
for (IRI rel : rels) {
writeTriple(currentEvaluationContext.newSubject, rel,
currentEvaluationContext.currentObjectResource, extractionResult);
}
for (IRI rev : revs) {
writeTriple(currentEvaluationContext.currentObjectResource, rev,
currentEvaluationContext.newSubject, extractionResult);
}
} else { // RDFa1.0[5.5.8] / RDFa1.1[7.5.10]
// The object is not known yet: remember the triples as incomplete, tagged with this node.
for (IRI rel : rels) {
listOfIncompleteTriples.add(new IncompleteTriple(currentElement,
currentEvaluationContext.newSubject, rel, IncompleteTripleDirection.Forward));
}
for (IRI rev : revs) {
listOfIncompleteTriples.add(new IncompleteTriple(currentElement,
currentEvaluationContext.newSubject, rev, IncompleteTripleDirection.Reverse));
}
}
// RDFa1.0[5.5.9] / RDFa1.1[7.5.11]
// Note: computing the object may clear the context's recourse flag (XML literal case).
final Value currentObject = getCurrentObject(currentElement);
final IRI[] predicates = getPredicate(currentElement);
if (currentObject != null && predicates != null) {
for (IRI predicate : predicates) {
writeTriple(currentEvaluationContext.newSubject, predicate, currentObject, extractionResult);
}
}
// RDFa1.0[5.5.10] / RDFa1.1[7.5.12]
if (!currentEvaluationContext.skipElem && currentEvaluationContext.newSubject != null) {
// Each incomplete triple decides by itself whether it applies to this node.
for (IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
incompleteTriple.produceTriple(currentElement, currentEvaluationContext.newSubject,
extractionResult);
}
}
} catch (Exception e) {
throw e;
} finally {
// RDFa1.0[5.5.11] / RDFa1.1[7.5.13]
if (currentEvaluationContext.recourse) {
// The constructor already initialises base and sets parentSubject to the resolved base.
EvaluationContext newEvaluationContext = new EvaluationContext(currentEvaluationContext.base);
if (currentEvaluationContext.skipElem) {
// Skipped element: only the language is carried over to the new context.
newEvaluationContext.language = currentEvaluationContext.language;
} else {
newEvaluationContext.base = currentEvaluationContext.base;
// Parent subject: the new subject if one was established, else the inherited one.
if (currentEvaluationContext.newSubject != null) {
newEvaluationContext.parentSubject = currentEvaluationContext.newSubject;
} else {
newEvaluationContext.parentSubject = currentEvaluationContext.parentSubject;
}
// Parent object: current object resource, else new subject, else inherited parent subject.
if (currentEvaluationContext.currentObjectResource != null) {
newEvaluationContext.parentObject = currentEvaluationContext.currentObjectResource;
} else if (currentEvaluationContext.newSubject != null) {
newEvaluationContext.parentObject = currentEvaluationContext.newSubject;
} else {
newEvaluationContext.parentObject = currentEvaluationContext.parentSubject;
}
newEvaluationContext.language = currentEvaluationContext.language;
}
pushContext(currentElement, newEvaluationContext);
}
}
}
/**
 * Extracts the IRI namespaces (prefixes) declared through the <em>@prefix</em> attribute of the current node
 * and appends them to the given list.
 * <p>
 * Malformed sections are reported as issues and skipped, so that one bad section does not prevent the
 * remaining ones from being registered. A section is malformed when it contains no prefix separator, when
 * the prefix name is empty, or when its IRI cannot be resolved.
 *
 * @param node
 *            node whose <em>@prefix</em> attribute is read.
 * @param prefixMapList
 *            list receiving one entry per valid section.
 */
private void extractPrefixes(Node node, List<PrefixMap> prefixMapList) {
    final String prefixAttribute = DomUtils.readAttribute(node, PREFIX_ATTRIBUTE, null);
    if (prefixAttribute == null) {
        return;
    }
    for (String prefixPart : extractPrefixSections(prefixAttribute)) {
        final int splitPoint = prefixPart.indexOf(IRI_PREFIX_SEPARATOR);
        if (splitPoint < 0) {
            // extractPrefixSections passes through tokens without separator; substring(0, -1) would throw.
            reportError(node,
                    String.format(Locale.ROOT,
                            "Invalid prefix declaration '%s' in prefix attribute '%s' : missing '%s' separator.",
                            prefixPart, prefixAttribute, IRI_PREFIX_SEPARATOR));
            continue;
        }
        if (splitPoint == 0) {
            reportError(node,
                    String.format(Locale.ROOT, "Invalid prefix length in prefix attribute '%s'", prefixAttribute));
            continue;
        }
        final String prefix = prefixPart.substring(0, splitPoint);
        final String iriStr = prefixPart.substring(splitPoint + 1);
        final IRI iri;
        try {
            iri = resolveIRI(iriStr);
        } catch (Exception e) {
            reportError(node, String.format(Locale.ROOT, "Resolution of prefix '%s' defines an invalid IRI: '%s'",
                    prefixAttribute, iriStr));
            continue;
        }
        prefixMapList.add(new PrefixMap(prefix, iri));
    }
}
/**
 * Overrides the language of the evaluation context with the <code>xml:lang</code> attribute of the node,
 * when the node declares one.
 *
 * @param node
 *            node to read.
 * @param currentEvaluationContext
 *            context to update.
 */
private void updateLanguage(Node node, EvaluationContext currentEvaluationContext) {
    final String declaredLanguage = DomUtils.readAttribute(node, XML_LANG_ATTRIBUTE, null);
    if (declaredLanguage == null) {
        return;
    }
    currentEvaluationContext.language = declaredLanguage;
}
/**
 * Establishes the new subject for a node without <code>rel</code>/<code>rev</code>. See <i>RDFa 1.0
 * Specification section 5.5.4</i>, <i>RDFa 1.1 Specification section 7.5.6</i>.
 * <p>
 * The subject is taken from the first available source: one of {@link #SUBJECT_ATTRIBUTES}; the context base
 * for <code>HEAD</code>/<code>BODY</code> elements; a fresh blank node when <code>typeof</code> is present;
 * otherwise the parent object of the context (possibly <code>null</code>). In the last case the context is
 * also flagged with <code>skipElem</code> when the node has a <code>property</code> attribute.
 *
 * @param node
 *            node to read.
 * @param currentEvaluationContext
 *            context to update.
 *
 * @throws URISyntaxException
 *             declared for the resolution steps.
 */
private void establishNewSubject(Node node, EvaluationContext currentEvaluationContext) throws URISyntaxException {
    for (String subjectAttribute : SUBJECT_ATTRIBUTES) {
        final String attributeValue = DomUtils.readAttribute(node, subjectAttribute, null);
        if (attributeValue == null) {
            continue;
        }
        currentEvaluationContext.newSubject = resolveCURIEOrIRI(attributeValue, false);
        return;
    }

    final boolean headOrBody = node.getNodeName().equalsIgnoreCase(HEAD_TAG)
            || node.getNodeName().equalsIgnoreCase(BODY_TAG);
    if (headOrBody) {
        currentEvaluationContext.newSubject = resolveIRI(currentEvaluationContext.base.toString());
        return;
    }

    if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
        currentEvaluationContext.newSubject = RDFUtils.bnode();
        return;
    }

    if (DomUtils.hasAttribute(node, PROPERTY_ATTRIBUTE)) {
        currentEvaluationContext.skipElem = true;
    }
    // Inherit the parent object; a null parent object leaves the subject null.
    currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
}
/**
 * Establishes the new subject and the current object resource for a node carrying
 * <code>rel</code>/<code>rev</code>.
 *
 * See <i>RDFa 1.0 Specification section 5.5.5</i>, <i>RDFa 1.1 Specification section 7.5.7</i>.
 * <p>
 * Subject sources, in order: <code>about</code>, <code>src</code>, the context base for
 * <code>HEAD</code>/<code>BODY</code>, a fresh blank node when <code>typeof</code> is present, the parent
 * object of the context when it is not <code>null</code>; if none applies the subject is left as it was.
 * Object sources, in order: <code>resource</code>, <code>href</code>; if neither is present the current
 * object resource is set to <code>null</code>.
 *
 * @param node
 *            node to read.
 * @param currentEvaluationContext
 *            context to update.
 *
 * @throws URISyntaxException
 *             declared for the resolution steps.
 */
private void establishNewSubjectCurrentObjectResource(Node node, EvaluationContext currentEvaluationContext)
        throws URISyntaxException {
    // Subject.
    final String about = DomUtils.readAttribute(node, ABOUT_ATTRIBUTE, null);
    if (about != null) {
        currentEvaluationContext.newSubject = resolveCURIEOrIRI(about, false);
    } else {
        final String src = DomUtils.readAttribute(node, SRC_ATTRIBUTE, null);
        if (src != null) {
            currentEvaluationContext.newSubject = resolveIRI(src);
        } else if (node.getNodeName().equalsIgnoreCase(HEAD_TAG)
                || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
            currentEvaluationContext.newSubject = resolveIRI(currentEvaluationContext.base.toString());
        } else if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
            currentEvaluationContext.newSubject = RDFUtils.bnode();
        } else if (currentEvaluationContext.parentObject != null) {
            currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
        }
    }

    // Object.
    final String resource = DomUtils.readAttribute(node, RESOURCE_ATTRIBUTE, null);
    if (resource != null) {
        currentEvaluationContext.currentObjectResource = resolveCURIEOrIRI(resource, false);
        return;
    }
    final String href = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
    if (href == null) {
        currentEvaluationContext.currentObjectResource = null;
    } else {
        currentEvaluationContext.currentObjectResource = resolveIRI(href);
    }
}
/**
 * Resolves the <code>typeof</code> attribute of a node; terms are allowed.
 *
 * @param node
 *            node to read.
 *
 * @return the resolved types, empty if the attribute is missing or blank.
 *
 * @throws URISyntaxException
 *             if the list cannot be resolved.
 */
private IRI[] getTypes(Node node) throws URISyntaxException {
    return resolveCIRIeOrIRIList(node, DomUtils.readAttribute(node, TYPEOF_ATTRIBUTE, null), true);
}
/**
 * Resolves the <code>rel</code> attribute of a node; terms are allowed.
 *
 * @param node
 *            node to read.
 *
 * @return the resolved relations, empty if the attribute is missing or blank.
 *
 * @throws URISyntaxException
 *             if the list cannot be resolved.
 */
private IRI[] getRels(Node node) throws URISyntaxException {
    return resolveCIRIeOrIRIList(node, DomUtils.readAttribute(node, REL_ATTRIBUTE, null), true);
}
/**
 * Resolves the <code>rev</code> attribute of a node; terms are allowed.
 *
 * @param node
 *            node to read.
 *
 * @return the resolved reverse relations, empty if the attribute is missing or blank.
 *
 * @throws URISyntaxException
 *             if the list cannot be resolved.
 */
private IRI[] getRevs(Node node) throws URISyntaxException {
    return resolveCIRIeOrIRIList(node, DomUtils.readAttribute(node, REV_ATTRIBUTE, null), true);
}
/**
 * Resolves the <code>property</code> attribute of a node; terms are allowed.
 *
 * @param node
 *            node to read.
 *
 * @return the resolved predicates, or <code>null</code> if the node has no <code>property</code> attribute.
 *
 * @throws URISyntaxException
 *             if the list cannot be resolved.
 */
private IRI[] getPredicate(Node node) throws URISyntaxException {
    final String propertyValue = DomUtils.readAttribute(node, PROPERTY_ATTRIBUTE, null);
    return propertyValue == null ? null : resolveCIRIeOrIRIList(node, propertyValue, true);
}
/**
 * Establishes the new object value. See <i>RDFa 1.0 Specification section 5.5.9</i>, <i>RDFa 1.1 Specification
 * section 7.5.11</i>.
 * <p>
 * A node with an <code>href</code> attribute yields the resolved IRI; any other node yields its literal value.
 *
 * @param node
 *            node to read.
 *
 * @return the object value, possibly <code>null</code> when no literal can be built.
 *
 * @throws URISyntaxException
 *             declared for the resolution steps.
 * @throws IOException
 *             if the XML serialization of the node fails.
 * @throws TransformerException
 *             if the XML serialization of the node fails.
 */
private Value getCurrentObject(Node node) throws URISyntaxException, IOException, TransformerException {
    final String href = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
    if (href == null) {
        return gerCurrentObjectLiteral(node);
    }
    return resolveIRI(href);
}
/**
 * Builds the literal value of a node, trying in order a typed literal, an XML literal and a plain literal in
 * the language of the current evaluation context.
 * <p>
 * When an XML literal is produced, the <code>recourse</code> flag of the current evaluation context is
 * cleared.
 *
 * @param node
 *            node to read.
 *
 * @return the literal, or <code>null</code> if none of the three forms applies.
 *
 * @throws URISyntaxException
 *             declared for the datatype resolution.
 * @throws IOException
 *             if the XML serialization of the node fails.
 * @throws TransformerException
 *             if the XML serialization of the node fails.
 */
private Literal gerCurrentObjectLiteral(Node node) throws URISyntaxException, IOException, TransformerException {
    final EvaluationContext context = getContext();

    final Literal typedLiteral = getAsTypedLiteral(node);
    if (typedLiteral != null) {
        return typedLiteral;
    }

    final Literal xmlLiteral = getAsXMLLiteral(node);
    if (xmlLiteral != null) {
        context.recourse = false;
        return xmlLiteral;
    }

    return getAsPlainLiteral(node, context.language);
}
/**
 * Returns the <code>content</code> attribute of a node, falling back to its text content when the attribute
 * is absent.
 *
 * @param node
 *            node to read.
 *
 * @return the attribute value or the text content.
 */
private static String getNodeContent(Node node) {
    final String contentAttribute = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
    return contentAttribute != null ? contentAttribute : node.getTextContent();
}
/**
 * Extracts the current typed literal from the given node. See <i>RDFa 1.0 Specification section 5.5.9.1</i>.
 * <p>
 * No typed literal is produced when the <code>datatype</code> attribute is missing, blank, or (once trimmed)
 * equal to {@link #XML_LITERAL_DATATYPE}. If the datatype does not resolve to an IRI, <code>null</code> is
 * handed to the literal factory as datatype.
 *
 * @param node
 *            node to read.
 *
 * @return the typed literal, or <code>null</code> in the cases listed above.
 *
 * @throws URISyntaxException
 *             declared for the datatype resolution.
 */
private Literal getAsTypedLiteral(Node node) throws URISyntaxException {
    final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
    if (datatype == null) {
        return null;
    }
    final String normalizedDatatype = datatype.trim();
    if (normalizedDatatype.isEmpty() || XML_LITERAL_DATATYPE.equals(normalizedDatatype)) {
        return null;
    }
    // The attribute value is resolved exactly as read, without trimming.
    final Resource resolvedDatatype = resolveCURIEOrIRI(datatype, true);
    final String lexicalValue = getNodeContent(node);
    final IRI datatypeIRI = resolvedDatatype instanceof IRI ? (IRI) resolvedDatatype : null;
    return RDFUtils.literal(lexicalValue, datatypeIRI);
}
/**
 * Pushes the given prefix declarations on the mapping stack as a single entry tied to the declaring node.
 * When the list declares a prefix more than once, the last declaration wins.
 *
 * @param sourceNode
 *            node declaring the prefixes.
 * @param prefixMapList
 *            declarations to register.
 */
private void pushMappings(Node sourceNode, List<PrefixMap> prefixMapList) {
    final Map<String, IRI> iriByPrefix = new HashMap<>();
    for (PrefixMap declaration : prefixMapList) {
        iriByPrefix.put(declaration.prefix, declaration.IRI);
    }
    final IRIMapping entry = new IRIMapping(sourceNode, iriByPrefix);
    IRIMappingStack.push(entry);
}
/**
 * Pops the top entry of the mapping stack unless <code>DomUtils.isAncestorOf(sourceNode, node)</code> holds
 * for the node that declared it. Does nothing on an empty stack.
 *
 * @param node
 *            node being left by the visit.
 */
private void popMappings(Node node) {
    if (!IRIMappingStack.isEmpty() && !DomUtils.isAncestorOf(IRIMappingStack.peek().sourceNode, node)) {
        IRIMappingStack.pop();
    }
}
/**
 * Resolves a namespaced IRI of the form <code>prefix:reference</code>, or a string without prefix according
 * to the given policy.
 * <p>
 * A single leading path separator is dropped first. Without prefix separator: {@link ResolutionPolicy#NSRequired}
 * fails; {@link ResolutionPolicy#TermAllowed} appends the string to the current vocabulary when one is
 * active; in every other case the string is appended to the document base. With a prefix separator the
 * prefix must be known to the mapping stack, and the reference is appended to the mapped IRI.
 *
 * @param mapping
 *            string to resolve.
 * @param resolutionPolicy
 *            policy applied to strings without prefix.
 *
 * @return the resolved resource.
 *
 * @throws IllegalArgumentException
 *             if a prefix is required but missing, if the prefix is unknown, or if the expanded CURIE is not
 *             a valid URI (in which case the syntax error is attached as cause).
 */
private Resource resolveNamespacedIRI(String mapping, ResolutionPolicy resolutionPolicy) {
    if (mapping.indexOf(IRI_PATH_SEPARATOR) == 0) { // Begins with '/'
        mapping = mapping.substring(1);
    }

    final int prefixSeparatorIndex = mapping.indexOf(':');
    if (prefixSeparatorIndex == -1) { // there is no prefix separator.
        if (resolutionPolicy == ResolutionPolicy.NSRequired) {
            throw new IllegalArgumentException(
                    String.format(Locale.ROOT, "Invalid mapping string [%s], must declare a prefix.", mapping));
        }
        if (resolutionPolicy == ResolutionPolicy.TermAllowed) {
            final IRI currentVocabulary = getVocabulary();
            // Mapping is a TERM.
            if (currentVocabulary != null) {
                return resolveIRI(currentVocabulary.toString() + mapping);
            }
        }
        return resolveIRI(documentBase.toString() + mapping);
    }

    final String prefix = mapping.substring(0, prefixSeparatorIndex);
    final IRI curieMapping = getMapping(prefix);
    if (curieMapping == null) {
        throw new IllegalArgumentException(String.format(Locale.ROOT, "Cannot map prefix '%s'", prefix));
    }

    final String candidateCURIEStr = curieMapping.toString() + mapping.substring(prefixSeparatorIndex + 1);
    final java.net.URI candidateCURIE;
    try {
        candidateCURIE = new java.net.URI(candidateCURIEStr);
    } catch (URISyntaxException syntaxError) {
        // Keep the syntax error as cause so the offending position is not lost.
        throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid CURIE '%s'", candidateCURIEStr),
                syntaxError);
    }
    return resolveIRI(candidateCURIE.isAbsolute() ? candidateCURIE.toString()
            : documentBase.toString() + candidateCURIE.toString());
}
/**
 * The resolution policy provided to the method {@link #resolveNamespacedIRI(String, ResolutionPolicy)}. It
 * only matters for strings that contain no prefix separator:
 * <ul>
 * <li><code>NSNotRequired</code>: the string is appended to the document base;</li>
 * <li><code>NSRequired</code>: the resolution fails with an {@link IllegalArgumentException};</li>
 * <li><code>TermAllowed</code>: the string is appended to the current vocabulary if one is active, otherwise
 * to the document base.</li>
 * </ul>
 */
enum ResolutionPolicy {
NSNotRequired, NSRequired, TermAllowed
}
/**
 * Defines an evaluation context: the state the processing of a node reads from and writes to.
 */
private class EvaluationContext {
    // Node that generated this context; assigned when the context is pushed.
    private Node node;
    private URL base;
    private Resource parentSubject;
    private Value parentObject;
    private String language;
    // Whether a context for the next level has to be pushed after the node has been processed.
    private boolean recourse = true;
    private boolean skipElem;
    private Resource newSubject;
    private Resource currentObjectResource;

    /**
     * Sections <em>RDFa1.0[5.5]</em>, <em>RDFa1.0[5.5.1]</em>, <em>RDFa1.1[7.5.1]</em> .
     * <p>
     * The parent subject starts as the IRI of the base; <code>recourse</code> starts as <code>true</code>;
     * every other field keeps its default (<code>null</code> / <code>false</code>).
     *
     * @param base
     *            base URL of the context.
     */
    EvaluationContext(URL base) {
        this.base = base;
        this.parentSubject = resolveIRI(base.toExternalForm());
    }
}
/**
 * Defines a prefix mapping: an immutable pair of a prefix name and the IRI it stands for.
 */
private static class PrefixMap {
// Prefix name, without the trailing separator.
final String prefix;
// IRI the prefix expands to.
final IRI IRI;
public PrefixMap(String prefix, IRI IRI) {
this.prefix = prefix;
this.IRI = IRI;
}
}
/**
 * Defines a IRI mapping: the set of prefix declarations contributed by a single node.
 */
private static class IRIMapping {
// Node that declared the prefixes.
final Node sourceNode;
// Declared IRIs, keyed by prefix name.
final Map<String, IRI> map;
public IRIMapping(Node sourceNode, Map<String, IRI> map) {
this.sourceNode = sourceNode;
this.map = map;
}
}
/**
 * Defines the direction of an {@link IncompleteTriple}: <code>Forward</code> completes the triple with the
 * stored resource as subject, <code>Reverse</code> with the stored resource as object.
 */
private enum IncompleteTripleDirection {
Forward, Reverse
}
/**
 * Defines an incomplete triple: a resource and a predicate waiting for the second resource of the statement.
 */
private static class IncompleteTriple {
    final Node originatingNode;
    final Resource subject;
    final IRI predicate;
    final IncompleteTripleDirection direction;

    /**
     * @param originatingNode
     *            node that generated the incomplete triple.
     * @param subject
     *            resource already known.
     * @param predicate
     *            predicate of the triple.
     * @param direction
     *            side of the triple on which the known resource goes.
     *
     * @throws IllegalArgumentException
     *             if any argument is <code>null</code>.
     */
    public IncompleteTriple(Node originatingNode, Resource subject, IRI predicate,
            IncompleteTripleDirection direction) {
        final boolean anyMissing = originatingNode == null || subject == null || predicate == null
                || direction == null;
        if (anyMissing) {
            throw new IllegalArgumentException();
        }
        this.originatingNode = originatingNode;
        this.subject = subject;
        this.predicate = predicate;
        this.direction = direction;
    }

    /**
     * Completes the triple with the given resource and writes it, provided that
     * <code>DomUtils.isAncestorOf(originatingNode, resourceNode, true)</code> holds.
     *
     * @param resourceNode
     *            node providing the resource.
     * @param r
     *            resource completing the triple.
     * @param extractionResult
     *            sink receiving the triple.
     *
     * @return <code>true</code> if a triple has been written, <code>false</code> if the ancestor check failed.
     *
     * @throws IllegalArgumentException
     *             if the ancestor check passes and <code>r</code> is <code>null</code>.
     */
    public boolean produceTriple(Node resourceNode, Resource r, ExtractionResult extractionResult) {
        if (!DomUtils.isAncestorOf(originatingNode, resourceNode, true)) {
            return false;
        }
        if (r == null) {
            throw new IllegalArgumentException();
        }
        if (direction == IncompleteTripleDirection.Forward) {
            extractionResult.writeTriple(subject, predicate, r);
        } else if (direction == IncompleteTripleDirection.Reverse) {
            extractionResult.writeTriple(r, predicate, subject);
        } else {
            throw new IllegalStateException();
        }
        return true;
    }
}
/**
 * Defines a vocabulary object: a vocabulary IRI together with the node that declared it.
 */
private static class Vocabulary {
// Node carrying the vocabulary declaration.
final Node originatingNode;
// IRI of the vocabulary; terms are appended to it during resolution.
final IRI prefix;
public Vocabulary(Node originatingNode, IRI prefix) {
this.originatingNode = originatingNode;
this.prefix = prefix;
}
}
}