| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.any23.extractor.rdfa; |
| |
| import java.io.IOException; |
| import java.net.MalformedURLException; |
| import java.net.URISyntaxException; |
| import java.net.URL; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Stack; |
| import javax.xml.transform.TransformerException; |
| import org.apache.any23.extractor.ExtractionResult; |
| import org.apache.any23.extractor.IssueReport; |
| import org.apache.any23.extractor.html.DomUtils; |
| import org.apache.any23.rdf.RDFUtils; |
| import org.eclipse.rdf4j.model.IRI; |
| import org.eclipse.rdf4j.model.Literal; |
| import org.eclipse.rdf4j.model.Resource; |
| import org.eclipse.rdf4j.model.Value; |
| import org.eclipse.rdf4j.model.vocabulary.RDF; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| import org.w3c.dom.Document; |
| import org.w3c.dom.NamedNodeMap; |
| import org.w3c.dom.Node; |
| import org.w3c.dom.NodeList; |
| |
| /** |
| * This parser is able to extract <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa 1.0</a> and |
| * <a href="http://www.w3.org/TR/rdfa-core/">RDFa 1.1</a> statements from any <i>(X)HTML</i> document. |
| * @deprecated since 2.3 the {@link org.eclipse.rdf4j.rio.Rio} implementations |
| * are used to parse RDFa. Look at {@link org.apache.any23.extractor.rdf.RDFParserFactory#getRDFa10Parser} |
| * and {@link org.apache.any23.extractor.rdf.RDFParserFactory#getRDFa11Parser}. |
| * |
| * @author Michele Mostarda (mostarda@fbk.eu) |
| */ |
| @Deprecated |
| public class RDFa11Parser { |
| |
| private static final Logger logger = LoggerFactory.getLogger(RDFa11Parser.class); |
| |
| public static final String CURIE_SEPARATOR = ":"; |
| public static final char IRI_PREFIX_SEPARATOR = ':'; |
| public static final String IRI_SCHEMA_SEPARATOR = "://"; |
| public static final String IRI_PATH_SEPARATOR = "/"; |
| |
| public static final String HEAD_TAG = "HEAD"; |
| public static final String BODY_TAG = "BODY"; |
| |
| public static final String XMLNS_ATTRIBUTE = "xmlns"; |
| public static final String XML_LANG_ATTRIBUTE = "xml:lang"; |
| |
| public static final String REL_ATTRIBUTE = "rel"; |
| public static final String REV_ATTRIBUTE = "rev"; |
| |
| public static final String ABOUT_ATTRIBUTE = "about"; |
| public static final String RESOURCE_ATTRIBUTE = "resource"; |
| public static final String SRC_ATTRIBUTE = "src"; |
| public static final String HREF_ATTRIBUTE = "href"; |
| |
| public static final String TYPE_ATTRIBUTE = "type"; |
| public static final String ATTRIBUTE_CSS = "text/css"; |
| |
| public static final String[] SUBJECT_ATTRIBUTES = { |
| ABOUT_ATTRIBUTE, |
| SRC_ATTRIBUTE, |
| RESOURCE_ATTRIBUTE, |
| HREF_ATTRIBUTE |
| }; |
| |
| public static final String PREFIX_ATTRIBUTE = "prefix"; |
| public static final String TYPEOF_ATTRIBUTE = "typeof"; |
| public static final String PROPERTY_ATTRIBUTE = "property"; |
| public static final String DATATYPE_ATTRIBUTE = "datatype"; |
| public static final String CONTENT_ATTRIBUTE = "content"; |
| public static final String VOCAB_ATTRIBUTE = "vocab"; |
| // TODO: introduce support for RDFa profiles. (http://www.w3.org/TR/rdfa-core/#s_profiles) |
| public static final String PROFILE_ATTRIBUTE = "profile"; |
| |
| public static final String XML_LITERAL_DATATYPE = "rdf:XMLLiteral"; |
| |
| public static final String XMLNS_DEFAULT = "http://www.w3.org/1999/xhtml"; |
| |
| private IssueReport issueReport; |
| |
| private URL documentBase; |
| |
| private final Stack<IRIMapping> IRIMappingStack = new Stack<>(); |
| |
| private final Stack<Vocabulary> vocabularyStack = new Stack<>(); |
| |
| private final List<IncompleteTriple> listOfIncompleteTriples = new ArrayList<>(); |
| |
| private final Stack<EvaluationContext> evaluationContextStack = new Stack<>(); |
| |
| public RDFa11Parser() { |
| //default constructor |
| } |
| |
| protected static URL getDocumentBase(URL documentURL, Document document) throws MalformedURLException { |
| String base; |
| base = DomUtils.find(document, "/HTML/HEAD/BASE/@href"); // Non XHTML documents. |
| if( ! "".equals(base) ) |
| return new URL(base); |
| base = DomUtils.find(document, "//*/h:head/h:base[position()=1]/@href"); // XHTML documents. |
| if( ! "".equals(base) ) |
| return new URL(base); |
| return documentURL; |
| } |
| |
| /** |
| * Given a prefix declaration returns a list of <code>prefixID:prefixURL</code> strings |
| * normalizing blanks where present. |
| * |
| * @param prefixesDeclaration input prefix |
| * @return list of extracted prefixes. |
| */ |
| protected static String[] extractPrefixSections(String prefixesDeclaration) { |
| final String[] parts = prefixesDeclaration.split("\\s"); |
| final List<String> out = new ArrayList<>(); |
| int i = 0; |
| while(i < parts.length) { |
| final String part = parts[i]; |
| if(part.length() == 0) { |
| i++; |
| continue; |
| } |
| if(part.charAt( part.length() -1 ) == IRI_PREFIX_SEPARATOR) { |
| i++; |
| while(i < parts.length && parts[i].length() == 0) |
| i++; |
| out.add( part + (i < parts.length ? parts[i] : "") ); |
| i++; |
| } else { |
| out.add(parts[i]); |
| i++; |
| } |
| } |
| return out.toArray( new String[out.size()] ); |
| } |
| |
| protected static boolean isAbsoluteIRI(String iri) { |
| return iri.contains(IRI_SCHEMA_SEPARATOR); |
| } |
| |
| protected static boolean isCURIE(String curie) { |
| if(curie == null) { |
| throw new NullPointerException("curie string cannot be null."); |
| } |
| if(curie.trim().length() == 0) |
| return false; |
| |
| // '[' PREFIX ':' VALUE ']' |
| if( curie.charAt(0) != '[' || curie.charAt(curie.length() -1) != ']') |
| return false; |
| int separatorIndex = curie.indexOf(CURIE_SEPARATOR); |
| return separatorIndex > 0 && curie.indexOf(CURIE_SEPARATOR, separatorIndex + 1) == -1; |
| } |
| |
| protected static boolean isCURIEBNode(String curie) { |
| return isCURIE(curie) && curie.substring(1, curie.length() -1).split(CURIE_SEPARATOR)[0].equals("_"); |
| } |
| |
| protected static boolean isRelativeNode(Node node) { |
| if( ATTRIBUTE_CSS.equals( DomUtils.readAttribute(node, TYPE_ATTRIBUTE) ) ) |
| return false; |
| return DomUtils.hasAttribute(node, REL_ATTRIBUTE) || DomUtils.hasAttribute(node, REV_ATTRIBUTE); |
| } |
| |
| // RDFa1.0[5.5.9.2] |
| protected static Literal getAsPlainLiteral(Node node, String currentLanguage) { |
| final String content = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null); |
| if(content != null) |
| return RDFUtils.literal(content, currentLanguage); |
| |
| if(! node.hasChildNodes() ) |
| return RDFUtils.literal("", currentLanguage); |
| |
| final String nodeTextContent = node.getTextContent(); |
| return nodeTextContent == null ? null : RDFUtils.literal(nodeTextContent.trim(), currentLanguage); |
| } |
| |
| protected static Literal getAsXMLLiteral(Node node) throws IOException, TransformerException { |
| final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null); |
| if(! XML_LITERAL_DATATYPE.equals(datatype)) |
| return null; |
| |
| final String xmlSerializedNode = DomUtils.serializeToXML(node, false); |
| return RDFUtils.literal(xmlSerializedNode, RDF.XMLLITERAL); |
| } |
| |
| protected static boolean isXMLNSDeclared(Document document) { |
| final String attributeValue = document.getDocumentElement().getAttribute(XMLNS_ATTRIBUTE); |
| if(attributeValue.length() == 0) |
| return false; |
| return XMLNS_DEFAULT.equals(attributeValue); |
| } |
| |
| /** |
| * <a href="http://www.w3.org/TR/rdfa-syntax/#s_model">RDFa Syntax - Processing Model</a>. |
| * |
| * @param documentURL {@link java.net.URL} of the document to process |
| * @param extractionResult a {@link org.apache.any23.extractor.ExtractionResult} to populate |
| * @param document the {@link org.w3c.dom.Document} to populate with parse content |
| * @throws RDFa11ParserException if there is an error parsing the document |
| */ |
| public void processDocument(URL documentURL, Document document, ExtractionResult extractionResult) |
| throws RDFa11ParserException { |
| try { |
| this.issueReport = extractionResult; |
| |
| // Check RDFa1.0[4.1.3] : default XMLNS declaration. |
| if( ! isXMLNSDeclared(document)) { |
| reportError( |
| document.getDocumentElement(), |
| String.format(Locale.ROOT, |
| "The default %s namespace is expected to be declared and equal to '%s' .", |
| XMLNS_ATTRIBUTE, XMLNS_DEFAULT |
| ) |
| ); |
| } |
| |
| try { |
| documentBase = getDocumentBase(documentURL, document); |
| } catch (MalformedURLException murle) { |
| throw new RDFa11ParserException("Invalid document base URL.", murle); |
| } |
| |
| // RDFa1.0[5.5.1] |
| pushContext(document, new EvaluationContext(documentBase)); |
| |
| depthFirstNode(document, extractionResult); |
| |
| assert listOfIncompleteTriples.isEmpty() |
| : |
| "The list of incomplete triples is expected to be empty at the end of processing."; |
| } finally { |
| reset(); |
| } |
| } |
| |
| /** |
| * Resets the parser to the original state. |
| */ |
| public void reset() { |
| issueReport = null; |
| documentBase = null; |
| IRIMappingStack.clear(); |
| listOfIncompleteTriples.clear(); |
| evaluationContextStack.clear(); |
| } |
| |
| /** |
| * Updates the vocabulary context with possible <em>@vocab</em> declarations. |
| * |
| * @param currentNode the current node. |
| */ |
| protected void updateVocabulary(Node currentNode) { |
| final String vocabularyStr = DomUtils.readAttribute(currentNode, VOCAB_ATTRIBUTE, null); |
| if(vocabularyStr == null) |
| return; |
| try { |
| pushVocabulary(currentNode, RDFUtils.iri(vocabularyStr)); |
| } catch (Exception e) { |
| reportError(currentNode, String.format(Locale.ROOT, "Invalid vocabulary [%s], must be a IRI.", vocabularyStr)); |
| } |
| } |
| |
| /** |
| * Updates the IRI mapping with the XMLNS attributes declared in the current node. |
| * |
| * @param node input node. |
| */ |
| protected void updateIRIMapping(Node node) { |
| final NamedNodeMap attributes = node.getAttributes(); |
| if (null == attributes) |
| return; |
| |
| Node attribute; |
| final List<PrefixMap> prefixMapList = new ArrayList<PrefixMap>(); |
| final String namespacePrefix = XMLNS_ATTRIBUTE + IRI_PREFIX_SEPARATOR; |
| for (int a = 0; a < attributes.getLength(); a++) { |
| attribute = attributes.item(a); |
| if (attribute.getNodeName().startsWith(namespacePrefix)) { |
| prefixMapList.add( |
| new PrefixMap( |
| attribute.getNodeName().substring(namespacePrefix.length()), |
| resolveIRI(attribute.getNodeValue()) |
| ) |
| ); |
| } |
| } |
| |
| extractPrefixes(node, prefixMapList); |
| |
| if(prefixMapList.size() == 0) |
| return; |
| pushMappings( |
| node, |
| prefixMapList |
| ); |
| } |
| |
| /** |
| * Returns a IRI mapping for a given prefix. |
| * |
| * @param prefix input prefix. |
| * @return IRI mapping. |
| */ |
| protected IRI getMapping(String prefix) { |
| for (IRIMapping IRIMapping : IRIMappingStack) { |
| final IRI mapping = IRIMapping.map.get(prefix); |
| if (mapping != null) { |
| return mapping; |
| } |
| } |
| return null; |
| } |
| |
| /** |
| * Resolves a <em>whitelist</em> separated list of <i>CURIE</i> or <i>URI</i>. |
| * |
| * @param n current node. |
| * @param curieOrIRIList list of CURIE/URI. |
| * @param termAllowed determine whether the term should be whitelisted. |
| * @return list of resolved URIs. |
| * @throws URISyntaxException if there is an error processing CURIE or URL |
| */ |
| protected IRI[] resolveCIRIeOrIRIList(Node n, String curieOrIRIList, boolean termAllowed) |
| throws URISyntaxException { |
| if(curieOrIRIList == null || curieOrIRIList.trim().length() == 0) |
| return new IRI[0]; |
| |
| final String[] curieOrIRIListParts = curieOrIRIList.split("\\s"); |
| final List<IRI> result = new ArrayList<>(); |
| Resource curieOrIRI; |
| for(String curieORIRIListPart : curieOrIRIListParts) { |
| curieOrIRI = resolveCURIEOrIRI(curieORIRIListPart, termAllowed); |
| if(curieOrIRI != null && curieOrIRI instanceof IRI) { |
| result.add((IRI) curieOrIRI); |
| } else { |
| reportError(n, String.format(Locale.ROOT, "Invalid CURIE '%s' : expected IRI, found BNode.", curieORIRIListPart)); |
| } |
| } |
| return result.toArray(new IRI[result.size()]); |
| } |
| |
| /** |
| * Resolves a IRI string as IRI. |
| * |
| * @param iriStr (partial) IRI string to be resolved. |
| * @return the resolved IRI. |
| */ |
| protected IRI resolveIRI(String iriStr) { |
| return |
| isAbsoluteIRI(iriStr) |
| ? |
| RDFUtils.iri(iriStr) |
| : |
| RDFUtils.iri( this.documentBase.toExternalForm(), iriStr ); |
| } |
| |
| /** |
| * Resolves a <i>CURIE</i> or <i>IRI</i> string. |
| * |
| * @param curieOrIRI individual of CURIE/URI to resolve |
| * @param termAllowed if <code>true</code> the resolution can be a term. |
| * @return the resolved resource. |
| */ |
| protected Resource resolveCURIEOrIRI(String curieOrIRI, boolean termAllowed) { |
| if( isCURIE(curieOrIRI) ) { |
| return resolveNamespacedIRI(curieOrIRI.substring(1, curieOrIRI.length() - 1), ResolutionPolicy.NSRequired); |
| } |
| if(isAbsoluteIRI(curieOrIRI)) |
| return resolveIRI(curieOrIRI); |
| return resolveNamespacedIRI( |
| curieOrIRI, |
| termAllowed ? ResolutionPolicy.TermAllowed : ResolutionPolicy.NSNotRequired |
| ); |
| } |
| |
| /** |
| * Pushes a context whiting the evaluation context stack, associated to tha given generation node. |
| * |
| * @param current |
| * @param ec |
| */ |
| private void pushContext(Node current, EvaluationContext ec) { |
| ec.node = current; |
| evaluationContextStack.push(ec); |
| } |
| |
| /** |
| * @return the peek evaluation context. |
| */ |
| private EvaluationContext getContext() { |
| return evaluationContextStack.peek(); |
| } |
| |
| /** |
| * Pops out the peek evaluation context if ancestor of current node. |
| * |
| * @param current current node. |
| */ |
| private void popContext(Node current) { |
| final Node peekNode = evaluationContextStack.peek().node; |
| if(DomUtils.isAncestorOf(peekNode, current)) { |
| evaluationContextStack.pop(); |
| } |
| } |
| |
| /** |
| * Pushes a new vocabulary definition. |
| * |
| * @param currentNode node proving the vocabulary. |
| * @param vocab the vocabulary IRI. |
| */ |
| private void pushVocabulary(Node currentNode, IRI vocab) { |
| vocabularyStack.push( new Vocabulary(currentNode, vocab) ); |
| } |
| |
| /** |
| * @return the current peek vocabulary. |
| */ |
| private IRI getVocabulary() { |
| if(vocabularyStack.isEmpty()) |
| return null; |
| return vocabularyStack.peek().prefix; |
| } |
| |
| /** |
| * Pops out the vocabulary definition. |
| * |
| * @param current |
| */ |
| private void popVocabulary(Node current) { |
| if(vocabularyStack.isEmpty()) |
| return; |
| if(DomUtils.isAncestorOf(current, vocabularyStack.peek().originatingNode)) { |
| vocabularyStack.pop(); |
| } |
| } |
| |
| /** |
| * Purge all incomplete triples originated from a node that is descendant of <code>current</code>. |
| * |
| * @param current |
| */ |
| private void purgeIncompleteTriples(Node current) { |
| final List<IncompleteTriple> toBePurged = new ArrayList<>(); |
| for(IncompleteTriple incompleteTriple : listOfIncompleteTriples) { |
| if( DomUtils.isAncestorOf(current, incompleteTriple.originatingNode, true) ) { |
| toBePurged.add(incompleteTriple); |
| } |
| } |
| listOfIncompleteTriples.removeAll(toBePurged); |
| toBePurged.clear(); |
| } |
| |
| /** |
| * Reports an error to the error reporter. |
| * |
| * @param n originating node. |
| * @param msg human readable message. |
| */ |
| private void reportError(Node n, String msg) { |
| final String errorMsg = String.format(Locale.ROOT, |
| "Error while processing node [%s] : '%s'", |
| DomUtils.getXPathForNode(n), msg |
| ); |
| final int[] errorLocation = DomUtils.getNodeLocation(n); |
| this.issueReport.notifyIssue( |
| IssueReport.IssueLevel.WARNING, |
| errorMsg, |
| errorLocation == null ? -1 : errorLocation[0], |
| errorLocation == null ? -1 : errorLocation[1] |
| ); |
| } |
| |
| /** |
| * Performs a <i>deep-first</i> tree visit on the given root node. |
| * |
| * @param node root node. |
| * @param extractionResult |
| */ |
| private void depthFirstNode(Node node, ExtractionResult extractionResult) { |
| try { |
| processNode(node, extractionResult); |
| } catch (Exception e) { |
| if(logger.isDebugEnabled()) |
| logger.debug("Error while processing node.", e); |
| reportError(node, e.getMessage()); |
| } |
| depthFirstChildren(node.getChildNodes(), extractionResult); |
| purgeIncompleteTriples(node); |
| } |
| |
| /** |
| * Performs a <i>deep-first</i> children list visit. |
| * |
| * @param nodeList |
| * @param extractionResult |
| */ |
| private void depthFirstChildren(NodeList nodeList, ExtractionResult extractionResult) { |
| for(int i = 0; i < nodeList.getLength(); i++) { |
| final Node child = nodeList.item(i); |
| depthFirstNode(child, extractionResult); |
| popMappings(child); |
| popVocabulary(child); |
| popContext(child); |
| } |
| } |
| |
| /** |
| * Writes a triple on the extraction result. |
| * |
| * @param s |
| * @param p |
| * @param o |
| * @param extractionResult |
| */ |
| private void writeTriple(Resource s, IRI p, Value o, ExtractionResult extractionResult) { |
| assert s != null : "subject is null."; |
| assert p != null : "predicate is null."; |
| assert o != null : "object is null."; |
| extractionResult.writeTriple(s, p, o); |
| } |
| |
| /** |
| * Processes the current node on the extraction algorithm. |
| * All the steps of this algorithm are annotated with the |
| * specification and section which describes it. The annotation is at form |
| * <em>RDFa<spec-version%gt;[<section>]</em> |
| * |
| * @param currentElement |
| * @param extractionResult |
| * @throws Exception |
| */ |
| // TODO: add references to the RDFa 1.1 algorithm. |
| private void processNode(Node currentElement, ExtractionResult extractionResult) throws Exception { |
| final EvaluationContext currentEvaluationContext = getContext(); |
| try { |
| if( |
| currentElement.getNodeType() != Node.DOCUMENT_NODE |
| && |
| currentElement.getNodeType() != Node.ELEMENT_NODE |
| ) return; |
| |
| // RDFa1.1[7.5.3] |
| updateVocabulary(currentElement); |
| |
| // RDFa1.0[5.5.2] / RDFa1.1[7.5.4] |
| //Node currentElement = node; |
| updateIRIMapping(currentElement); |
| |
| // RDFa1.0[5.5.3] / RDFa1.1[7.5.5] |
| updateLanguage(currentElement, currentEvaluationContext); |
| |
| if(! isRelativeNode(currentElement)) { |
| // RDFa1.0[5.5.4] / RDFa1.1[7.5.6] |
| establishNewSubject(currentElement, currentEvaluationContext); |
| } else { |
| // RDFa1.0[5.5.5] / RDFa1.1[7.5.7] |
| establishNewSubjectCurrentObjectResource( |
| currentElement, |
| currentEvaluationContext |
| ); |
| } |
| |
| /* |
| if(currentEvaluationContext.newSubject == null) { |
| currentEvaluationContext.newSubject = resolveIRI(documentBase.toExternalForm()); |
| } |
| assert currentEvaluationContext.newSubject != null : "newSubject must be not null."; |
| */ |
| if(currentEvaluationContext.newSubject == null) |
| return; |
| if(logger.isDebugEnabled()) |
| logger.debug("newSubject: " + currentEvaluationContext.newSubject); |
| |
| // RDFa1.0[5.5.6] / RDFa1.1[7.5.8] |
| final IRI[] types = getTypes(currentElement); |
| for(IRI type : types) { |
| writeTriple(currentEvaluationContext.newSubject, RDF.TYPE, type, extractionResult); |
| } |
| |
| // RDFa1.0[5.5.7] / RDFa1.1[7.5.9] |
| final IRI[] rels = getRels(currentElement); |
| final IRI[] revs = getRevs(currentElement); |
| if(currentEvaluationContext.currentObjectResource != null) { |
| for (IRI rel : rels) { |
| writeTriple( |
| currentEvaluationContext.newSubject, |
| rel, |
| currentEvaluationContext.currentObjectResource, |
| extractionResult |
| ); |
| } |
| for (IRI rev : revs) { |
| writeTriple( |
| currentEvaluationContext.currentObjectResource, |
| rev, |
| currentEvaluationContext.newSubject, extractionResult |
| ); |
| } |
| } else { // RDFa1.0[5.5.8] / RDFa1.1[7.5.10] |
| for(IRI rel : rels) { |
| listOfIncompleteTriples.add( |
| new IncompleteTriple( |
| currentElement, |
| currentEvaluationContext.newSubject, |
| rel, |
| IncompleteTripleDirection.Forward |
| ) |
| ); |
| } |
| for(IRI rev : revs) { |
| listOfIncompleteTriples.add( |
| new IncompleteTriple( |
| currentElement, |
| currentEvaluationContext.newSubject, |
| rev, |
| IncompleteTripleDirection.Reverse |
| ) |
| ); |
| } |
| } |
| |
| // RDFa1.0[5.5.9] / RDFa1.1[7.5.11] |
| final Value currentObject = getCurrentObject(currentElement); |
| final IRI[] predicates = getPredicate(currentElement); |
| if (currentObject != null && predicates != null) { |
| for (IRI predicate : predicates) { |
| writeTriple(currentEvaluationContext.newSubject, predicate, currentObject, extractionResult); |
| } |
| } |
| |
| // RDFa1.0[5.5.10] / RDFa1.1[7.5.12] |
| if(!currentEvaluationContext.skipElem && currentEvaluationContext.newSubject != null) { |
| for (IncompleteTriple incompleteTriple : listOfIncompleteTriples) { |
| incompleteTriple.produceTriple( |
| currentElement, |
| currentEvaluationContext.newSubject, |
| extractionResult |
| ); |
| } |
| } |
| } catch (Exception e) { |
| throw e; |
| } finally { |
| // RDFa1.0[5.5.11] / RDFa1.1[7.5.13] |
| if(currentEvaluationContext.recourse) { |
| EvaluationContext newEvaluationContext = new EvaluationContext(currentEvaluationContext.base); |
| if(currentEvaluationContext.skipElem) { |
| newEvaluationContext.language = currentEvaluationContext.language; |
| } else { |
| newEvaluationContext.base = currentEvaluationContext.base; |
| |
| if(currentEvaluationContext.newSubject != null) { |
| newEvaluationContext.parentSubject = currentEvaluationContext.newSubject; |
| } else { |
| newEvaluationContext.parentSubject = currentEvaluationContext.parentSubject; |
| } |
| |
| if(currentEvaluationContext.currentObjectResource != null) { |
| newEvaluationContext.parentObject = currentEvaluationContext.currentObjectResource; |
| } else if(currentEvaluationContext.newSubject != null) { |
| newEvaluationContext.parentObject = currentEvaluationContext.newSubject; |
| } else { |
| newEvaluationContext.parentObject = currentEvaluationContext.parentSubject; |
| } |
| |
| newEvaluationContext.language = currentEvaluationContext.language; |
| } |
| pushContext(currentElement, newEvaluationContext); |
| } |
| } |
| } |
| |
| /** |
| * Extract IRI namespaces (prefixes) from the current node. |
| * |
| * @param node |
| * @param prefixMapList |
| */ |
| private void extractPrefixes(Node node, List<PrefixMap> prefixMapList) { |
| final String prefixAttribute = DomUtils.readAttribute(node, PREFIX_ATTRIBUTE, null); |
| if(prefixAttribute == null) |
| return; |
| final String[] prefixParts = extractPrefixSections(prefixAttribute); |
| for(String prefixPart : prefixParts) { |
| int splitPoint = prefixPart.indexOf(IRI_PREFIX_SEPARATOR); |
| final String prefix = prefixPart.substring(0, splitPoint); |
| if(prefix.length() == 0) { |
| reportError(node, String.format(Locale.ROOT, "Invalid prefix length in prefix attribute '%s'", prefixAttribute)); |
| continue; |
| } |
| final IRI iri; |
| final String iriStr = prefixPart.substring(splitPoint + 1); |
| try { |
| iri = resolveIRI(iriStr); |
| } catch (Exception e) { |
| reportError( |
| node, |
| String.format(Locale.ROOT, |
| "Resolution of prefix '%s' defines an invalid IRI: '%s'", |
| prefixAttribute, iriStr |
| ) |
| ); |
| continue; |
| } |
| prefixMapList.add( new PrefixMap(prefix, iri) ); |
| } |
| } |
| |
| /** |
| * Updates the current language. |
| * |
| * @param node |
| * @param currentEvaluationContext |
| */ |
| private void updateLanguage(Node node, EvaluationContext currentEvaluationContext) { |
| final String candidateLanguage = DomUtils.readAttribute(node, XML_LANG_ATTRIBUTE, null); |
| if(candidateLanguage != null) |
| currentEvaluationContext.language = candidateLanguage; |
| } |
| |
| /** |
| * Establish the new subject for the current recursion. |
| * See <i>RDFa 1.0 Specification section 5.5.4</i>, <i>RDFa 1.1 Specification section 7.5.6</i>. |
| * |
| * @param node |
| * @param currentEvaluationContext |
| * @throws URISyntaxException |
| */ |
| private void establishNewSubject(Node node, EvaluationContext currentEvaluationContext) |
| throws URISyntaxException { |
| String candidateIRIOrCURIE; |
| for(String subjectAttribute : SUBJECT_ATTRIBUTES) { |
| candidateIRIOrCURIE = DomUtils.readAttribute(node, subjectAttribute, null); |
| if(candidateIRIOrCURIE != null) { |
| currentEvaluationContext.newSubject = resolveCURIEOrIRI(candidateIRIOrCURIE, false); |
| return; |
| } |
| } |
| |
| if(node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) { |
| currentEvaluationContext.newSubject = resolveIRI(currentEvaluationContext.base.toString()); |
| return; |
| } |
| |
| if(DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) { |
| currentEvaluationContext.newSubject = RDFUtils.bnode(); |
| return; |
| } |
| |
| if(DomUtils.hasAttribute(node, PROPERTY_ATTRIBUTE)) { |
| currentEvaluationContext.skipElem = true; |
| } |
| if(currentEvaluationContext.parentObject != null) { |
| currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject; |
| return; |
| } |
| |
| currentEvaluationContext.newSubject = null; |
| } |
| |
| /** |
| * Establishes the new subject and the current object resource. |
| * |
| * See <i>RDFa 1.0 Specification section 5.5.5</i>, <i>RDFa 1.1 Specification section 7.5.7</i>. |
| * |
| * @param node |
| * @param currentEvaluationContext |
| * @throws URISyntaxException |
| */ |
| private void establishNewSubjectCurrentObjectResource(Node node, EvaluationContext currentEvaluationContext) |
| throws URISyntaxException { |
| // Subject. |
| String candidateIRIOrCURIE; |
| candidateIRIOrCURIE = DomUtils.readAttribute(node, ABOUT_ATTRIBUTE, null); |
| if(candidateIRIOrCURIE != null) { |
| currentEvaluationContext.newSubject = resolveCURIEOrIRI(candidateIRIOrCURIE, false); |
| } else { |
| candidateIRIOrCURIE = DomUtils.readAttribute(node, SRC_ATTRIBUTE, null); |
| if (candidateIRIOrCURIE != null) { |
| currentEvaluationContext.newSubject = resolveIRI(candidateIRIOrCURIE); |
| } else { |
| if (node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) { |
| currentEvaluationContext.newSubject = resolveIRI(currentEvaluationContext.base.toString()); |
| } else { |
| if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) { |
| currentEvaluationContext.newSubject = RDFUtils.bnode(); |
| } else { |
| if (currentEvaluationContext.parentObject != null) { |
| currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject; |
| } |
| } |
| } |
| } |
| } |
| |
| // Object. |
| candidateIRIOrCURIE = DomUtils.readAttribute(node, RESOURCE_ATTRIBUTE, null); |
| if(candidateIRIOrCURIE != null) { |
| currentEvaluationContext.currentObjectResource = resolveCURIEOrIRI(candidateIRIOrCURIE, false); |
| return; |
| } |
| |
| candidateIRIOrCURIE = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null); |
| if(candidateIRIOrCURIE != null) { |
| currentEvaluationContext.currentObjectResource = resolveIRI(candidateIRIOrCURIE); |
| return; |
| } |
| currentEvaluationContext.currentObjectResource = null; |
| } |
| |
| private IRI[] getTypes(Node node) throws URISyntaxException { |
| final String typeOf = DomUtils.readAttribute(node, TYPEOF_ATTRIBUTE, null); |
| return resolveCIRIeOrIRIList(node, typeOf, true); |
| } |
| |
| private IRI[] getRels(Node node) throws URISyntaxException { |
| final String rel = DomUtils.readAttribute(node, REL_ATTRIBUTE, null); |
| return resolveCIRIeOrIRIList(node, rel, true); |
| } |
| |
| private IRI[] getRevs(Node node) throws URISyntaxException { |
| final String rev = DomUtils.readAttribute(node, REV_ATTRIBUTE, null); |
| return resolveCIRIeOrIRIList(node, rev, true); |
| } |
| |
| private IRI[] getPredicate(Node node) throws URISyntaxException { |
| final String candidateIRI = DomUtils.readAttribute(node, PROPERTY_ATTRIBUTE, null); |
| if(candidateIRI == null) |
| return null; |
| return resolveCIRIeOrIRIList(node, candidateIRI, true); |
| } |
| |
| /** |
| * Establishes the new object value. |
| * See <i>RDFa 1.0 Specification section 5.5.9</i>, <i>RDFa 1.1 Specification section 7.5.11</i>. |
| * |
| * @param node |
| * @return |
| * @throws URISyntaxException |
| * @throws IOException |
| * @throws TransformerException |
| */ |
| private Value getCurrentObject(Node node) |
| throws URISyntaxException, IOException, TransformerException { |
| final String candidateObject = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null); |
| if(candidateObject != null) { |
| return resolveIRI(candidateObject); |
| } else { |
| return gerCurrentObjectLiteral(node); |
| } |
| } |
| |
| private Literal gerCurrentObjectLiteral(Node node) |
| throws URISyntaxException, IOException, TransformerException { |
| final EvaluationContext currentEvaluationContext = getContext(); |
| Literal literal; |
| |
| literal = getAsTypedLiteral(node); |
| if(literal != null) |
| return literal; |
| |
| literal = getAsXMLLiteral(node); |
| if(literal != null) { |
| currentEvaluationContext.recourse = false; |
| return literal; |
| } |
| |
| literal = getAsPlainLiteral(node, currentEvaluationContext.language); |
| if(literal != null) |
| return literal; |
| |
| return null; |
| } |
| |
| private static String getNodeContent(Node node) { |
| final String candidateContent = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null); |
| if(candidateContent != null) |
| return candidateContent; |
| return node.getTextContent(); |
| } |
| |
| /** |
| * Extracts the current typed literal from the given node. |
| * See <i>RDFa 1.0 Specification section 5.5.9.1</i>. |
| * |
| * @param node |
| * @return |
| * @throws URISyntaxException |
| */ |
| private Literal getAsTypedLiteral(Node node) throws URISyntaxException { |
| final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null); |
| if (datatype == null || datatype.trim().length() == 0 || XML_LITERAL_DATATYPE.equals(datatype.trim()) ) { |
| return null; |
| } |
| final Resource curieOrIRI = resolveCURIEOrIRI(datatype, true); |
| return RDFUtils.literal(getNodeContent(node), curieOrIRI instanceof IRI ? (IRI) curieOrIRI : null); |
| } |
| |
| private void pushMappings(Node sourceNode, List<PrefixMap> prefixMapList) { |
| |
| final Map<String, IRI> mapping = new HashMap<>(); |
| for (PrefixMap prefixMap : prefixMapList) { |
| mapping.put(prefixMap.prefix, prefixMap.IRI); |
| } |
| IRIMappingStack.push( new IRIMapping(sourceNode, mapping) ); |
| } |
| |
| private void popMappings(Node node) { |
| if(IRIMappingStack.isEmpty()) |
| return; |
| final IRIMapping peek = IRIMappingStack.peek(); |
| if( ! DomUtils.isAncestorOf(peek.sourceNode, node) ) { |
| IRIMappingStack.pop(); |
| } |
| } |
| |
| /** |
| * Resolve a namespaced IRI, if <code>safe</code> is <code>true</code> |
| * then the mapping must define a prefix, otherwise it is considered relative. |
| * |
| * @param mapping |
| * @param resolutionPolicy |
| * @return |
| */ |
| private Resource resolveNamespacedIRI(String mapping, ResolutionPolicy resolutionPolicy) { |
| if(mapping.indexOf(IRI_PATH_SEPARATOR) == 0) { // Begins with '/' |
| mapping = mapping.substring(1); |
| } |
| |
| final int prefixSeparatorIndex = mapping.indexOf(':'); |
| if(prefixSeparatorIndex == -1) { // there is no prefix separator. |
| if(resolutionPolicy == ResolutionPolicy.NSRequired) { |
| throw new IllegalArgumentException( |
| String.format(Locale.ROOT, "Invalid mapping string [%s], must declare a prefix.", mapping) |
| ); |
| } |
| if (resolutionPolicy == ResolutionPolicy.TermAllowed) { |
| final IRI currentVocabulary = getVocabulary(); |
| // Mapping is a TERM. |
| if (currentVocabulary != null) { |
| return resolveIRI(currentVocabulary.toString() + mapping); |
| } |
| } |
| return resolveIRI(documentBase.toString() + mapping); |
| } |
| |
| final String prefix = mapping.substring(0, prefixSeparatorIndex); |
| final IRI curieMapping = getMapping(prefix); |
| if(curieMapping == null) { |
| throw new IllegalArgumentException( String.format(Locale.ROOT, "Cannot map prefix '%s'", prefix) ); |
| } |
| final String candidateCURIEStr = curieMapping.toString() + mapping.substring(prefixSeparatorIndex + 1); |
| final java.net.URI candidateCURIE; |
| try { |
| candidateCURIE = new java.net.URI(candidateCURIEStr); |
| } catch (URISyntaxException IRIse) { |
| throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid CURIE '%s'", candidateCURIEStr) ); |
| } |
| return resolveIRI( |
| candidateCURIE.isAbsolute() |
| ? |
| candidateCURIE.toString() |
| : |
| documentBase.toString() + candidateCURIE.toString() |
| ); |
| } |
| |
/**
 * The resolution policy provided to the method {@link #resolveNamespacedIRI(String, ResolutionPolicy)}.
 * It determines how a string which does not contain a prefix separator is handled.
 */
enum ResolutionPolicy {

    /** A missing prefix is accepted: the string is resolved against the document base. */
    NSNotRequired,

    /** A prefix is mandatory: a string without prefix is rejected with an exception. */
    NSRequired,

    /**
     * A string without prefix is a term: it is resolved against the current
     * vocabulary if one is defined, otherwise against the document base.
     */
    TermAllowed
}
| |
| /** |
| * Defines an evaluation context. |
| */ |
| private class EvaluationContext { |
| private Node node; |
| private URL base; |
| private Resource parentSubject; |
| private Value parentObject; |
| private String language; |
| private boolean recourse; |
| private boolean skipElem; |
| private Resource newSubject; |
| private Resource currentObjectResource; |
| |
| /** |
| * Sections <em>RDFa1.0[5.5]</em>, <em>RDFa1.0[5.5.1]</em>, <em>RDFa1.1[7.5.1]</em> . |
| * |
| * @param base |
| */ |
| EvaluationContext(URL base) { |
| this.base = base; |
| this.parentSubject = resolveIRI( base.toExternalForm() ); |
| this.parentObject = null; |
| this.language = null; |
| this.recourse = true; |
| this.skipElem = false; |
| this.newSubject = null; |
| this.currentObjectResource = null; |
| } |
| } |
| |
| /** |
| * Defines a prefix mapping. |
| */ |
| private class PrefixMap { |
| final String prefix; |
| final IRI IRI; |
| public PrefixMap(String prefix, IRI IRI) { |
| this.prefix = prefix; |
| this.IRI = IRI; |
| } |
| } |
| |
| /** |
| * Defines a IRI mapping. |
| */ |
| private class IRIMapping { |
| final Node sourceNode; |
| final Map<String, IRI> map; |
| |
| public IRIMapping(Node sourceNode, Map<String, IRI> map) { |
| this.sourceNode = sourceNode; |
| this.map = map; |
| } |
| } |
| |
| /** |
| * Defines the direction of an {@link IncompleteTriple}. |
| */ |
| private enum IncompleteTripleDirection { |
| Forward, |
| Reverse |
| } |
| |
| /** |
| * Defines an incomplete triple. |
| */ |
| private class IncompleteTriple { |
| final Node originatingNode; |
| final Resource subject; |
| final IRI predicate; |
| final IncompleteTripleDirection direction; |
| |
| public IncompleteTriple( |
| Node originatingNode, |
| Resource subject, |
| IRI predicate, |
| IncompleteTripleDirection direction |
| ) { |
| if(originatingNode == null || subject == null || predicate == null || direction == null) |
| throw new IllegalArgumentException(); |
| |
| this.originatingNode = originatingNode; |
| this.subject = subject; |
| this.predicate = predicate; |
| this.direction = direction; |
| } |
| |
| public boolean produceTriple(Node resourceNode, Resource r, ExtractionResult extractionResult) { |
| if( ! DomUtils.isAncestorOf(originatingNode, resourceNode, true) ) |
| return false; |
| |
| if(r == null) |
| throw new IllegalArgumentException(); |
| switch (direction) { |
| case Forward: |
| extractionResult.writeTriple(subject, predicate, r); |
| break; |
| case Reverse: |
| extractionResult.writeTriple(r, predicate, subject); |
| break; |
| default: |
| throw new IllegalStateException(); |
| } |
| return true; |
| } |
| } |
| |
| /** |
| * Defines a vocabulary object. |
| */ |
| private class Vocabulary { |
| final Node originatingNode; |
| final IRI prefix; |
| |
| public Vocabulary(Node originatingNode, IRI prefix) { |
| this.originatingNode = originatingNode; |
| this.prefix = prefix; |
| } |
| } |
| |
| } |