| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.any23.extractor.microdata; |
| |
| import org.apache.any23.extractor.html.DomUtils; |
| import org.apache.any23.rdf.RDFUtils; |
| import org.apache.commons.lang3.StringUtils; |
| import org.eclipse.rdf4j.model.IRI; |
| import org.eclipse.rdf4j.model.Literal; |
| import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil; |
| import org.eclipse.rdf4j.model.vocabulary.XMLSchema; |
| import org.jsoup.parser.Tag; |
| import org.w3c.dom.Document; |
| import org.w3c.dom.Element; |
| import org.w3c.dom.NamedNodeMap; |
| import org.w3c.dom.Node; |
| import org.w3c.dom.NodeList; |
| import org.w3c.dom.traversal.DocumentTraversal; |
| import org.w3c.dom.traversal.NodeFilter; |
| import org.w3c.dom.traversal.TreeWalker; |
| |
| import java.io.PrintStream; |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.LinkedHashSet; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.stream.Collectors; |
| |
| /** |
| * This class provides utility methods for handling <b>Microdata</b> |
| * nodes contained within a <i>DOM</i> document. |
| * |
| * @author Michele Mostarda (mostarda@fbk.eu) |
| * @author Hans Brende (hansbrende@apache.org) |
| */ |
| public class MicrodataParser { |
| |
/**
 * Policy the parser follows when it meets an error.
 */
enum ErrorMode {

    /** Fail fast: the first error encountered is thrown to the caller. */
    STOP_AT_FIRST_ERROR,

    /** Keep going: every error encountered is collected into the report. */
    FULL_REPORT

}
| |
| private final Document document; |
| |
| /** |
| * This set holds the name of properties being dereferenced. |
| * The {@link #deferProperties(String...)} checks first if the |
| * required dereference has been already asked, if so raises |
| * a loop detection error. This map works in coordination |
| * with {@link #dereferenceRecursionCounter}, so that at the end of |
| * {@link #deferProperties(String...)} call recursion the |
| * loopDetectorSet can be cleaned up. |
| */ |
| private final Set<String> loopDetectorSet = new HashSet<>(); |
| |
| /** |
| * {@link ItemScope} cache. |
| */ |
| private final Map<Node,ItemScope> itemScopes = new HashMap<>(); |
| |
| /** |
| * {@link ItemPropValue} cache. |
| */ |
| private final Map<Node, ItemPropValue> itemPropValues = new HashMap<>(); |
| |
| /** |
| * Counts the recursive call of {@link #deferProperties(String...)}. |
| * It helps to cleanup the {@link #loopDetectorSet} when recursion ends. |
| */ |
| private int dereferenceRecursionCounter = 0; |
| |
| /** |
| * Current error mode. |
| */ |
| private ErrorMode errorMode = ErrorMode.FULL_REPORT; |
| |
| /** |
| * List of collected errors. Used when {@link #errorMode} <code>==</code> {@link ErrorMode#FULL_REPORT}. |
| */ |
| private final List<MicrodataParserException> errors = new ArrayList<>(); |
| |
| public static final String ITEMSCOPE_ATTRIBUTE = "itemscope"; |
| public static final String ITEMPROP_ATTRIBUTE = "itemprop"; |
| private static final String REVERSE_ITEMPROP_ATTRIBUTE = "itemprop-reverse"; |
| |
| /** |
| * List of tags providing the <code>src</code> property. |
| */ |
| public static final Set<String> SRC_TAGS = Collections.unmodifiableSet( |
| new HashSet<String>( Arrays.asList("audio", "embed", "frame", "iframe", "img", |
| "source", "track", "video", "input", "layer", "script", "textarea") ) |
| ); |
| |
| /** |
| * List of tags providing the <code>href</code> property. |
| */ |
| public static final Set<String> HREF_TAGS = Collections.unmodifiableSet( |
| new HashSet<String>( Arrays.asList("a", "area", "link") ) |
| ); |
| |
/**
 * Creates a parser bound to the given DOM document.
 *
 * @param document document to extract Microdata from, must not be <code>null</code>.
 * @throws NullPointerException if <code>document</code> is <code>null</code>.
 */
public MicrodataParser(Document document) {
    if (null == document) {
        throw new NullPointerException("Document cannot be null.");
    }
    this.document = document;
}
| |
| /** |
| * Returns all the <i>itemScope</i>s detected within the given root node. |
| * |
| * @param node root node to search in. |
| * @return list of detected items. |
| */ |
| public static List<Node> getItemScopeNodes(Node node) { |
| return DomUtils.findAllByAttributeName(node, ITEMSCOPE_ATTRIBUTE); |
| } |
| |
| /** |
| * Check whether a node is an <i>itemScope</i>. |
| * |
| * @param node node to check. |
| * @return <code>true</code> if the node is an <i>itemScope</i>., <code>false</code> otherwise. |
| */ |
| public static boolean isItemScope(Node node) { |
| return DomUtils.readAttribute(node, ITEMSCOPE_ATTRIBUTE, null) != null; |
| } |
| |
| /** |
| * Returns all the <i>itemProp</i>s detected within the given root node. |
| * |
| * @param node root node to search in. |
| * @return list of detected items. |
| */ |
| public static List<Node> getItemPropNodes(Node node) { |
| return DomUtils.findAllByAttributeName(node, ITEMPROP_ATTRIBUTE); |
| } |
| |
| /** |
| * Check whether a node is an <i>itemProp</i>. |
| * |
| * @param node node to check. |
| * @return <code>true</code> if the node is an <i>itemProp</i>., <code>false</code> otherwise. |
| */ |
| public static boolean isItemProp(Node node) { |
| return DomUtils.readAttribute(node, ITEMPROP_ATTRIBUTE, null) != null; |
| } |
| |
| private static boolean isContainedInItemScope(Node node) { |
| for (Node p = node.getParentNode(); p != null; p = p.getParentNode()) { |
| NamedNodeMap attrs = p.getAttributes(); |
| if (attrs != null && attrs.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| private static boolean isContainedInId(Node node, Set<String> ids) { |
| do { |
| String id = DomUtils.readAttribute(node, "id", null); |
| if (id != null && ids.contains(id)) { |
| return true; |
| } |
| node = node.getParentNode(); |
| } while (node != null); |
| return false; |
| } |
| |
| /** |
| * Returns only the <i>itemScope</i>s that are top level items. |
| * |
| * @param node root node to search in. |
| * @return list of detected top item scopes. |
| */ |
| public static List<Node> getTopLevelItemScopeNodes(Node node) { |
| final List<Node> itemScopes = getItemScopeNodes(node); |
| final List<Node> topLevelItemScopes = new ArrayList<>(); |
| final List<Node> possibles = new ArrayList<>(); |
| for (Node itemScope : itemScopes) { |
| if (!isItemProp(itemScope) |
| && DomUtils.readAttribute(itemScope, REVERSE_ITEMPROP_ATTRIBUTE, null) == null) { |
| topLevelItemScopes.add(itemScope); |
| } else if (!isContainedInItemScope(itemScope)) { |
| possibles.add(itemScope); |
| } |
| } |
| |
| if (!possibles.isEmpty()) { |
| Set<String> refIds = itemScopes.stream() |
| .flatMap(n -> Arrays.stream(itemrefIds(n))) |
| .collect(Collectors.toSet()); |
| |
| for (Node itemScope : possibles) { |
| if (!isContainedInId(itemScope, refIds)) { |
| topLevelItemScopes.add(itemScope); |
| } |
| } |
| } |
| |
| return topLevelItemScopes; |
| } |
| |
| /** |
| * Returns all the <b>Microdata items</b> detected within the given <code>document</code>. |
| * |
| * @param document document to be processed. |
| * @param errorMode error management policy. |
| * @return list of <b>itemscope</b> items. |
| * @throws MicrodataParserException if |
| * <code>errorMode == {@link org.apache.any23.extractor.microdata.MicrodataParser.ErrorMode#STOP_AT_FIRST_ERROR}</code> |
| * and an error occurs. |
| */ |
| public static MicrodataParserReport getMicrodata(Document document, ErrorMode errorMode) |
| throws MicrodataParserException { |
| final List<Node> itemNodes = getTopLevelItemScopeNodes(document); |
| final List<ItemScope> items = new ArrayList<>(); |
| final MicrodataParser microdataParser = new MicrodataParser(document); |
| microdataParser.setErrorMode(errorMode); |
| for(Node itemNode : itemNodes) { |
| items.add( microdataParser.getItemScope(itemNode) ); |
| } |
| return new MicrodataParserReport( |
| items.toArray( new ItemScope[items.size()] ), |
| microdataParser.getErrors() |
| ); |
| } |
| |
| /** |
| * Returns all the <b>Microdata items</b> detected within the given <code>document</code>, |
| * works in full report mode. |
| * |
| * @param document document to be processed. |
| * @return list of <b>itemscope</b> items. |
| */ |
| public static MicrodataParserReport getMicrodata(Document document) { |
| try { |
| return getMicrodata(document, ErrorMode.FULL_REPORT); |
| } catch (MicrodataParserException mpe) { |
| throw new IllegalStateException("Unexpected exception.", mpe); |
| } |
| } |
| |
| /** |
| * Returns a <i>JSON</i> containing the list of all extracted Microdata, |
| * as described at <a href="http://www.w3.org/TR/microdata/#json">Microdata JSON Specification</a>. |
| * |
| * @param document document to be processed. |
| * @param ps the {@link java.io.PrintStream} to write JSON to |
| */ |
| public static void getMicrodataAsJSON(Document document, PrintStream ps) { |
| final MicrodataParserReport report = getMicrodata(document); |
| final ItemScope[] itemScopes = report.getDetectedItemScopes(); |
| final MicrodataParserException[] errors = report.getErrors(); |
| |
| ps.append("{ "); |
| |
| // Results. |
| ps.append("\"result\" : ["); |
| for(int i = 0; i < itemScopes.length; i++) { |
| if (i > 0) { |
| ps.print(", "); |
| } |
| ps.print( itemScopes[i].toJSON() ); |
| } |
| ps.append("] "); |
| |
| // Errors. |
| if(errors != null && errors.length > 0) { |
| ps.append(", "); |
| ps.append("\"errors\" : ["); |
| for (int i = 0; i < errors.length; i++) { |
| if (i > 0) { |
| ps.print(", "); |
| } |
| ps.print( errors[i].toJSON() ); |
| } |
| ps.append("] "); |
| } |
| |
| ps.append("}"); |
| } |
| |
| public void setErrorMode(ErrorMode errorMode) { |
| if(errorMode == null) |
| throw new IllegalArgumentException("errorMode must be not null."); |
| this.errorMode = errorMode; |
| } |
| |
| public ErrorMode getErrorMode() { |
| return this.errorMode; |
| } |
| |
| public MicrodataParserException[] getErrors() { |
| return errors == null |
| ? |
| new MicrodataParserException[0] |
| : |
| errors.toArray( new MicrodataParserException[errors.size()] ); |
| } |
| |
| /** |
| * Reads the value of a <b>itemprop</b> node. |
| * |
| * @param node itemprop node. |
| * @return value detected within the given <code>node</code>. |
| * @throws MicrodataParserException if an error occurs while extracting a nested item scope. |
| */ |
| public ItemPropValue getPropertyValue(Node node) throws MicrodataParserException { |
| final ItemPropValue itemPropValue = itemPropValues.get(node); |
| if (itemPropValue != null) |
| return itemPropValue; |
| |
| if (isItemScope(node)) { |
| return new ItemPropValue( getItemScope(node), ItemPropValue.Type.Nested); |
| } |
| |
| final String nodeName = node.getNodeName().toLowerCase(Locale.ROOT); |
| |
| //see http://w3c.github.io/microdata-rdf/#dfn-property-values |
| if ("data".equals(nodeName) || "meter".equals(nodeName)) { |
| String value = value(node, "value"); |
| Literal l; |
| if (XMLDatatypeUtil.isValidInteger(value)) { |
| l = RDFUtils.literal(value, XMLSchema.INTEGER); |
| } else if (XMLDatatypeUtil.isValidDouble(value)) { |
| l = RDFUtils.literal(value, XMLSchema.DOUBLE); |
| } else { |
| l = RDFUtils.literal(value); |
| } |
| return new ItemPropValue(l); |
| } |
| if ("time".equals(nodeName)) { |
| String dateTimeStr = value(node, "datetime"); |
| Literal l; |
| if (XMLDatatypeUtil.isValidDate(dateTimeStr)) { |
| l = RDFUtils.literal(dateTimeStr, XMLSchema.DATE); |
| } else if (XMLDatatypeUtil.isValidTime(dateTimeStr)) { |
| l = RDFUtils.literal(dateTimeStr, XMLSchema.TIME); |
| } else if (XMLDatatypeUtil.isValidDateTime(dateTimeStr)) { |
| l = RDFUtils.literal(dateTimeStr, XMLSchema.DATETIME); |
| } else if (XMLDatatypeUtil.isValidGYearMonth(dateTimeStr)) { |
| l = RDFUtils.literal(dateTimeStr, XMLSchema.GYEARMONTH); |
| } else if (XMLDatatypeUtil.isValidGYear(dateTimeStr)) { |
| l = RDFUtils.literal(dateTimeStr, XMLSchema.GYEAR); |
| } else if (XMLDatatypeUtil.isValidDuration(dateTimeStr)) { |
| l = RDFUtils.literal(dateTimeStr, XMLSchema.DURATION); |
| } else { |
| l = RDFUtils.literal(dateTimeStr, getLanguage(node)); |
| } |
| return new ItemPropValue(l); |
| } |
| |
| if (SRC_TAGS.contains(nodeName)) { |
| return link(node, "src"); |
| } |
| if (HREF_TAGS.contains(nodeName)) { |
| return link(node, "href"); |
| } |
| |
| if ("object".equals(nodeName)) { |
| return link(node, "data"); |
| } |
| |
| String val = DomUtils.readAttribute(node, "content", null); |
| if (val != null) { |
| return new ItemPropValue(RDFUtils.literal(val, getLanguage(node))); |
| } |
| |
| Literal l = RDFUtils.literal(textContent(node), getLanguage(node)); |
| final ItemPropValue newItemPropValue = new ItemPropValue(l); |
| itemPropValues.put(node, newItemPropValue); |
| return newItemPropValue; |
| } |
| |
| private static String textContent(Node node) { |
| StringBuilder content = new StringBuilder(); |
| appendFormatted(node, content, false); |
| return content.toString(); |
| } |
| |
| private static boolean shouldSeparateWithNewline(CharSequence s0, CharSequence s1) { |
| for (int i = 0, len = s1.length(); i < len; i++) { |
| char ch = s1.charAt(i); |
| if (ch == '\n' || ch == '\r') { |
| return false; |
| } |
| if (!Character.isWhitespace(ch)) { |
| break; |
| } |
| } |
| for (int i = s0.length() - 1; i >= 0; i--) { |
| char ch = s0.charAt(i); |
| if (ch == '\n' || ch == '\r') { |
| return false; |
| } |
| if (!Character.isWhitespace(ch)) { |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| private static boolean appendFormatted(Node node, StringBuilder sb, boolean needsNewline) { |
| switch (node.getNodeType()) { |
| case Node.TEXT_NODE: |
| String text = node.getTextContent(); |
| if (text.isEmpty()) { |
| return needsNewline; |
| } |
| if (needsNewline && shouldSeparateWithNewline(sb, text)) { |
| sb.append('\n'); |
| } |
| sb.append(text); |
| return false; |
| case Node.ELEMENT_NODE: |
| final String nodeName = node.getNodeName().toLowerCase(Locale.ENGLISH); |
| final boolean thisNeedsNewline = "br".equals(nodeName) || Tag.valueOf(nodeName).isBlock(); |
| final NodeList children = node.getChildNodes(); |
| boolean prevChildNeedsNewline = needsNewline || thisNeedsNewline; |
| for (int i = 0, len = children.getLength(); i < len; i++) { |
| prevChildNeedsNewline = appendFormatted(children.item(i), sb, prevChildNeedsNewline); |
| } |
| return prevChildNeedsNewline || thisNeedsNewline; |
| default: |
| return needsNewline; |
| } |
| } |
| |
| private static String content(Node node, String attrName) { |
| NamedNodeMap attributes = node.getAttributes(); |
| if (attributes != null) { |
| Node attr = attributes.getNamedItem("content"); |
| if (attr != null) { |
| return attr.getNodeValue(); |
| } |
| attr = attributes.getNamedItem(attrName); |
| if (attr != null) { |
| return attr.getNodeValue(); |
| } |
| } |
| return null; |
| } |
| |
| private static String value(Node node, String attrName) { |
| String content = content(node, attrName); |
| return StringUtils.stripToEmpty(content != null ? content : node.getTextContent()); |
| } |
| |
| private static ItemPropValue link(Node node, String attrName) { |
| String content = content(node, attrName); |
| return content == null ? new ItemPropValue(RDFUtils.literal("")) |
| : new ItemPropValue(content, ItemPropValue.Type.Link); |
| } |
| |
| //see https://www.w3.org/TR/html52/dom.html#the-lang-and-xmllang-attributes |
| private static String getLanguage(Node node) { |
| String lang; |
| do { |
| lang = DomUtils.readAttribute(node, "xml:lang", null); |
| if (StringUtils.isNotBlank(lang)) { |
| return lang.trim(); |
| } |
| lang = DomUtils.readAttribute(node, "lang", null); |
| if (StringUtils.isNotBlank(lang)) { |
| return lang.trim(); |
| } |
| node = node.getParentNode(); |
| } while (node != null); |
| return null; |
| } |
| |
| /** |
| * Returns all the <b>itemprop</b>s for the given <b>itemscope</b> node. |
| * |
| * @param scopeNode node representing the <b>itemscope</b> |
| * @param skipRoot if <code>true</code> the given root <code>node</code> |
| * will be not read as a property, even if it contains the <b>itemprop</b> attribute. |
| * @return the list of <b>itemprop</b>s detected within the given <b>itemscope</b>. |
| * @throws MicrodataParserException if an error occurs while retrieving an property value. |
| */ |
| public List<ItemProp> getItemProps(final Node scopeNode, boolean skipRoot) throws MicrodataParserException { |
| final Set<Node> accepted = new LinkedHashSet<>(); |
| |
| boolean skipRootChildren = false; |
| if (!skipRoot) { |
| NamedNodeMap attributes = scopeNode.getAttributes(); |
| if (attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null |
| || attributes.getNamedItem(REVERSE_ITEMPROP_ATTRIBUTE) != null) { |
| accepted.add(scopeNode); |
| } |
| if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) { |
| skipRootChildren = true; |
| } |
| } |
| |
| if (!skipRootChildren) { |
| // TreeWalker to walk DOM tree starting with the scopeNode. Nodes maybe visited multiple times. |
| TreeWalker treeWalker = ((DocumentTraversal) scopeNode.getOwnerDocument()) |
| .createTreeWalker(scopeNode, NodeFilter.SHOW_ELEMENT, new NodeFilter() { |
| @Override |
| public short acceptNode(Node node) { |
| if (node.getNodeType() == Node.ELEMENT_NODE) { |
| NamedNodeMap attributes = node.getAttributes(); |
| if ((attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null |
| || attributes.getNamedItem(REVERSE_ITEMPROP_ATTRIBUTE) != null) && scopeNode != node) { |
| accepted.add(node); |
| } |
| |
| if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) { |
| // Don't visit descendants of nodes that define a new scope |
| return FILTER_REJECT; |
| } |
| } |
| return FILTER_ACCEPT; |
| } |
| }, false); |
| |
| |
| // To populate accepted we only need to walk the tree. |
| while (treeWalker.nextNode() != null) ; |
| } |
| |
| final List<ItemProp> result = new ArrayList<>(); |
| for (Node itemPropNode : accepted) { |
| final String itemProp = DomUtils.readAttribute(itemPropNode, ITEMPROP_ATTRIBUTE, null); |
| final String reverseProp = DomUtils.readAttribute(itemPropNode, REVERSE_ITEMPROP_ATTRIBUTE, null); |
| |
| boolean hasItemProp = StringUtils.isNotBlank(itemProp); |
| boolean hasReverseProp = StringUtils.isNotBlank(reverseProp); |
| |
| if (!hasItemProp && !hasReverseProp) { |
| manageError(new MicrodataParserException("invalid property name '" + itemProp + "'", itemPropNode)); |
| continue; |
| } |
| |
| ItemPropValue itemPropValue; |
| try { |
| itemPropValue = getPropertyValue(itemPropNode); |
| } catch (MicrodataParserException mpe) { |
| manageError(mpe); |
| continue; |
| } |
| if (hasItemProp) { |
| for (String propertyName : itemProp.trim().split("\\s+")) { |
| result.add( |
| new ItemProp( |
| DomUtils.getXPathForNode(itemPropNode), |
| propertyName, |
| itemPropValue, |
| false |
| ) |
| ); |
| } |
| } |
| if (hasReverseProp) { |
| if (itemPropValue.literal != null) { |
| manageError(new MicrodataParserException(REVERSE_ITEMPROP_ATTRIBUTE |
| + " cannot point to a literal", itemPropNode)); |
| continue; |
| } |
| for (String propertyName : reverseProp.trim().split("\\s+")) { |
| result.add( |
| new ItemProp( |
| DomUtils.getXPathForNode(itemPropNode), |
| propertyName, |
| itemPropValue, |
| true |
| ) |
| ); |
| } |
| } |
| } |
| return result; |
| } |
| |
| /** |
| * Given a document and a list of <b>itemprop</b> names this method will return |
| * such <b>itemprops</b>. |
| * |
| * @param refs list of references. |
| * @return list of retrieved <b>itemprop</b>s. |
| * @throws MicrodataParserException if a loop is detected or a property name is missing. |
| */ |
| public ItemProp[] deferProperties(String... refs) throws MicrodataParserException { |
| Document document = this.document; |
| dereferenceRecursionCounter++; |
| final List<ItemProp> result = new ArrayList<>(); |
| try { |
| for (String ref : refs) { |
| if (loopDetectorSet.contains(ref)) { |
| throw new MicrodataParserException( |
| String.format(Locale.ROOT, |
| "Loop detected with depth %d while dereferencing itemProp '%s' .", |
| dereferenceRecursionCounter - 1, ref |
| ), |
| null |
| ); |
| } |
| loopDetectorSet.add(ref); |
| Element element = document.getElementById(ref); |
| if (element == null) { |
| manageError( |
| new MicrodataParserException( String.format(Locale.ROOT, "Unknown itemProp id '%s'", ref ), null ) |
| ); |
| continue; |
| } |
| result.addAll(getItemProps(element, false)); |
| } |
| } catch (MicrodataParserException mpe) { |
| if(dereferenceRecursionCounter == 1) |
| manageError(mpe); |
| else throw mpe; // Recursion end, this the the top call. |
| } finally { |
| dereferenceRecursionCounter--; |
| if(dereferenceRecursionCounter == 0) { // Recursion end, this the the top call. |
| loopDetectorSet.clear(); |
| } |
| } |
| return result.toArray( new ItemProp[result.size()] ); |
| } |
| |
| private static final String[] EMPTY_STRINGS = new String[0]; |
| private static String[] itemrefIds(Node node) { |
| String itemref = DomUtils.readAttribute(node, "itemref" , null); |
| return StringUtils.isBlank(itemref) ? EMPTY_STRINGS : itemref.trim().split("\\s+"); |
| } |
| |
| /** |
| * Returns the {@link ItemScope} instance described within the specified <code>node</code>. |
| * |
| * @param node node describing an <i>itemscope</i>. |
| * @return instance of ItemScope object. |
| * @throws MicrodataParserException if an error occurs while dereferencing properties. |
| */ |
| public ItemScope getItemScope(Node node) throws MicrodataParserException { |
| final ItemScope itemScope = itemScopes.get(node); |
| if(itemScope != null) |
| return itemScope; |
| |
| final String id = DomUtils.readAttribute(node, "id" , null); |
| final String itemType = DomUtils.readAttribute(node, "itemtype", null); |
| final String itemId = DomUtils.readAttribute(node, "itemid" , null); |
| |
| final List<ItemProp> itemProps = getItemProps(node, true); |
| final String[] itemrefIDs = itemrefIds(node); |
| final ItemProp[] deferredProperties; |
| try { |
| deferredProperties = deferProperties(itemrefIDs); |
| } catch (MicrodataParserException mpe) { |
| mpe.setErrorNode(node); |
| throw mpe; |
| } |
| for(ItemProp deferredProperty : deferredProperties) { |
| if( itemProps.contains(deferredProperty) ) { |
| manageError( |
| new MicrodataParserException( |
| String.format(Locale.ROOT, "Duplicated deferred itemProp '%s'.", deferredProperty.getName() ), |
| node |
| ) |
| ); |
| continue; |
| } |
| itemProps.add(deferredProperty); |
| } |
| |
| List<IRI> types; |
| if (itemType == null) { |
| types = Collections.emptyList(); |
| } else { |
| types = new ArrayList<>(); |
| boolean canConcatWithPrev = false; |
| for (String s : itemType.trim().split("\\s+")) { |
| try { |
| canConcatWithPrev = types.addAll(ItemScope.stringToSingletonIRI(s)); |
| } catch (RuntimeException e) { |
| if (canConcatWithPrev) { |
| int lastInd = types.size() - 1; |
| try { |
| List<IRI> secondTry = ItemScope.stringToSingletonIRI(types.get(lastInd).stringValue() + " " + s); |
| types.remove(lastInd); |
| canConcatWithPrev = types.addAll(secondTry); |
| } catch (RuntimeException e2) { |
| manageError(new MicrodataParserException(e.getMessage(), node)); |
| canConcatWithPrev = false; |
| } |
| } else { |
| manageError(new MicrodataParserException(e.getMessage(), node)); |
| } |
| } |
| } |
| } |
| |
| final ItemScope newItemScope = new ItemScope( |
| DomUtils.getXPathForNode(node), |
| itemProps.toArray(new ItemProp[itemProps.size()]), |
| id, |
| itemrefIDs, |
| types, |
| itemId |
| ); |
| itemScopes.put(node, newItemScope); |
| return newItemScope; |
| } |
| |
| private void manageError(MicrodataParserException mpe) throws MicrodataParserException { |
| switch (errorMode) { |
| case FULL_REPORT: |
| errors.add(mpe); |
| break; |
| case STOP_AT_FIRST_ERROR: |
| throw mpe; |
| default: |
| throw new IllegalStateException("Unsupported mode " + errorMode); |
| } |
| } |
| |
| } |