/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| |
| package org.apache.any23.extractor.html; |
| |
| import org.w3c.dom.Document; |
| import org.w3c.dom.NamedNodeMap; |
| import org.w3c.dom.Node; |
| import org.w3c.dom.NodeList; |
| import org.w3c.dom.traversal.DocumentTraversal; |
| import org.w3c.dom.traversal.NodeFilter; |
| import org.w3c.dom.traversal.NodeIterator; |
| |
| import javax.xml.transform.OutputKeys; |
| import javax.xml.transform.Result; |
| import javax.xml.transform.Transformer; |
| import javax.xml.transform.TransformerConfigurationException; |
| import javax.xml.transform.TransformerException; |
| import javax.xml.transform.TransformerFactory; |
| import javax.xml.transform.TransformerFactoryConfigurationError; |
| import javax.xml.transform.dom.DOMSource; |
| import javax.xml.transform.stream.StreamResult; |
| import javax.xml.xpath.XPath; |
| import javax.xml.xpath.XPathConstants; |
| import javax.xml.xpath.XPathExpressionException; |
| import javax.xml.xpath.XPathFactory; |
| |
| import java.io.ByteArrayInputStream; |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.StringWriter; |
| import java.io.UnsupportedEncodingException; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.Locale; |
| import java.util.regex.Pattern; |
| |
| /** |
| * This class provides utility methods for DOM manipulation. It is separated from {@link HTMLDocument} so that its |
| * methods can be run on single DOM nodes without having to wrap them into an HTMLDocument. |
| * <p> |
| * We use a mix of XPath and DOM manipulation. |
| * </p> |
| * This is likely to be a performance bottleneck but at least everything is localized here. |
| */ |
| public class DomUtils { |
| |
| private static final String[] EMPTY_STRING_ARRAY = new String[0]; |
| |
| private final static XPath xPathEngine = XPathFactory.newInstance().newXPath(); |
| |
| private DomUtils() { |
| } |
| |
| /** |
| * Given a node this method returns the index corresponding to such node within the list of the children of its |
| * parent node. |
| * |
| * @param n |
| * the node of which returning the index. |
| * |
| * @return a non negative number. |
| */ |
| public static int getIndexInParent(Node n) { |
| Node parent = n.getParentNode(); |
| if (parent == null) { |
| return 0; |
| } |
| NodeList nodes = parent.getChildNodes(); |
| int counter = -1; |
| for (int i = 0; i < nodes.getLength(); i++) { |
| Node current = nodes.item(i); |
| if (current.getNodeType() == n.getNodeType() && current.getNodeName().equals(n.getNodeName())) { |
| counter++; |
| } |
| if (current.equals(n)) { |
| return counter; |
| } |
| } |
| throw new IllegalStateException("Cannot find a child within its parent node list."); |
| } |
| |
| /** |
| * Does a reverse walking of the DOM tree to generate a unique XPath expression leading to this node. The XPath |
| * generated is the canonical one based on sibling index: /html[1]/body[1]/div[2]/span[3] etc.. |
| * |
| * @param node |
| * the input node. |
| * |
| * @return the XPath location of node as String. |
| */ |
| public static String getXPathForNode(Node node) { |
| final StringBuilder sb = new StringBuilder(); |
| Node parent = node; |
| while (parent != null && parent.getNodeType() != Node.DOCUMENT_NODE) { |
| sb.insert(0, "]"); |
| sb.insert(0, getIndexInParent(parent) + 1); |
| sb.insert(0, "["); |
| sb.insert(0, parent.getNodeName()); |
| sb.insert(0, "/"); |
| parent = parent.getParentNode(); |
| } |
| return sb.toString(); |
| } |
| |
| /** |
| * Returns a list of tag names representing the path from the document root to the given node <i>n</i>. |
| * |
| * @param n |
| * the node for which retrieve the path. |
| * |
| * @return a sequence of HTML tag names. |
| */ |
| public static String[] getXPathListForNode(Node n) { |
| if (n == null) { |
| return EMPTY_STRING_ARRAY; |
| } |
| List<String> ancestors = new ArrayList<String>(); |
| ancestors.add(String.format(Locale.ROOT, "%s[%s]", n.getNodeName(), getIndexInParent(n))); |
| Node parent = n.getParentNode(); |
| while (parent != null) { |
| ancestors.add(0, String.format(Locale.ROOT, "%s[%s]", parent.getNodeName(), getIndexInParent(parent))); |
| parent = parent.getParentNode(); |
| } |
| return ancestors.toArray(new String[ancestors.size()]); |
| } |
| |
| /** |
| * Returns the row/col location of the given node. |
| * |
| * @param n |
| * input node. |
| * |
| * @return an array of two elements of type |
| * <code>[<begin-row>, <begin-col>, <end-row> <end-col>]</code> or <code>null</code> |
| * if not possible to extract such data. |
| */ |
| public static int[] getNodeLocation(Node n) { |
| if (n == null) |
| throw new NullPointerException("node cannot be null."); |
| final TagSoupParser.ElementLocation elementLocation = (TagSoupParser.ElementLocation) n |
| .getUserData(TagSoupParser.ELEMENT_LOCATION); |
| if (elementLocation == null) |
| return null; |
| return new int[] { elementLocation.getBeginLineNumber(), elementLocation.getBeginColumnNumber(), |
| elementLocation.getEndLineNumber(), elementLocation.getEndColumnNumber() }; |
| } |
| |
| /** |
| * Checks whether a node is ancestor or same of another node. |
| * |
| * @param candidateAncestor |
| * the candidate ancestor node. |
| * @param candidateSibling |
| * the candidate sibling node. |
| * @param strict |
| * if <code>true</code> is not allowed that the ancestor and sibling can be the same node. |
| * |
| * @return <code>true</code> if <code>candidateSibling</code> is ancestor of <code>candidateSibling</code>, |
| * <code>false</code> otherwise. |
| */ |
| public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling, boolean strict) { |
| if (candidateAncestor == null) |
| throw new NullPointerException("candidate ancestor cannot be null null."); |
| if (candidateSibling == null) |
| throw new NullPointerException("candidate sibling cannot be null null."); |
| if (strict && candidateAncestor.equals(candidateSibling)) |
| return false; |
| Node parent = candidateSibling; |
| while (parent != null) { |
| if (parent.equals(candidateAncestor)) |
| return true; |
| parent = parent.getParentNode(); |
| } |
| return false; |
| } |
| |
| /** |
| * Checks whether a node is ancestor or same of another node. As |
| * {@link #isAncestorOf(org.w3c.dom.Node, org.w3c.dom.Node, boolean)} with <code>strict=false</code>. |
| * |
| * @param candidateAncestor |
| * the candidate ancestor node. |
| * @param candidateSibling |
| * the candidate sibling node. |
| * |
| * @return <code>true</code> if <code>candidateSibling</code> is ancestor of <code>candidateSibling</code>, |
| * <code>false</code> otherwise. |
| */ |
| public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling) { |
| return isAncestorOf(candidateAncestor, candidateSibling, false); |
| } |
| |
| /** |
| * Finds all nodes that have a declared class. Note that the className is transformed to lower case before being |
| * matched against the DOM. |
| * |
| * @param root |
| * the root node from which start searching. |
| * @param className |
| * the name of the filtered class. |
| * |
| * @return list of matching nodes or an empty list. |
| */ |
| public static List<Node> findAllByClassName(Node root, String className) { |
| return findAllBy(root, null, "class", className.toLowerCase(Locale.ROOT)); |
| } |
| |
| /** |
| * Finds all nodes that have a declared attribute. Note that the className is transformed to lower case before being |
| * matched against the DOM. |
| * |
| * @param root |
| * the root node from which start searching. |
| * @param attrName |
| * the name of the filtered attribue. |
| * |
| * @return list of matching nodes or an empty list. |
| */ |
| public static List<Node> findAllByAttributeName(Node root, String attrName) { |
| return findAllBy(root, null, attrName, null); |
| } |
| |
| public static List<Node> findAllByAttributeContains(Node node, String attrName, String attrContains) { |
| return findAllBy(node, null, attrName, attrContains); |
| } |
| |
| public static List<Node> findAllByTag(Node root, String tagName) { |
| return findAllBy(root, tagName, null, null); |
| } |
| |
| public static List<Node> findAllByTagAndClassName(Node root, final String tagName, final String className) { |
| return findAllBy(root, tagName, "class", className); |
| } |
| |
| /** |
| * Mimics the JS DOM API, or prototype's $() |
| * |
| * @param root |
| * the node to locate |
| * @param id |
| * the id of the node to locate |
| * |
| * @return the {@link org.w3c.dom.Node} if one exists |
| */ |
| public static Node findNodeById(Node root, String id) { |
| Node node; |
| try { |
| String xpath = "//*[@id='" + id + "']"; |
| node = (Node) xPathEngine.evaluate(xpath, root, XPathConstants.NODE); |
| } catch (XPathExpressionException ex) { |
| throw new RuntimeException("Should not happen", ex); |
| } |
| return node; |
| } |
| |
| /** |
| * Returns a NodeList composed of all the nodes that match an XPath expression, which must be valid. |
| * |
| * @param node |
| * the node object to locate |
| * @param xpath |
| * an xpath expression |
| * |
| * @return a list of {@link org.w3c.dom.Node}'s if they exists |
| */ |
| public static List<Node> findAll(Node node, String xpath) { |
| if (node == null) { |
| throw new NullPointerException("node cannot be null."); |
| } |
| try { |
| NodeList nodes = (NodeList) xPathEngine.evaluate(xpath, node, XPathConstants.NODESET); |
| List<Node> result = new ArrayList<Node>(nodes.getLength()); |
| for (int i = 0; i < nodes.getLength(); i++) { |
| result.add(nodes.item(i)); |
| } |
| return result; |
| } catch (XPathExpressionException ex) { |
| throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex); |
| } |
| } |
| |
| /** |
| * Gets the string value of an XPath expression. |
| * |
| * @param node |
| * the node object to locate |
| * @param xpath |
| * an xpath expression |
| * |
| * @return a string xpath value |
| */ |
| public static String find(Node node, String xpath) { |
| try { |
| String val = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING); |
| if (null == val) |
| return ""; |
| return val; |
| } catch (XPathExpressionException ex) { |
| throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex); |
| } |
| } |
| |
| /** |
| * Tells if an element has a class name <b>not checking the parents in the hierarchy</b> mimicking the <i>CSS</i> |
| * .foo match. |
| * |
| * @param node |
| * the node object to locate |
| * @param className |
| * the CSS class name |
| * |
| * @return true if the class name exists |
| */ |
| public static boolean hasClassName(Node node, String className) { |
| return hasAttribute(node, "class", className); |
| } |
| |
| /** |
| * Checks the presence of an attribute value in attributes that contain whitespace-separated lists of values. The |
| * semantic is the CSS classes' ones: "foo" matches "bar foo", "foo" but not "foob" |
| * |
| * @param node |
| * the node object to locate |
| * @param attributeName |
| * attribute value |
| * @param className |
| * the CSS class name |
| * |
| * @return true if the class has the attribute name |
| */ |
| public static boolean hasAttribute(Node node, String attributeName, String className) { |
| // regex love, maybe faster but less easy to understand |
| // Pattern pattern = Pattern.compile("(^|\\s+)"+className+"(\\s+|$)"); |
| String attr = readAttribute(node, attributeName); |
| for (String c : attr.split("\\s+")) |
| if (c.equalsIgnoreCase(className)) |
| return true; |
| return false; |
| } |
| |
| /** |
| * Checks the presence of an attribute in the given <code>node</code>. |
| * |
| * @param node |
| * the node container. |
| * @param attributeName |
| * the name of the attribute. |
| * |
| * @return true if the attribute is present |
| */ |
| public static boolean hasAttribute(Node node, String attributeName) { |
| return readAttribute(node, attributeName, null) != null; |
| } |
| |
| /** |
| * Verifies if the given target node is an element. |
| * |
| * @param target |
| * target node to check |
| * |
| * @return <code>true</code> if the element the node is an element, <code>false</code> otherwise. |
| */ |
| public static boolean isElementNode(Node target) { |
| return Node.ELEMENT_NODE == target.getNodeType(); |
| } |
| |
| /** |
| * Reads the value of the specified <code>attribute</code>, returning the <code>defaultValue</code> string if not |
| * present. |
| * |
| * @param node |
| * node to read the attribute. |
| * @param attribute |
| * attribute name. |
| * @param defaultValue |
| * the default value to return if attribute is not found. |
| * |
| * @return the attribute value or <code>defaultValue</code> if not found. |
| */ |
| public static String readAttribute(Node node, String attribute, String defaultValue) { |
| NamedNodeMap attributes = node.getAttributes(); |
| if (null == attributes) |
| return defaultValue; |
| Node attr = attributes.getNamedItem(attribute); |
| if (null == attr) |
| return defaultValue; |
| return attr.getNodeValue(); |
| } |
| |
| /** |
| * Reads the value of the first <i>attribute</i> which name matches with the specified <code>attributePrefix</code>. |
| * Returns the <code>defaultValue</code> if not found. |
| * |
| * @param node |
| * node to look for attributes. |
| * @param attributePrefix |
| * attribute prefix. |
| * @param defaultValue |
| * default returned value. |
| * |
| * @return the value found or default. |
| */ |
| public static String readAttributeWithPrefix(Node node, String attributePrefix, String defaultValue) { |
| final NamedNodeMap attributes = node.getAttributes(); |
| if (null == attributes) { |
| return defaultValue; |
| } |
| Node attribute; |
| for (int a = 0; a < attributes.getLength(); a++) { |
| attribute = attributes.item(a); |
| if (attribute.getNodeName().startsWith(attributePrefix)) { |
| return attribute.getNodeValue(); |
| } |
| } |
| return defaultValue; |
| } |
| |
| /** |
| * Reads the value of an <code>attribute</code>, returning the empty string if not present. |
| * |
| * @param node |
| * node to read the attribute. |
| * @param attribute |
| * attribute name. |
| * |
| * @return the attribute value or <code>""</code> if not found. |
| */ |
| public static String readAttribute(Node node, String attribute) { |
| return readAttribute(node, attribute, ""); |
| } |
| |
| /** |
| * Given a <i>DOM</i> {@link Node} produces the <i>XML</i> serialization omitting the <i>XML declaration</i>. |
| * |
| * @param node |
| * node to be serialized. |
| * @param indent |
| * if <code>true</code> the output is indented. |
| * |
| * @return the XML serialization. |
| * |
| * @throws TransformerException |
| * if an error occurs during the serializator initialization and activation. |
| * @throws java.io.IOException |
| * if there is an error locating the node |
| */ |
| public static String serializeToXML(Node node, boolean indent) throws TransformerException, IOException { |
| final DOMSource domSource = new DOMSource(node); |
| final Transformer transformer = TransformerFactory.newInstance().newTransformer(); |
| transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); |
| transformer.setOutputProperty(OutputKeys.METHOD, "xml"); |
| transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); |
| if (indent) { |
| transformer.setOutputProperty(OutputKeys.INDENT, "yes"); |
| transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4"); |
| } |
| final StringWriter sw = new StringWriter(); |
| final StreamResult sr = new StreamResult(sw); |
| transformer.transform(domSource, sr); |
| sw.close(); |
| return sw.toString(); |
| } |
| |
| /** |
| * High performance implementation of {@link #findAll(org.w3c.dom.Node, String)}. |
| * |
| * @param root |
| * root node to start search. |
| * @param tagName |
| * name of target tag. |
| * @param attrName |
| * name of attribute filter. |
| * @param attrContains |
| * expected content for attribute. |
| * |
| * @return a {@link java.util.List} of {@link org.w3c.dom.Node}'s |
| */ |
| private static List<Node> findAllBy(Node root, final String tagName, final String attrName, String attrContains) { |
| DocumentTraversal documentTraversal = (DocumentTraversal) root.getOwnerDocument(); |
| if (documentTraversal == null) { |
| documentTraversal = (DocumentTraversal) root; |
| } |
| |
| final Pattern attrContainsPattern; |
| if (attrContains != null && !attrContains.equals("*")) { |
| attrContainsPattern = Pattern.compile("(^|\\s)" + attrContains + "(\\s|$)", Pattern.CASE_INSENSITIVE); |
| } else { |
| attrContainsPattern = null; |
| } |
| |
| final List<Node> result = new ArrayList<Node>(); |
| NodeIterator nodeIterator = documentTraversal.createNodeIterator(root, NodeFilter.SHOW_ELEMENT, |
| new NodeFilter() { |
| @Override |
| public short acceptNode(Node node) { |
| if (node.getNodeType() == Node.ELEMENT_NODE) { |
| if (tagName != null && !tagName.equals("*") && !tagName.equals(node.getNodeName())) { |
| // tagName given but doesn't match. |
| return FILTER_ACCEPT; |
| } |
| |
| if (attrName != null) { |
| Node attrNameNode = node.getAttributes().getNamedItem(attrName); |
| if (attrNameNode == null) { |
| // attrName given but doesn't match |
| return FILTER_ACCEPT; |
| } |
| |
| if (attrContainsPattern != null |
| && !attrContainsPattern.matcher(attrNameNode.getNodeValue()).find()) { |
| // attrContains given but doesn't match |
| return FILTER_ACCEPT; |
| } |
| } |
| result.add(node); |
| } |
| return FILTER_ACCEPT; |
| } |
| }, false); |
| |
| // To populate result we only need to iterate... |
| while (nodeIterator.nextNode() != null) |
| ; |
| |
| // We have to explicitly declare we are done with this nodeIterator to free it's resources. |
| nodeIterator.detach(); |
| |
| return result; |
| } |
| |
| /** |
| * Given a {@link org.w3c.dom.Document} this method will return an input stream representing that document. |
| * |
| * @param doc |
| * the input {@link org.w3c.dom.Document} |
| * |
| * @return an {@link java.io.InputStream} |
| */ |
| public static InputStream documentToInputStream(Document doc) { |
| DOMSource source = new DOMSource(doc); |
| StringWriter xmlAsWriter = new StringWriter(); |
| StreamResult result = new StreamResult(xmlAsWriter); |
| try { |
| TransformerFactory.newInstance().newTransformer().transform(source, result); |
| } catch (TransformerConfigurationException e) { |
| throw new RuntimeException("Error within Document to InputStream transformation configuration!"); |
| } catch (TransformerException e) { |
| throw new RuntimeException("Error whilst transforming the Document to InputStream!"); |
| } catch (TransformerFactoryConfigurationError e) { |
| throw new RuntimeException("Error within Document to InputStream transformation configuration factory!"); |
| } |
| |
| InputStream is = null; |
| try { |
| is = new ByteArrayInputStream(xmlAsWriter.toString().getBytes("UTF-8")); |
| } catch (UnsupportedEncodingException e) { |
| throw new RuntimeException("Error obtaining data with \"UTF-8\" encoding!", e); |
| } |
| return is; |
| } |
| |
| /** |
| * Convert a w3c dom node to a InputStream |
| * |
| * @param node |
| * {@link org.w3c.dom.Node} to convert |
| * |
| * @return the converted {@link java.io.InputStream} |
| */ |
| public static InputStream nodeToInputStream(Node node) { |
| ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); |
| Result outputTarget = new StreamResult(outputStream); |
| Transformer t = null; |
| try { |
| t = TransformerFactory.newInstance().newTransformer(); |
| } catch (TransformerConfigurationException e) { |
| throw new RuntimeException("Serious configuration error.", e); |
| } catch (TransformerFactoryConfigurationError e) { |
| throw new RuntimeException("Serious configuration error.", e); |
| } |
| t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes"); |
| try { |
| t.transform(new DOMSource(node), outputTarget); |
| } catch (TransformerException e) { |
| throw new RuntimeException("Error whilst transforming the Node to InputStream!"); |
| } |
| return new ByteArrayInputStream(outputStream.toByteArray()); |
| } |
| |
| } |