core/src/main/java/org/apache/any23/extractor/html/DomUtils.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23.extractor.html;

 import org.w3c.dom.Document;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.w3c.dom.traversal.DocumentTraversal;
 import org.w3c.dom.traversal.NodeFilter;
 import org.w3c.dom.traversal.NodeIterator;

 import javax.xml.transform.OutputKeys;
 import javax.xml.transform.Result;
 import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerException;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.TransformerFactoryConfigurationError;
 import javax.xml.transform.dom.DOMSource;
 import javax.xml.transform.stream.StreamResult;
 import javax.xml.xpath.XPath;
 import javax.xml.xpath.XPathConstants;
 import javax.xml.xpath.XPathExpressionException;
 import javax.xml.xpath.XPathFactory;

 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.StringWriter;
 import java.io.UnsupportedEncodingException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
 import java.util.regex.Pattern;

 /**
  * This class provides utility methods for DOM manipulation. It is separated from {@link HTMLDocument} so that its
  * methods can be run on single DOM nodes without having to wrap them into an HTMLDocument.
  * <p>
  * We use a mix of XPath and DOM manipulation.
  * </p>
  * This is likely to be a performance bottleneck but at least everything is localized here.
  */
 public class DomUtils {

     private static final String[] EMPTY_STRING_ARRAY = new String[0];

     /**
      * XPath evaluator shared by all the XPath based lookups of this class.
      * <p>
      * NOTE(review): the JAXP documentation describes {@link XPath} objects as neither thread-safe nor reentrant;
      * confirm that callers never evaluate expressions through this class from several threads at once.
      * </p>
      */
     private static final XPath xPathEngine = XPathFactory.newInstance().newXPath();

     private DomUtils() {
         // Static utility class: no instances.
     }

     /**
      * Returns the zero based position of the given node among the children of its parent that have the same node
      * type and the same node name (i.e. the index a positional XPath step would use, minus one).
      *
      * @param n
      *            the node of which returning the index.
      *
      * @return a non negative number, <code>0</code> if the node has no parent.
      *
      * @throws IllegalStateException
      *             if the node is not listed among the children of its own parent.
      */
     public static int getIndexInParent(Node n) {
         final Node parent = n.getParentNode();
         if (parent == null) {
             return 0;
         }
         final NodeList siblings = parent.getChildNodes();
         final int length = siblings.getLength();
         int counter = -1;
         for (int i = 0; i < length; i++) {
             final Node current = siblings.item(i);
             if (current.getNodeType() == n.getNodeType() && current.getNodeName().equals(n.getNodeName())) {
                 counter++;
             }
             if (current.equals(n)) {
                 return counter;
             }
         }
         throw new IllegalStateException("Cannot find a child within its parent node list.");
     }

     /**
      * Does a reverse walking of the DOM tree to generate a unique XPath expression leading to this node. The XPath
      * generated is the canonical one based on (one based) sibling index: /html[1]/body[1]/div[2]/span[3] etc..
      * The document node itself contributes no step.
      *
      * @param node
      *            the input node.
      *
      * @return the XPath location of node as String, the empty string for <code>null</code> or a document node.
      */
     public static String getXPathForNode(Node node) {
         final StringBuilder sb = new StringBuilder();
         Node current = node;
         while (current != null && current.getNodeType() != Node.DOCUMENT_NODE) {
             // Steps are discovered leaf first, hence every step is prepended.
             sb.insert(0, "/" + current.getNodeName() + "[" + (getIndexInParent(current) + 1) + "]");
             current = current.getParentNode();
         }
         return sb.toString();
     }

     /**
      * Returns the list of steps leading from the topmost ancestor to the given node <i>n</i>. Every step has the
      * form <code>name[index]</code>, where the index is the zero based value returned by
      * {@link #getIndexInParent(org.w3c.dom.Node)}; unlike {@link #getXPathForNode(org.w3c.dom.Node)} the topmost
      * ancestor (usually the document node) is included.
      *
      * @param n
      *            the node for which retrieve the path.
      *
      * @return a sequence of steps, root first; an empty array if <code>n</code> is <code>null</code>.
      */
     public static String[] getXPathListForNode(Node n) {
         if (n == null) {
             return EMPTY_STRING_ARRAY;
         }
         final List<String> ancestors = new ArrayList<String>();
         for (Node current = n; current != null; current = current.getParentNode()) {
             ancestors.add(0,
                     String.format(Locale.ROOT, "%s[%s]", current.getNodeName(), getIndexInParent(current)));
         }
         return ancestors.toArray(new String[ancestors.size()]);
     }

     /**
      * Returns the row/col location of the given node, as recorded in the node user data under the
      * <code>TagSoupParser.ELEMENT_LOCATION</code> key.
      *
      * @param n
      *            input node.
      *
      * @return an array of four elements of type
      *         <code>[&lt;begin-row&gt;, &lt;begin-col&gt;, &lt;end-row&gt;, &lt;end-col&gt;]</code> or
      *         <code>null</code> if not possible to extract such data.
      *
      * @throws NullPointerException
      *             if <code>n</code> is <code>null</code>.
      */
     public static int[] getNodeLocation(Node n) {
         if (n == null) {
             throw new NullPointerException("node cannot be null.");
         }
         final TagSoupParser.ElementLocation elementLocation = (TagSoupParser.ElementLocation) n
                 .getUserData(TagSoupParser.ELEMENT_LOCATION);
         if (elementLocation == null) {
             return null;
         }
         return new int[] { elementLocation.getBeginLineNumber(), elementLocation.getBeginColumnNumber(),
                 elementLocation.getEndLineNumber(), elementLocation.getEndColumnNumber() };
     }

     /**
      * Checks whether a node is ancestor or same of another node.
      *
      * @param candidateAncestor
      *            the candidate ancestor node.
      * @param candidateSibling
      *            the node whose ancestor chain is inspected (despite the name, the candidate descendant).
      * @param strict
      *            if <code>true</code> the two nodes being the same node does not count as a match.
      *
      * @return <code>true</code> if <code>candidateAncestor</code> is an ancestor of <code>candidateSibling</code>
      *         (or the same node when <code>strict</code> is <code>false</code>), <code>false</code> otherwise.
      *
      * @throws NullPointerException
      *             if any of the two nodes is <code>null</code>.
      */
     public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling, boolean strict) {
         if (candidateAncestor == null) {
             throw new NullPointerException("candidate ancestor cannot be null.");
         }
         if (candidateSibling == null) {
             throw new NullPointerException("candidate sibling cannot be null.");
         }
         if (strict && candidateAncestor.equals(candidateSibling)) {
             return false;
         }
         for (Node current = candidateSibling; current != null; current = current.getParentNode()) {
             if (current.equals(candidateAncestor)) {
                 return true;
             }
         }
         return false;
     }

     /**
      * Checks whether a node is ancestor or same of another node. As
      * {@link #isAncestorOf(org.w3c.dom.Node, org.w3c.dom.Node, boolean)} with <code>strict=false</code>.
      *
      * @param candidateAncestor
      *            the candidate ancestor node.
      * @param candidateSibling
      *            the node whose ancestor chain is inspected.
      *
      * @return <code>true</code> if <code>candidateAncestor</code> is an ancestor of, or the same node as,
      *         <code>candidateSibling</code>, <code>false</code> otherwise.
      */
     public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling) {
         return isAncestorOf(candidateAncestor, candidateSibling, false);
     }

     /**
      * Finds all elements declaring the given class. The className is transformed to lower case before being
      * matched; the match itself is case insensitive and works on whole whitespace separated tokens.
      *
      * @param root
      *            the root node from which start searching.
      * @param className
      *            the name of the filtered class.
      *
      * @return list of matching nodes or an empty list.
      */
     public static List<Node> findAllByClassName(Node root, String className) {
         return findAllBy(root, null, "class", className.toLowerCase(Locale.ROOT));
     }

     /**
      * Finds all elements that declare the given attribute, whatever its value.
      *
      * @param root
      *            the root node from which start searching.
      * @param attrName
      *            the name of the filtered attribute.
      *
      * @return list of matching nodes or an empty list.
      */
     public static List<Node> findAllByAttributeName(Node root, String attrName) {
         return findAllBy(root, null, attrName, null);
     }

     /**
      * Finds all elements whose attribute <code>attrName</code> contains the given whitespace separated token.
      *
      * @param node
      *            the root node from which start searching.
      * @param attrName
      *            the name of the filtered attribute.
      * @param attrContains
      *            the token expected within the attribute value, <code>"*"</code> or <code>null</code> for any value.
      *
      * @return list of matching nodes or an empty list.
      */
     public static List<Node> findAllByAttributeContains(Node node, String attrName, String attrContains) {
         return findAllBy(node, null, attrName, attrContains);
     }

     /**
      * Finds all elements with the given tag name; the comparison with the node name is case sensitive.
      *
      * @param root
      *            the root node from which start searching.
      * @param tagName
      *            the name of the target tag, <code>"*"</code> or <code>null</code> for any tag.
      *
      * @return list of matching nodes or an empty list.
      */
     public static List<Node> findAllByTag(Node root, String tagName) {
         return findAllBy(root, tagName, null, null);
     }

     /**
      * Finds all elements with the given tag name that also declare the given class.
      *
      * @param root
      *            the root node from which start searching.
      * @param tagName
      *            the name of the target tag, <code>"*"</code> or <code>null</code> for any tag.
      * @param className
      *            the name of the filtered class.
      *
      * @return list of matching nodes or an empty list.
      */
     public static List<Node> findAllByTagAndClassName(Node root, final String tagName, final String className) {
         return findAllBy(root, tagName, "class", className);
     }

     /**
      * Mimics the JS DOM API, or prototype's $(). The lookup expression starts with <code>//</code>, hence the whole
      * document owning <code>root</code> is searched, not only the subtree below it.
      *
      * @param root
      *            the context node of the lookup.
      * @param id
      *            the id of the node to locate
      *
      * @return the {@link org.w3c.dom.Node} if one exists, <code>null</code> otherwise.
      */
     public static Node findNodeById(Node root, String id) {
         // The id is turned into a proper XPath literal so that quotes inside it cannot break the expression.
         final String xpath = "//*[@id=" + toXPathLiteral(String.valueOf(id)) + "]";
         try {
             return (Node) xPathEngine.evaluate(xpath, root, XPathConstants.NODE);
         } catch (XPathExpressionException ex) {
             throw new RuntimeException("Should not happen", ex);
         }
     }

     /**
      * Renders a string as an XPath 1.0 string literal. XPath 1.0 has no escape sequences, so a value containing both
      * kinds of quotes is expressed as a <code>concat()</code> of single quoted chunks and double quoted apostrophes.
      */
     private static String toXPathLiteral(String value) {
         if (value.indexOf('\'') < 0) {
             return "'" + value + "'";
         }
         if (value.indexOf('"') < 0) {
             return "\"" + value + "\"";
         }
         final StringBuilder literal = new StringBuilder("concat(");
         final String[] chunks = value.split("'", -1);
         for (int i = 0; i < chunks.length; i++) {
             if (i > 0) {
                 literal.append(",\"'\",");
             }
             literal.append('\'').append(chunks[i]).append('\'');
         }
         return literal.append(')').toString();
     }

     /**
      * Returns a list composed of all the nodes that match an XPath expression, which must be valid.
      *
      * @param node
      *            the context node of the evaluation.
      * @param xpath
      *            an xpath expression
      *
      * @return a list of {@link org.w3c.dom.Node}'s, empty if nothing matches.
      *
      * @throws NullPointerException
      *             if <code>node</code> is <code>null</code>.
      * @throws IllegalArgumentException
      *             if the expression cannot be evaluated.
      */
     public static List<Node> findAll(Node node, String xpath) {
         if (node == null) {
             throw new NullPointerException("node cannot be null.");
         }
         try {
             final NodeList nodes = (NodeList) xPathEngine.evaluate(xpath, node, XPathConstants.NODESET);
             final int length = nodes.getLength();
             final List<Node> result = new ArrayList<Node>(length);
             for (int i = 0; i < length; i++) {
                 result.add(nodes.item(i));
             }
             return result;
         } catch (XPathExpressionException ex) {
             throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
         }
     }

     /**
      * Gets the string value of an XPath expression.
      *
      * @param node
      *            the context node of the evaluation.
      * @param xpath
      *            an xpath expression
      *
      * @return the string value of the expression, never <code>null</code>.
      *
      * @throws IllegalArgumentException
      *             if the expression cannot be evaluated.
      */
     public static String find(Node node, String xpath) {
         try {
             final String val = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
             return val == null ? "" : val;
         } catch (XPathExpressionException ex) {
             throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
         }
     }

     /**
      * Tells if an element has a class name <b>not checking the parents in the hierarchy</b> mimicking the <i>CSS</i>
      * .foo match.
      *
      * @param node
      *            the node to inspect.
      * @param className
      *            the CSS class name
      *
      * @return true if the class name exists
      */
     public static boolean hasClassName(Node node, String className) {
         return hasAttribute(node, "class", className);
     }

     /**
      * Checks the presence of an attribute value in attributes that contain whitespace-separated lists of values. The
      * semantic is the CSS classes' ones: "foo" matches "bar foo", "foo" but not "foob". The comparison ignores case.
      *
      * @param node
      *            the node to inspect.
      * @param attributeName
      *            the name of the attribute holding the list of values.
      * @param className
      *            the value looked for within the list.
      *
      * @return true if the attribute lists the given value.
      */
     public static boolean hasAttribute(Node node, String attributeName, String className) {
         final String attr = readAttribute(node, attributeName);
         for (String token : attr.split("\\s+")) {
             if (token.equalsIgnoreCase(className)) {
                 return true;
             }
         }
         return false;
     }

     /**
      * Checks the presence of an attribute in the given <code>node</code>.
      *
      * @param node
      *            the node container.
      * @param attributeName
      *            the name of the attribute.
      *
      * @return true if the attribute is present
      */
     public static boolean hasAttribute(Node node, String attributeName) {
         return readAttribute(node, attributeName, null) != null;
     }

     /**
      * Verifies if the given target node is an element.
      *
      * @param target
      *            target node to check
      *
      * @return <code>true</code> if the node is an element, <code>false</code> otherwise.
      */
     public static boolean isElementNode(Node target) {
         return Node.ELEMENT_NODE == target.getNodeType();
     }

     /**
      * Reads the value of the specified <code>attribute</code>, returning the <code>defaultValue</code> string if not
      * present.
      *
      * @param node
      *            node to read the attribute.
      * @param attribute
      *            attribute name.
      * @param defaultValue
      *            the default value to return if attribute is not found.
      *
      * @return the attribute value or <code>defaultValue</code> if not found.
      */
     public static String readAttribute(Node node, String attribute, String defaultValue) {
         final NamedNodeMap attributes = node.getAttributes();
         if (attributes == null) {
             return defaultValue;
         }
         final Node attr = attributes.getNamedItem(attribute);
         return attr == null ? defaultValue : attr.getNodeValue();
     }

     /**
      * Reads the value of the first <i>attribute</i> which name matches with the specified <code>attributePrefix</code>.
      * Returns the <code>defaultValue</code> if not found.
      *
      * @param node
      *            node to look for attributes.
      * @param attributePrefix
      *            attribute prefix.
      * @param defaultValue
      *            default returned value.
      *
      * @return the value found or default.
      */
     public static String readAttributeWithPrefix(Node node, String attributePrefix, String defaultValue) {
         final NamedNodeMap attributes = node.getAttributes();
         if (attributes == null) {
             return defaultValue;
         }
         final int length = attributes.getLength();
         for (int a = 0; a < length; a++) {
             final Node attribute = attributes.item(a);
             if (attribute.getNodeName().startsWith(attributePrefix)) {
                 return attribute.getNodeValue();
             }
         }
         return defaultValue;
     }

     /**
      * Reads the value of an <code>attribute</code>, returning the empty string if not present.
      *
      * @param node
      *            node to read the attribute.
      * @param attribute
      *            attribute name.
      *
      * @return the attribute value or <code>""</code> if not found.
      */
     public static String readAttribute(Node node, String attribute) {
         return readAttribute(node, attribute, "");
     }

     /**
      * Given a <i>DOM</i> {@link Node} produces the <i>XML</i> serialization omitting the <i>XML declaration</i>.
      *
      * @param node
      *            node to be serialized.
      * @param indent
      *            if <code>true</code> the output is indented.
      *
      * @return the XML serialization.
      *
      * @throws TransformerException
      *             if an error occurs during the serializator initialization and activation.
      * @throws java.io.IOException
      *             declared for backward compatibility with existing callers.
      */
     public static String serializeToXML(Node node, boolean indent) throws TransformerException, IOException {
         final DOMSource domSource = new DOMSource(node);
         final Transformer transformer = TransformerFactory.newInstance().newTransformer();
         transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
         transformer.setOutputProperty(OutputKeys.METHOD, "xml");
         transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
         if (indent) {
             transformer.setOutputProperty(OutputKeys.INDENT, "yes");
             transformer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
         }
         final StringWriter sw = new StringWriter();
         transformer.transform(domSource, new StreamResult(sw));
         sw.close();
         return sw.toString();
     }

     /**
      * High performance implementation of {@link #findAll(org.w3c.dom.Node, String)}: walks the elements below (and
      * including) <code>root</code> in document order and keeps those satisfying every given criterion.
      *
      * @param root
      *            root node to start search.
      * @param tagName
      *            name of target tag (case sensitive), <code>"*"</code> or <code>null</code> for any tag.
      * @param attrName
      *            name of attribute filter, <code>null</code> for no attribute filtering.
      * @param attrContains
      *            whitespace separated token expected within the attribute value (matched literally, ignoring case),
      *            <code>"*"</code> or <code>null</code> for any value.
      *
      * @return a {@link java.util.List} of {@link org.w3c.dom.Node}'s
      */
     private static List<Node> findAllBy(Node root, final String tagName, final String attrName, String attrContains) {
         // A document node has no owner document: in that case the root itself provides the traversal.
         DocumentTraversal documentTraversal = (DocumentTraversal) root.getOwnerDocument();
         if (documentTraversal == null) {
             documentTraversal = (DocumentTraversal) root;
         }

         final Pattern attrContainsPattern;
         if (attrContains != null && !attrContains.equals("*")) {
             // The token is quoted: it is a plain value, regex metacharacters in it must not be interpreted.
             attrContainsPattern = Pattern.compile("(^|\\s)" + Pattern.quote(attrContains) + "(\\s|$)",
                     Pattern.CASE_INSENSITIVE);
         } else {
             attrContainsPattern = null;
         }

         final NodeFilter filter = new NodeFilter() {
             @Override
             public short acceptNode(Node node) {
                 if (tagName != null && !tagName.equals("*") && !tagName.equals(node.getNodeName())) {
                     // tagName given but doesn't match.
                     return FILTER_SKIP;
                 }
                 if (attrName != null) {
                     final Node attrNameNode = node.getAttributes().getNamedItem(attrName);
                     if (attrNameNode == null) {
                         // attrName given but doesn't match
                         return FILTER_SKIP;
                     }
                     if (attrContainsPattern != null
                             && !attrContainsPattern.matcher(attrNameNode.getNodeValue()).find()) {
                         // attrContains given but doesn't match
                         return FILTER_SKIP;
                     }
                 }
                 return FILTER_ACCEPT;
             }
         };

         final List<Node> result = new ArrayList<Node>();
         final NodeIterator nodeIterator = documentTraversal.createNodeIterator(root, NodeFilter.SHOW_ELEMENT, filter,
                 false);
         try {
             for (Node match = nodeIterator.nextNode(); match != null; match = nodeIterator.nextNode()) {
                 result.add(match);
             }
         } finally {
             // We have to explicitly declare we are done with this nodeIterator to free its resources.
             nodeIterator.detach();
         }
         return result;
     }

     /**
      * Given a {@link org.w3c.dom.Document} this method will return an input stream representing that document,
      * encoded as UTF-8.
      *
      * @param doc
      *            the input {@link org.w3c.dom.Document}
      *
      * @return an {@link java.io.InputStream}
      *
      * @throws RuntimeException
      *             if the document cannot be serialized; the original failure is attached as cause.
      */
     public static InputStream documentToInputStream(Document doc) {
         final DOMSource source = new DOMSource(doc);
         final StringWriter xmlAsWriter = new StringWriter();
         final StreamResult result = new StreamResult(xmlAsWriter);
         try {
             TransformerFactory.newInstance().newTransformer().transform(source, result);
         } catch (TransformerConfigurationException e) {
             throw new RuntimeException("Error within Document to InputStream transformation configuration!", e);
         } catch (TransformerException e) {
             throw new RuntimeException("Error whilst transforming the Document to InputStream!", e);
         } catch (TransformerFactoryConfigurationError e) {
             throw new RuntimeException("Error within Document to InputStream transformation configuration factory!",
                     e);
         }

         try {
             return new ByteArrayInputStream(xmlAsWriter.toString().getBytes("UTF-8"));
         } catch (UnsupportedEncodingException e) {
             throw new RuntimeException("Error obtaining data with \"UTF-8\" encoding!", e);
         }
     }

     /**
      * Convert a w3c dom node to a InputStream holding its serialization without XML declaration.
      *
      * @param node
      *            {@link org.w3c.dom.Node} to convert
      *
      * @return the converted {@link java.io.InputStream}
      *
      * @throws RuntimeException
      *             if the node cannot be serialized; the original failure is attached as cause.
      */
     public static InputStream nodeToInputStream(Node node) {
         final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
         final Result outputTarget = new StreamResult(outputStream);
         final Transformer t;
         try {
             t = TransformerFactory.newInstance().newTransformer();
         } catch (TransformerConfigurationException e) {
             throw new RuntimeException("Serious configuration error.", e);
         } catch (TransformerFactoryConfigurationError e) {
             throw new RuntimeException("Serious configuration error.", e);
         }
         t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
         try {
             t.transform(new DOMSource(node), outputTarget);
         } catch (TransformerException e) {
             throw new RuntimeException("Error whilst transforming the Node to InputStream!", e);
         }
         return new ByteArrayInputStream(outputStream.toByteArray());
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.any23.extractor.html;

	import org.w3c.dom.Document;
	import org.w3c.dom.NamedNodeMap;
	import org.w3c.dom.Node;
	import org.w3c.dom.NodeList;
	import org.w3c.dom.traversal.DocumentTraversal;
	import org.w3c.dom.traversal.NodeFilter;
	import org.w3c.dom.traversal.NodeIterator;

	import javax.xml.transform.OutputKeys;
	import javax.xml.transform.Result;
	import javax.xml.transform.Transformer;
	import javax.xml.transform.TransformerConfigurationException;
	import javax.xml.transform.TransformerException;
	import javax.xml.transform.TransformerFactory;
	import javax.xml.transform.TransformerFactoryConfigurationError;
	import javax.xml.transform.dom.DOMSource;
	import javax.xml.transform.stream.StreamResult;
	import javax.xml.xpath.XPath;
	import javax.xml.xpath.XPathConstants;
	import javax.xml.xpath.XPathExpressionException;
	import javax.xml.xpath.XPathFactory;

	import java.io.ByteArrayInputStream;
	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.StringWriter;
	import java.io.UnsupportedEncodingException;
	import java.util.ArrayList;
	import java.util.List;
	import java.util.Locale;
	import java.util.regex.Pattern;

	/**
	* This class provides utility methods for DOM manipulation. It is separated from {@link HTMLDocument} so that its
	* methods can be run on single DOM nodes without having to wrap them into an HTMLDocument.
	* <p>
	* We use a mix of XPath and DOM manipulation.
	* </p>
	* This is likely to be a performance bottleneck but at least everything is localized here.
	*/
	public class DomUtils {

	// Shared empty result returned by getXPathListForNode for a null node.
	private static final String[] EMPTY_STRING_ARRAY = new String[0];

	// Single XPath evaluator reused by every XPath based lookup of this class.
	// NOTE(review): JAXP documents XPath objects as neither thread-safe nor reentrant; confirm callers
	// never evaluate expressions through this class concurrently.
	private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();

	// Static utility class: the private constructor prevents instantiation.
	private DomUtils() {
	}

	/**
	 * Computes the zero based position of a node among those children of its parent that have the same node type
	 * and the same node name as the node itself.
	 *
	 * @param n
	 *            the node whose position is requested.
	 *
	 * @return a non negative number, <code>0</code> when the node has no parent.
	 */
	public static int getIndexInParent(Node n) {
		final Node container = n.getParentNode();
		if (container == null) {
			return 0;
		}
		int position = -1;
		// Walk the children in document order, counting the look-alikes met up to (and including) n.
		for (Node child = container.getFirstChild(); child != null; child = child.getNextSibling()) {
			final boolean sameKind = child.getNodeType() == n.getNodeType()
					&& child.getNodeName().equals(n.getNodeName());
			if (sameKind) {
				position++;
			}
			if (child.equals(n)) {
				return position;
			}
		}
		throw new IllegalStateException("Cannot find a child within its parent node list.");
	}

	/**
	 * Builds the canonical, sibling index based XPath of a node by climbing from the node up to (but excluding) the
	 * document node, e.g. /html[1]/body[1]/div[2]/span[3]. Indexes are one based.
	 *
	 * @param node
	 *            the input node.
	 *
	 * @return the XPath location of node as String.
	 */
	public static String getXPathForNode(Node node) {
		// Steps are collected leaf first and emitted in reverse order afterwards.
		final List<String> steps = new ArrayList<String>();
		for (Node current = node; current != null
				&& current.getNodeType() != Node.DOCUMENT_NODE; current = current.getParentNode()) {
			final int oneBasedIndex = getIndexInParent(current) + 1;
			steps.add("/" + current.getNodeName() + "[" + oneBasedIndex + "]");
		}
		final StringBuilder xpath = new StringBuilder();
		for (int i = steps.size() - 1; i >= 0; i--) {
			xpath.append(steps.get(i));
		}
		return xpath.toString();
	}

	/**
	 * Lists the steps leading from the topmost ancestor down to the node <i>n</i>. Each step is rendered as
	 * <code>name[index]</code>, the index being the value returned by {@link #getIndexInParent(org.w3c.dom.Node)}.
	 *
	 * @param n
	 *            the node for which retrieve the path.
	 *
	 * @return the sequence of steps, topmost ancestor first; an empty array for a <code>null</code> node.
	 */
	public static String[] getXPathListForNode(Node n) {
		if (n == null) {
			return EMPTY_STRING_ARRAY;
		}
		final List<String> steps = new ArrayList<String>();
		// Climb from the node to the top, prepending so that the topmost ancestor ends up first.
		for (Node current = n; current != null; current = current.getParentNode()) {
			final String step = String.format(Locale.ROOT, "%s[%s]", current.getNodeName(),
					getIndexInParent(current));
			steps.add(0, step);
		}
		return steps.toArray(new String[steps.size()]);
	}

	/**
	 * Returns the row/col location stored in the user data of the given node under the
	 * <code>TagSoupParser.ELEMENT_LOCATION</code> key.
	 *
	 * @param n
	 *            input node.
	 *
	 * @return an array of four elements of type
	 *         <code>[&lt;begin-row&gt;, &lt;begin-col&gt;, &lt;end-row&gt;, &lt;end-col&gt;]</code> or
	 *         <code>null</code> if not possible to extract such data.
	 */
	public static int[] getNodeLocation(Node n) {
		if (n == null) {
			throw new NullPointerException("node cannot be null.");
		}
		final Object userData = n.getUserData(TagSoupParser.ELEMENT_LOCATION);
		final TagSoupParser.ElementLocation location = (TagSoupParser.ElementLocation) userData;
		if (location == null) {
			return null;
		}
		final int[] coordinates = new int[4];
		coordinates[0] = location.getBeginLineNumber();
		coordinates[1] = location.getBeginColumnNumber();
		coordinates[2] = location.getEndLineNumber();
		coordinates[3] = location.getEndColumnNumber();
		return coordinates;
	}

	/**
	 * Checks whether a node is ancestor or same of another node.
	 *
	 * @param candidateAncestor
	 *            the candidate ancestor node.
	 * @param candidateSibling
	 *            the node whose ancestor chain is inspected (despite the name, the candidate descendant).
	 * @param strict
	 *            if <code>true</code> the two nodes being the same node does not count as a match.
	 *
	 * @return <code>true</code> if <code>candidateAncestor</code> is an ancestor of <code>candidateSibling</code>
	 *         (or the same node when <code>strict</code> is <code>false</code>), <code>false</code> otherwise.
	 *
	 * @throws NullPointerException
	 *             if any of the two nodes is <code>null</code>.
	 */
	public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling, boolean strict) {
		if (candidateAncestor == null) {
			throw new NullPointerException("candidate ancestor cannot be null.");
		}
		if (candidateSibling == null) {
			throw new NullPointerException("candidate sibling cannot be null.");
		}
		if (strict && candidateAncestor.equals(candidateSibling)) {
			return false;
		}
		// Climb from the candidate descendant (itself included) up to the top of the tree.
		for (Node current = candidateSibling; current != null; current = current.getParentNode()) {
			if (current.equals(candidateAncestor)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Non strict variant of {@link #isAncestorOf(org.w3c.dom.Node, org.w3c.dom.Node, boolean)}: a node is accepted
	 * as ancestor of itself.
	 *
	 * @param candidateAncestor
	 *            the candidate ancestor node.
	 * @param candidateSibling
	 *            the node whose ancestor chain is inspected.
	 *
	 * @return <code>true</code> if <code>candidateAncestor</code> is an ancestor of, or the same node as,
	 *         <code>candidateSibling</code>, <code>false</code> otherwise.
	 */
	public static boolean isAncestorOf(Node candidateAncestor, Node candidateSibling) {
		final boolean strict = false;
		return isAncestorOf(candidateAncestor, candidateSibling, strict);
	}

	/**
	 * Collects the nodes declaring the given class. The className is converted to lower case (root locale) before
	 * the search is delegated to <code>findAllBy</code> on the <code>class</code> attribute.
	 *
	 * @param root
	 *            the root node from which start searching.
	 * @param className
	 *            the name of the filtered class.
	 *
	 * @return list of matching nodes or an empty list.
	 */
	public static List<Node> findAllByClassName(Node root, String className) {
		final String normalizedClassName = className.toLowerCase(Locale.ROOT);
		return findAllBy(root, null, "class", normalizedClassName);
	}

	/**
	 * Collects the nodes that declare the given attribute, without any constraint on the attribute value.
	 *
	 * @param root
	 *            the root node from which start searching.
	 * @param attrName
	 *            the name of the filtered attribute.
	 *
	 * @return list of matching nodes or an empty list.
	 */
	public static List<Node> findAllByAttributeName(Node root, String attrName) {
		final List<Node> matches = findAllBy(root, null, attrName, null);
		return matches;
	}

	/**
	 * Collects the nodes selected by <code>findAllBy</code> for the given attribute name and expected attribute
	 * content, with no constraint on the tag name.
	 *
	 * @param node
	 *            the root node from which start searching.
	 * @param attrName
	 *            the name of the filtered attribute.
	 * @param attrContains
	 *            the content expected within the attribute value.
	 *
	 * @return list of matching nodes or an empty list.
	 */
	public static List<Node> findAllByAttributeContains(Node node, String attrName, String attrContains) {
		final List<Node> matches = findAllBy(node, null, attrName, attrContains);
		return matches;
	}

	/**
	 * Collects every node with the given tag name, without any attribute constraint.
	 *
	 * @param root
	 *            node at which the search starts.
	 * @param tagName
	 *            tag name to look for.
	 *
	 * @return the matching nodes, possibly an empty list.
	 */
	public static List<Node> findAllByTag(Node root, String tagName) {
	    final String noAttribute = null;
	    final String noContent = null;
	    return findAllBy(root, tagName, noAttribute, noContent);
	}

	/**
	 * Collects every node that has the given tag name and whose <code>class</code> attribute satisfies the
	 * <code>className</code> filter. Unlike {@link #findAllByClassName(org.w3c.dom.Node, String)} the class name is
	 * passed on unchanged.
	 *
	 * @param root
	 *            node at which the search starts.
	 * @param tagName
	 *            tag name to look for.
	 * @param className
	 *            class name expected in the <code>class</code> attribute.
	 *
	 * @return the matching nodes, possibly an empty list.
	 */
	public static List<Node> findAllByTagAndClassName(Node root, final String tagName, final String className) {
	    final String classAttribute = "class";
	    return findAllBy(root, tagName, classAttribute, className);
	}

	/**
	 * Looks up an element by its <code>id</code> attribute, in the spirit of the JS DOM API or prototype's $().
	 * <p>
	 * The lookup is done with the XPath expression <code>//*[@id=...]</code> evaluated with <code>root</code> as
	 * context node. The id is embedded as a properly quoted XPath string literal, so ids containing <code>'</code>
	 * and/or <code>"</code> are searched for literally instead of breaking (or altering) the expression.
	 *
	 * @param root
	 *            the context node handed to the XPath engine.
	 * @param id
	 *            the id of the node to locate; a <code>null</code> id is searched for as the text
	 *            <code>"null"</code>, as before.
	 *
	 * @return the {@link org.w3c.dom.Node} returned by the XPath engine for this id, if one exists
	 */
	public static Node findNodeById(Node root, String id) {
	    // String.valueOf keeps the historical result of concatenating a null id into the expression.
	    final String idText = String.valueOf(id);
	    final String idLiteral;
	    if (idText.indexOf('\'') < 0) {
	        idLiteral = "'" + idText + "'";
	    } else if (idText.indexOf('"') < 0) {
	        idLiteral = "\"" + idText + "\"";
	    } else {
	        // XPath 1.0 literals have no escape sequence: when both quote kinds occur, glue the
	        // single-quoted pieces together with a double-quoted apostrophe via concat().
	        idLiteral = "concat('" + idText.replace("'", "',\"'\",'") + "')";
	    }
	    final String xpath = "//*[@id=" + idLiteral + "]";
	    try {
	        return (Node) xPathEngine.evaluate(xpath, root, XPathConstants.NODE);
	    } catch (XPathExpressionException ex) {
	        throw new RuntimeException("Should not happen", ex);
	    }
	}

	/**
	 * Evaluates an XPath expression as a node-set against <code>node</code> and copies the outcome into a list.
	 *
	 * @param node
	 *            the context node for the evaluation; must not be <code>null</code>.
	 * @param xpath
	 *            the XPath expression to evaluate.
	 *
	 * @return a list with the {@link org.w3c.dom.Node}s selected by the expression, in the order reported by the
	 *         XPath engine.
	 *
	 * @throws NullPointerException
	 *             if <code>node</code> is <code>null</code>.
	 * @throws IllegalArgumentException
	 *             if the XPath engine rejects the expression.
	 */
	public static List<Node> findAll(Node node, String xpath) {
	    if (null == node) {
	        throw new NullPointerException("node cannot be null.");
	    }
	    final NodeList matches;
	    try {
	        matches = (NodeList) xPathEngine.evaluate(xpath, node, XPathConstants.NODESET);
	    } catch (XPathExpressionException ex) {
	        throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
	    }
	    final int count = matches.getLength();
	    final List<Node> collected = new ArrayList<Node>(count);
	    int index = 0;
	    while (index < count) {
	        collected.add(matches.item(index));
	        index++;
	    }
	    return collected;
	}

	/**
	 * Evaluates an XPath expression as a string against <code>node</code>.
	 *
	 * @param node
	 *            the context node for the evaluation.
	 * @param xpath
	 *            the XPath expression to evaluate.
	 *
	 * @return the string value of the expression, or <code>""</code> if the engine yields <code>null</code>.
	 *
	 * @throws IllegalArgumentException
	 *             if the XPath engine rejects the expression.
	 */
	public static String find(Node node, String xpath) {
	    final Object evaluated;
	    try {
	        evaluated = xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
	    } catch (XPathExpressionException ex) {
	        throw new IllegalArgumentException("Illegal XPath expression: " + xpath, ex);
	    }
	    final String text = (String) evaluated;
	    return text == null ? "" : text;
	}

	/**
	 * Tells whether the node itself (<b>its parents are not inspected</b>) carries the given class name in its
	 * <code>class</code> attribute, mimicking the <i>CSS</i> <code>.foo</code> match.
	 *
	 * @param node
	 *            the node whose <code>class</code> attribute is inspected.
	 * @param className
	 *            the CSS class name to look for.
	 *
	 * @return <code>true</code> if the class name is listed in the node's <code>class</code> attribute.
	 */
	public static boolean hasClassName(Node node, String className) {
	    final String classAttribute = "class";
	    return hasAttribute(node, classAttribute, className);
	}

	/**
	 * Checks whether an attribute holding a whitespace-separated list of values contains the given value. The
	 * semantics are those of CSS classes: "foo" matches "bar foo" and "foo" but not "foob". Values are compared
	 * ignoring case; a missing attribute is read as the empty string.
	 *
	 * @param node
	 *            the node whose attribute is inspected.
	 * @param attributeName
	 *            name of the attribute holding the whitespace-separated list.
	 * @param className
	 *            the value (e.g. a CSS class name) to look for in that list.
	 *
	 * @return <code>true</code> if one of the listed values equals <code>className</code>, ignoring case.
	 */
	public static boolean hasAttribute(Node node, String attributeName, String className) {
	    final String[] tokens = readAttribute(node, attributeName).split("\\s+");
	    for (int i = 0; i < tokens.length; i++) {
	        if (tokens[i].equalsIgnoreCase(className)) {
	            return true;
	        }
	    }
	    return false;
	}

	/**
	 * Tells whether the given <code>node</code> declares an attribute with the given name.
	 *
	 * @param node
	 *            the node whose attributes are inspected.
	 * @param attributeName
	 *            the name of the attribute to look for.
	 *
	 * @return <code>true</code> if reading the attribute with a <code>null</code> default yields a non-null value.
	 */
	public static boolean hasAttribute(Node node, String attributeName) {
	    final String value = readAttribute(node, attributeName, null);
	    return value != null;
	}

	/**
	 * Tells whether the given node is an element, i.e. whether its node type is {@link Node#ELEMENT_NODE}.
	 *
	 * @param target
	 *            node to check; must not be <code>null</code>.
	 *
	 * @return <code>true</code> if the node is an element, <code>false</code> otherwise.
	 */
	public static boolean isElementNode(Node target) {
	    final short nodeType = target.getNodeType();
	    return nodeType == Node.ELEMENT_NODE;
	}

	/**
	 * Returns the value of the named attribute of <code>node</code>, falling back to <code>defaultValue</code> when
	 * the node exposes no attribute map or the attribute is not declared.
	 *
	 * @param node
	 *            node to read the attribute from; must not be <code>null</code>.
	 * @param attribute
	 *            attribute name.
	 * @param defaultValue
	 *            value returned when the attribute cannot be found; may be <code>null</code>.
	 *
	 * @return the attribute value or <code>defaultValue</code> if not found.
	 */
	public static String readAttribute(Node node, String attribute, String defaultValue) {
	    final NamedNodeMap attributeMap = node.getAttributes();
	    final Node match = (attributeMap == null) ? null : attributeMap.getNamedItem(attribute);
	    return (match == null) ? defaultValue : match.getNodeValue();
	}

	/**
	 * Returns the value of the first attribute of <code>node</code> (in attribute-map order) whose name starts with
	 * <code>attributePrefix</code>, or <code>defaultValue</code> when the node exposes no attribute map or no
	 * attribute name has that prefix.
	 *
	 * @param node
	 *            node whose attributes are scanned; must not be <code>null</code>.
	 * @param attributePrefix
	 *            prefix the attribute name has to start with.
	 * @param defaultValue
	 *            value returned when nothing matches.
	 *
	 * @return the value found or the default.
	 */
	public static String readAttributeWithPrefix(Node node, String attributePrefix, String defaultValue) {
	    final NamedNodeMap attributeMap = node.getAttributes();
	    if (attributeMap != null) {
	        int index = 0;
	        while (index < attributeMap.getLength()) {
	            final Node candidate = attributeMap.item(index);
	            if (candidate.getNodeName().startsWith(attributePrefix)) {
	                return candidate.getNodeValue();
	            }
	            index++;
	        }
	    }
	    return defaultValue;
	}

	/**
	 * Returns the value of the named attribute of <code>node</code>, using the empty string as the fallback of
	 * {@link #readAttribute(org.w3c.dom.Node, String, String)}.
	 *
	 * @param node
	 *            node to read the attribute from.
	 * @param attribute
	 *            attribute name.
	 *
	 * @return the attribute value or <code>""</code> if not found.
	 */
	public static String readAttribute(Node node, String attribute) {
	    final String missingValue = "";
	    return readAttribute(node, attribute, missingValue);
	}

	/**
	 * Serializes a <i>DOM</i> {@link Node} to an <i>XML</i> string without the <i>XML declaration</i>. The transformer
	 * is configured for the <code>xml</code> output method and <code>UTF-8</code> encoding.
	 *
	 * @param node
	 *            node to be serialized.
	 * @param indent
	 *            if <code>true</code> indentation is requested, with an indent amount of 4 passed through the
	 *            <code>{http://xml.apache.org/xslt}indent-amount</code> output property.
	 *
	 * @return the XML serialization.
	 *
	 * @throws TransformerException
	 *             if the transformer cannot be created or the transformation fails.
	 * @throws java.io.IOException
	 *             declared because the in-memory writer is closed before its content is returned.
	 */
	public static String serializeToXML(Node node, boolean indent) throws TransformerException, IOException {
	    final Transformer serializer = TransformerFactory.newInstance().newTransformer();
	    serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
	    serializer.setOutputProperty(OutputKeys.METHOD, "xml");
	    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
	    if (indent) {
	        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
	        serializer.setOutputProperty("{http://xml.apache.org/xslt}indent-amount", "4");
	    }

	    final StringWriter buffer = new StringWriter();
	    serializer.transform(new DOMSource(node), new StreamResult(buffer));
	    buffer.close();
	    return buffer.toString();
	}

	/**
	* High performance implementation of {@link #findAll(org.w3c.dom.Node, String)}.
	*
	* @param root
	* root node to start search.
	* @param tagName
	* name of target tag.
	* @param attrName
	* name of attribute filter.
	* @param attrContains
	* expected content for attribute.
	*
	* @return a {@link java.util.List} of {@link org.w3c.dom.Node}'s
	*/
	private static List<Node> findAllBy(Node root, final String tagName, final String attrName, String attrContains) {
	DocumentTraversal documentTraversal = (DocumentTraversal) root.getOwnerDocument();
	if (documentTraversal == null) {
	documentTraversal = (DocumentTraversal) root;
	}

	final Pattern attrContainsPattern;
	if (attrContains != null && !attrContains.equals("*")) {
	attrContainsPattern = Pattern.compile("(^\|\\s)" + attrContains + "(\\s\|$)", Pattern.CASE_INSENSITIVE);
	} else {
	attrContainsPattern = null;
	}

	final List<Node> result = new ArrayList<Node>();
	NodeIterator nodeIterator = documentTraversal.createNodeIterator(root, NodeFilter.SHOW_ELEMENT,
	new NodeFilter() {
	@Override
	public short acceptNode(Node node) {
	if (node.getNodeType() == Node.ELEMENT_NODE) {
	if (tagName != null && !tagName.equals("*") && !tagName.equals(node.getNodeName())) {
	// tagName given but doesn't match.
	return FILTER_ACCEPT;
	}

	if (attrName != null) {
	Node attrNameNode = node.getAttributes().getNamedItem(attrName);
	if (attrNameNode == null) {
	// attrName given but doesn't match
	return FILTER_ACCEPT;
	}

	if (attrContainsPattern != null
	&& !attrContainsPattern.matcher(attrNameNode.getNodeValue()).find()) {
	// attrContains given but doesn't match
	return FILTER_ACCEPT;
	}
	}
	result.add(node);
	}
	return FILTER_ACCEPT;
	}
	}, false);

	// To populate result we only need to iterate...
	while (nodeIterator.nextNode() != null)
	;

	// We have to explicitly declare we are done with this nodeIterator to free it's resources.
	nodeIterator.detach();

	return result;
	}

	/**
	 * Serializes a {@link org.w3c.dom.Document} with a default (identity) transformer and exposes the result as an
	 * in-memory stream of its <code>UTF-8</code> bytes.
	 *
	 * @param doc
	 *            the input {@link org.w3c.dom.Document}
	 *
	 * @return an {@link java.io.InputStream} over the serialized document.
	 *
	 * @throws RuntimeException
	 *             if the transformer cannot be created or the transformation fails; the underlying problem is kept
	 *             as the exception's cause.
	 */
	public static InputStream documentToInputStream(Document doc) {
	    final StringWriter xmlAsWriter = new StringWriter();
	    try {
	        TransformerFactory.newInstance().newTransformer().transform(new DOMSource(doc),
	                new StreamResult(xmlAsWriter));
	    } catch (TransformerConfigurationException e) {
	        // Keep the cause: without it the actual configuration problem is lost.
	        throw new RuntimeException("Error within Document to InputStream transformation configuration!", e);
	    } catch (TransformerException e) {
	        throw new RuntimeException("Error whilst transforming the Document to InputStream!", e);
	    } catch (TransformerFactoryConfigurationError e) {
	        throw new RuntimeException("Error within Document to InputStream transformation configuration factory!",
	                e);
	    }

	    try {
	        return new ByteArrayInputStream(xmlAsWriter.toString().getBytes("UTF-8"));
	    } catch (UnsupportedEncodingException e) {
	        throw new RuntimeException("Error obtaining data with \"UTF-8\" encoding!", e);
	    }
	}

	/**
	 * Serializes a w3c DOM node with a default (identity) transformer, omitting the XML declaration, and exposes the
	 * produced bytes as an in-memory stream.
	 *
	 * @param node
	 *            {@link org.w3c.dom.Node} to convert
	 *
	 * @return the converted {@link java.io.InputStream}
	 *
	 * @throws RuntimeException
	 *             if the transformer cannot be created or the transformation fails; the underlying problem is kept
	 *             as the exception's cause.
	 */
	public static InputStream nodeToInputStream(Node node) {
	    final Transformer transformer;
	    try {
	        transformer = TransformerFactory.newInstance().newTransformer();
	    } catch (TransformerConfigurationException e) {
	        throw new RuntimeException("Serious configuration error.", e);
	    } catch (TransformerFactoryConfigurationError e) {
	        throw new RuntimeException("Serious configuration error.", e);
	    }
	    transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");

	    final ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
	    final Result outputTarget = new StreamResult(outputStream);
	    try {
	        transformer.transform(new DOMSource(node), outputTarget);
	    } catch (TransformerException e) {
	        // Keep the cause: without it the reason of the failed serialization is lost.
	        throw new RuntimeException("Error whilst transforming the Node to InputStream!", e);
	    }
	    return new ByteArrayInputStream(outputStream.toByteArray());
	}

	}