core/src/main/java/org/apache/any23/extractor/html/HTMLDocument.java - any23 - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *  http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.any23.extractor.html;

 import org.apache.any23.extractor.ExtractionException;
 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
 import org.apache.any23.rdf.RDFUtils;
 import org.eclipse.rdf4j.model.IRI;
 import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Document;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.w3c.dom.Text;

 import javax.xml.xpath.XPath;
 import javax.xml.xpath.XPathConstants;
 import javax.xml.xpath.XPathExpressionException;
 import javax.xml.xpath.XPathFactory;
 import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.List;

 /**
  * A wrapper around the DOM representation of an HTML document. Provides convenience access to various parts of the
  * document.
  *
  * @author Gabriele Renzi
  * @author Michele Mostarda
  */
 public class HTMLDocument {

     // XPath evaluator shared by every instance of this class.
     // NOTE(review): javax.xml.xpath.XPath is documented as neither thread-safe nor reentrant; confirm that
     // HTMLDocument instances are never used from several threads at once.
     private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
     // Class logger.
     private final static Logger log = LoggerFactory.getLogger(HTMLDocument.class);

     // Root node wrapped by this instance; assigned by the constructor.
     private Node document;
     // Base IRI of the wrapped document; stays null until getBaseIRI() has computed it.
     private java.net.URI baseIRI;

     // Value factory wrapper that resolveIRI(String) delegates to.
     private final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
             SimpleValueFactory.getInstance());

     /**
      * Reads a text field from the given node.
      * <p>
      * The value is chosen with the following precedence:
      * <ol>
      * <li>a node without attributes yields its text content;</li>
      * <li>if {@code DomUtils.findAllByClassName(node, "value")} returns any node, the concatenation of their text
      * contents, trimmed;</li>
      * <li><code>ABBR</code> with a <code>title</code> attribute yields the title;</li>
      * <li><code>A</code> with <code>rel="tag"</code> and an <code>href</code> attribute yields the rel-tag computed
      * from the href;</li>
      * <li><code>IMG</code> / <code>AREA</code> with an <code>alt</code> attribute yields the alt text;</li>
      * <li>anything else yields the text content of the node.</li>
      * </ol>
      *
      * @param node
      *            the node from which read the content.
      *
      * @return a valid TextField whose source is <code>node</code>.
      */
     public static TextField readTextField(Node node) {
         final NamedNodeMap attributes = node.getAttributes();
         // excess of safety check, should be impossible
         if (attributes == null) {
             return new TextField(node.getTextContent(), node);
         }
         // first check if there are values inside
         final List<Node> values = DomUtils.findAllByClassName(node, "value");
         if (!values.isEmpty()) {
             final StringBuilder val = new StringBuilder();
             for (Node n : values) {
                 val.append(n.getTextContent());
             }
             return new TextField(val.toString().trim(), node);
         }
         final String name = node.getNodeName();
         if ("ABBR".equals(name)) {
             final Node title = attributes.getNamedItem("title");
             if (title != null) {
                 return new TextField(title.getNodeValue(), node);
             }
         } else if ("A".equals(name)) {
             // A rel="tag" anchor without href used to fail with a NullPointerException; such an anchor is
             // now read like any other anchor, i.e. through its text content.
             if (DomUtils.hasAttribute(node, "rel", "tag") && attributes.getNamedItem("href") != null) {
                 return new TextField(extractRelTag(attributes), node);
             }
         } else if ("IMG".equals(name) || "AREA".equals(name)) {
             final Node alt = attributes.getNamedItem("alt");
             if (alt != null) {
                 return new TextField(alt.getNodeValue(), node);
             }
         }
         // Default for every case not handled above.
         return new TextField(node.getTextContent(), node);
     }

     /**
      * Reads an URL field from the given node adding the content to the given <i>res</i> list.
      * <p>
      * The attribute holding the URL depends on the node name: <code>href</code> for <code>A</code> and
      * <code>AREA</code>, <code>title</code> for <code>ABBR</code>, <code>src</code> (or, when absent, the first
      * entry of <code>srcset</code>) for <code>IMG</code>, and <code>data</code> for <code>OBJECT</code>. When the
      * expected attribute is missing nothing is added. For any other node the hCard text content is added, and a
      * node without attributes contributes its plain text content.
      *
      * @param res
      *            {@link java.util.List} of {@link org.apache.any23.extractor.html.HTMLDocument.TextField}
      * @param node
      *            the node to read
      */
     public static void readUrlField(List<TextField> res, Node node) {
         final String name = node.getNodeName();
         final NamedNodeMap attributes = node.getAttributes();
         if (null == attributes) {
             res.add(new TextField(node.getTextContent(), node));
             return;
         }
         if ("A".equals(name) || "AREA".equals(name)) {
             addAttributeField(res, attributes, "href");
         } else if ("ABBR".equals(name)) {
             addAttributeField(res, attributes, "title");
         } else if ("IMG".equals(name)) {
             if (!addAttributeField(res, attributes, "src")) {
                 final Node srcset = attributes.getNamedItem("srcset");
                 if (srcset != null) {
                     res.add(new TextField(firstSrcsetUrl(srcset.getNodeValue()), srcset));
                 }
             }
         } else if ("OBJECT".equals(name)) {
             addAttributeField(res, attributes, "data");
         } else {
             res.add(new TextField(extractHCardTextContent(node), node));
         }
     }

     /**
      * Appends the value of the named attribute, if present, using the attribute node as source.
      *
      * @return <code>true</code> if the attribute exists and a field has been added.
      */
     private static boolean addAttributeField(List<TextField> res, NamedNodeMap attributes, String attributeName) {
         final Node attribute = attributes.getNamedItem(attributeName);
         if (attribute == null) {
             return false;
         }
         res.add(new TextField(attribute.getNodeValue(), attribute));
         return true;
     }

     /**
      * Returns the first non-empty token of a <code>srcset</code> value split on whitespace and commas, or the
      * empty string when there is none. Skipping empty tokens matters because {@link String#split(String)} yields
      * an empty first element when the value starts with a separator and an empty array when the value consists
      * only of separators.
      */
     private static String firstSrcsetUrl(String srcset) {
         for (String token : srcset.split("[\\s,]+")) {
             if (!token.isEmpty()) {
                 return token;
             }
         }
         return "";
     }

     /**
      * Computes the text of a node for the fallback branch of {@link #readUrlField(List, Node)}: the text found in
      * nodes carrying the class <code>value</code> or, when there is no such node, the text outside nodes carrying
      * the class <code>type</code>.
      */
     private static String extractHCardTextContent(Node node) {
         final NodeList children = node.getChildNodes();
         final StringBuilder text = new StringBuilder();
         final int valueNodes = extractTextInValue(children, text);
         if (valueNodes == 0) {
             // nothing marked as 'value': collect all text not in elements with 'type' class
             extractTextNotInType(children, text);
         }
         return text.toString();
     }

     /**
      * Walks the given nodes depth-first and appends the trimmed text content of every node that has the class
      * <code>value</code>; the subtree below such a node is not searched again.
      *
      * @return the number of <code>value</code> nodes whose text has been appended.
      */
     private static int extractTextInValue(NodeList nodes, StringBuilder b) {
         final int size = nodes.getLength();
         int found = 0;
         int index = 0;
         while (index < size) {
             final Node current = nodes.item(index++);
             if (!DomUtils.hasClassName(current, "value")) {
                 // Not a value node itself: keep looking among its children.
                 found += extractTextInValue(current.getChildNodes(), b);
                 continue;
             }
             found++;
             b.append(current.getTextContent().trim());
         }
         return found;
     }

     /**
      * Walks the given nodes depth-first and appends the trimmed value of every text node, skipping the whole
      * subtree of any non-text node that has the class <code>type</code>.
      */
     private static void extractTextNotInType(NodeList nodes, StringBuilder b) {
         final int size = nodes.getLength();
         for (int index = 0; index < size; index++) {
             final Node current = nodes.item(index);
             if (current.getNodeType() == Node.TEXT_NODE) {
                 b.append(current.getNodeValue().trim());
                 continue;
             }
             if (DomUtils.hasClassName(current, "type")) {
                 continue;
             }
             extractTextNotInType(current.getChildNodes(), b);
         }
     }

     /**
      * Extracts the href specific rel-tag string. See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a>
      * specification.
      * <p>
      * The result is the part of the href that precedes the first <code>#</code> or <code>?</code>, with a single
      * trailing <code>/</code> removed. An href that is empty or starts with <code>#</code> / <code>?</code> yields
      * the empty string.
      *
      * @param hrefAttributeContent
      *            the content of the <i>href</i> attribute, must not be <code>null</code>.
      *
      * @return the rel-tag specification.
      */
     public static String extractRelTag(String hrefAttributeContent) {
         // Locate the end of the path by hand: String.split would return an empty first element (or an empty
         // array) for values such as "#top" or "#", which previously caused an index exception.
         int end = hrefAttributeContent.length();
         for (int i = 0; i < end; i++) {
             final char c = hrefAttributeContent.charAt(i);
             if (c == '#' || c == '?') {
                 end = i;
                 break;
             }
         }
         String path = hrefAttributeContent.substring(0, end);
         // Cleanup spurious segments.
         if (path.endsWith("/")) {
             path = path.substring(0, path.length() - 1);
         }
         return path;
     }

     /**
      * Extracts the rel-tag string from the <i>href</i> attribute found in the given attribute map. See the
      * <a href="http://microformats.org/wiki/rel-tag">rel-tag</a> specification.
      *
      * @param attributes
      *            the list of attributes of a node; it is dereferenced without a null check, and so is the
      *            <i>href</i> item looked up in it.
      *
      * @return the rel-tag specification.
      */
     public static String extractRelTag(NamedNodeMap attributes) {
         final Node hrefAttribute = attributes.getNamedItem("href");
         final String hrefValue = hrefAttribute.getNodeValue();
         return extractRelTag(hrefValue);
     }

     /**
      * Returns the text content of the given node, optionally cleaned up.
      *
      * @param node
      *            node to read content.
      * @param prettify
      *            if <code>true</code> the text is trimmed, every newline is replaced by a space and runs of spaces
      *            are collapsed into a single space; if <code>false</code> the text is returned untouched.
      *
      * @return the read text.
      */
     public static String readNodeContent(Node node, boolean prettify) {
         final String raw = node.getTextContent();
         if (!prettify) {
             return raw;
         }
         final String trimmed = raw.trim();
         final String singleLine = trimmed.replaceAll("\\n", " ");
         return singleLine.replaceAll(" +", " ");
     }

     /**
      * Creates a wrapper around the given root node.
      *
      * @param document
      *            a {@link org.w3c.dom.Node}, must not be <code>null</code>.
      *
      * @throws IllegalArgumentException
      *             if <code>document</code> is <code>null</code>.
      */
     public HTMLDocument(Node document) {
         if (document == null) {
             throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
         }
         this.document = document;
     }

     /**
      * Resolves the given string against the base IRI of this document.
      *
      * @param uri
      *            string to resolve to {@link org.eclipse.rdf4j.model.IRI}
      *
      * @return An absolute IRI, or null if the IRI is not fixable
      *
      * @throws org.apache.any23.extractor.ExtractionException
      *             If the base IRI is invalid
      */
     public IRI resolveIRI(String uri) throws ExtractionException {
         final java.net.URI base = getBaseIRI();
         return valueFactory.resolveIRI(uri, base);
     }

     /**
      * Delegates to {@code DomUtils.find} using the wrapped node as context.
      *
      * @param xpath
      *            the XPath expression handed to {@code DomUtils.find}.
      *
      * @return whatever {@code DomUtils.find} returns for the wrapped node.
      */
     public String find(String xpath) {
         final Node root = getDocument();
         return DomUtils.find(root, xpath);
     }

     /**
      * Delegates to {@code DomUtils.findNodeById} using the wrapped node as context.
      *
      * @param id
      *            the identifier handed to {@code DomUtils.findNodeById}.
      *
      * @return whatever {@code DomUtils.findNodeById} returns for the wrapped node.
      */
     public Node findNodeById(String id) {
         final Node root = getDocument();
         return DomUtils.findNodeById(root, id);
     }

     /**
      * Delegates to {@code DomUtils.findAll} using the wrapped node as context.
      *
      * @param xpath
      *            the XPath expression handed to {@code DomUtils.findAll}.
      *
      * @return whatever {@code DomUtils.findAll} returns for the wrapped node.
      */
     public List<Node> findAll(String xpath) {
         final Node root = getDocument();
         return DomUtils.findAll(root, xpath);
     }

     /**
      * Looks up a value inside the first microformat object node matching <code>objectTag</code> and
      * <code>object</code>.
      * <p>
      * If the object node itself has the class <code>field</code> its text content is returned. Otherwise the XPath
      * <code>.//fieldTag[contains(@class, 'field')]/key</code> is evaluated as a string relative to the object node.
      * Note that <code>contains()</code> is a substring test on the whole class attribute.
      *
      * @param objectTag
      *            tag name of the object node.
      * @param object
      *            class name of the object node.
      * @param fieldTag
      *            tag name of the field node.
      * @param field
      *            class name of the field node; it is pasted into the XPath expression between single quotes.
      * @param key
      *            last step of the XPath expression, appended as-is.
      *
      * @return the value found, or the empty string when no object node exists.
      */
     public String findMicroformattedValue(String objectTag, String object, String fieldTag, String field, String key) {
         final Node objectNode = findMicroformattedObjectNode(objectTag, object);
         if (objectNode == null) {
             return "";
         }
         // try to check if it is inline
         if (DomUtils.hasClassName(objectNode, field)) {
             return objectNode.getTextContent();
         }

         // failed, try to find it in a child
         final String expression = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key;
         final Object evaluated;
         try {
             evaluated = xPathEngine.evaluate(expression, objectNode, XPathConstants.STRING);
         } catch (XPathExpressionException ex) {
             throw new RuntimeException("Should not happen, XPath expression is built locally", ex);
         }
         final String value = (String) evaluated;
         return value == null ? "" : value;
     }

     /**
      * Returns the node wrapped by this instance.
      *
      * @return the node passed to the constructor.
      */
     public Node getDocument() {
         return this.document;
     }

     /**
      * Returns a singular text field.
      *
      * @param className
      *            name of class containing text.
      *
      * @return the first field returned by {@link #getPluralTextField(String)}, or a field with an empty value and a
      *         <code>null</code> source when there is none. Use the plural finder to detect n-ary values.
      */
     public TextField getSingularTextField(String className) {
         final TextField[] fields = getPluralTextField(className);
         return fields.length != 0 ? fields[0] : new TextField("", null);
     }

     /**
      * Returns a plural text field: one {@link TextField} for every node that
      * {@code DomUtils.findAllByClassName} finds for the given class name.
      *
      * @param className
      *            name of class node containing text.
      *
      * @return list of fields.
      */
     public TextField[] getPluralTextField(String className) {
         final List<Node> matches = DomUtils.findAllByClassName(getDocument(), className);
         final TextField[] fields = new TextField[matches.size()];
         int next = 0;
         for (Node match : matches) {
             fields[next++] = readTextField(match);
         }
         return fields;
     }

     /**
      * Returns the URL associated to the field marked with class <i>className</i>.
      *
      * @param className
      *            name of node class containing the URL field.
      *
      * @return the first field returned by {@link #getPluralUrlField(String)}, or a field with an empty value and a
      *         <code>null</code> source when there is none. Use the plural finder to detect n-ary values.
      */
     public TextField getSingularUrlField(String className) {
         final TextField[] fields = getPluralUrlField(className);
         return fields.length >= 1 ? fields[0] : new TextField("", null);
     }

     /**
      * Returns the list of URLs associated to the fields marked with class <i>className</i>. Every matching node is
      * handed to {@link #readUrlField(List, Node)}, which may or may not contribute a field.
      *
      * @param className
      *            name of node class containing the URL field.
      *
      * @return the list of {@link HTMLDocument.TextField} found.
      */
     public TextField[] getPluralUrlField(String className) {
         final List<TextField> fields = new ArrayList<>();
         for (Node match : DomUtils.findAllByClassName(getDocument(), className)) {
             readUrlField(fields, match);
         }
         return fields.toArray(new TextField[0]);
     }

     /**
      * Returns the first node that {@code DomUtils.findAllByTagAndClassName} finds for the given tag and class name.
      *
      * @param objectTag
      *            tag name to look for.
      * @param name
      *            class name to look for.
      *
      * @return the first match, or <code>null</code> when there is none.
      */
     public Node findMicroformattedObjectNode(String objectTag, String name) {
         final List<Node> matches = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name);
         return matches.isEmpty() ? null : matches.get(0);
     }

     /**
      * Reads an attribute of the wrapped node by delegating to {@code DomUtils.readAttribute}; the intent is to
      * avoid NullPointerExceptions and obtain an empty string when the attribute is missing.
      *
      * @param attribute
      *            the attribute name.
      *
      * @return the string representing the attribute.
      */
     public String readAttribute(String attribute) {
         final Node root = getDocument();
         return DomUtils.readAttribute(root, attribute);
     }

     /**
      * Finds all the nodes by class name, delegating to {@code DomUtils.findAllByClassName} with the wrapped node
      * as context.
      *
      * @param clazz
      *            the class name.
      *
      * @return list of matching nodes.
      */
     public List<Node> findAllByClassName(String clazz) {
         final Node root = getDocument();
         return DomUtils.findAllByClassName(root, clazz);
     }

     /**
      * Returns the text of the wrapped node when its only child is a {@link Text} node, <code>null</code> otherwise.
      *
      * @return the text of a leaf node.
      */
     public String getText() {
         final NodeList children = getDocument().getChildNodes();
         if (children.getLength() != 1) {
             return null;
         }
         final Node onlyChild = children.item(0);
         return (onlyChild instanceof Text) ? onlyChild.getTextContent() : null;
     }

     /**
      * Returns the document default language, read from the <code>xml:lang</code> attribute of the node selected
      * by the XPath <code>/HTML</code>.
      *
      * @return default language if any, <code>null</code> otherwise.
      *
      * @throws IllegalStateException
      *             if the XPath expression cannot be evaluated; the original exception is attached as cause.
      */
     public String getDefaultLanguage() {
         final String xpathLanguageSelector = "/HTML";
         final Node html;
         try {
             html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE);
         } catch (XPathExpressionException ex) {
             // Keep message and cause: the exception used to be thrown empty, hiding what went wrong.
             throw new IllegalStateException("Error while evaluating XPath '" + xpathLanguageSelector + "'", ex);
         }
         if (html == null) {
             return null;
         }
         final Node langAttribute = html.getAttributes().getNamedItem("xml:lang");
         return langAttribute == null ? null : langAttribute.getTextContent();
     }

     /**
      * Returns the sequence of ancestors from the document root to the local root (document), as computed by
      * {@code DomUtils.getXPathListForNode}.
      *
      * @return a sequence of node names.
      */
     public String[] getPathToLocalRoot() {
         final String[] path = DomUtils.getXPathListForNode(this.document);
         return path;
     }

     /**
      * Extracts all the <code>rel</code> tag nodes: every node that {@code DomUtils.findAllByAttributeName} finds
      * for the attribute <code>rel</code> is read through {@link #readUrlField(List, Node)}.
      *
      * @return list of rel tag nodes.
      */
     public TextField[] extractRelTagNodes() {
         final List<TextField> fields = new ArrayList<>();
         for (Node relNode : DomUtils.findAllByAttributeName(getDocument(), "rel")) {
             readUrlField(fields, relNode);
         }
         return fields.toArray(new TextField[0]);
     }

     /**
      * Returns the base IRI of the wrapped document, computing and caching it on first use from the document URI
      * of the owning {@link Document}.
      *
      * @return the base IRI.
      *
      * @throws ExtractionException
      *             if the wrapped node has no owner document, the document URI is <code>null</code>, or the URI
      *             cannot be turned into a {@link java.net.URI}.
      */
     private java.net.URI getBaseIRI() throws ExtractionException {
         if (baseIRI != null) {
             return baseIRI;
         }

         // document.getBaseURI() returns null for document URIs with
         // special characters, e.g., http://semanticweb.org/wiki/Knud_Möller
         // It also does *not* take html "base" elements into account.
         // (But it does take into account urls specified by the attribute "xml:base".)

         // So, for now, let's use getDocumentURI() instead.
         // TODO: Make this approach better.

         final Document owner;
         if (document instanceof Document) {
             owner = (Document) document;
         } else {
             owner = document.getOwnerDocument();
         }
         if (owner == null) {
             throw new ExtractionException(
                     "Node " + document.getNodeName() + " was not associated with a document.");
         }

         final String documentUri = owner.getDocumentURI();
         if (documentUri == null) {
             throw new ExtractionException("document URI is null, this should not happen");
         }

         try {
             baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(documentUri));
         } catch (IllegalArgumentException | URISyntaxException ex) {
             throw new ExtractionException("Error in base IRI: " + documentUri, ex);
         }
         return baseIRI;
     }

     /**
      * This class represents a text extracted from the <i>HTML</i> DOM related to the node from which such text has
      * been retrieved. Instances are immutable.
      */
     public static class TextField {
         private final String value;
         private final Node source;

         /**
          * @param value
          *            the extracted text.
          * @param source
          *            the node the text has been read from; may be <code>null</code>.
          */
         public TextField(String value, Node source) {
             this.value = value;
             this.source = source;
         }

         /**
          * @return the extracted text.
          */
         public String value() {
             return value;
         }

         /**
          * @return the node the text has been read from.
          */
         public Node source() {
             return source;
         }
     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.any23.extractor.html;

	import org.apache.any23.extractor.ExtractionException;
	import org.apache.any23.rdf.Any23ValueFactoryWrapper;
	import org.apache.any23.rdf.RDFUtils;
	import org.eclipse.rdf4j.model.IRI;
	import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;
	import org.w3c.dom.Document;
	import org.w3c.dom.NamedNodeMap;
	import org.w3c.dom.Node;
	import org.w3c.dom.NodeList;
	import org.w3c.dom.Text;

	import javax.xml.xpath.XPath;
	import javax.xml.xpath.XPathConstants;
	import javax.xml.xpath.XPathExpressionException;
	import javax.xml.xpath.XPathFactory;
	import java.net.URISyntaxException;
	import java.util.ArrayList;
	import java.util.List;

	/**
	* A wrapper around the DOM representation of an HTML document. Provides convenience access to various parts of the
	* document.
	*
	* @author Gabriele Renzi
	* @author Michele Mostarda
	*/
	public class HTMLDocument {

	private final static XPath xPathEngine = XPathFactory.newInstance().newXPath();
	private final static Logger log = LoggerFactory.getLogger(HTMLDocument.class);

	private Node document;
	private java.net.URI baseIRI;

	private final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
	SimpleValueFactory.getInstance());

	/**
	* Reads a text field from the given node adding the content to the given <i>res</i> list.
	*
	* @param node
	* the node from which read the content.
	*
	* @return a valid TextField
	*/
	public static TextField readTextField(Node node) {
	TextField result;
	final String name = node.getNodeName();
	final NamedNodeMap attributes = node.getAttributes();
	// excess of safety check, should be impossible
	if (attributes == null) {
	return new TextField(node.getTextContent(), node);
	}
	// first check if there are values inside
	List<Node> values = DomUtils.findAllByClassName(node, "value");
	if (!values.isEmpty()) {
	StringBuilder val = new StringBuilder();
	for (Node n : values)
	val.append(n.getTextContent());
	return new TextField(val.toString().trim(), node);
	}
	if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) {
	result = new TextField(attributes.getNamedItem("title").getNodeValue(), node);
	} else if ("A".equals(name)) {
	if (DomUtils.hasAttribute(node, "rel", "tag")) {
	String href = extractRelTag(attributes);
	result = new TextField(href, node);
	} else
	result = new TextField(node.getTextContent(), node);
	} else if (("IMG".equals(name) \|\| "AREA".equals(name)) && (null != attributes.getNamedItem("alt"))) {
	result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node);
	} else {
	result = new TextField(node.getTextContent(), node);
	}
	return result;
	}

	/**
	* Reads an URL field from the given node adding the content to the given <i>res</i> list.
	*
	* @param res
	* {@link java.util.List} of {@link org.apache.any23.extractor.html.HTMLDocument.TextField}
	* @param node
	* the node to read
	*/
	public static void readUrlField(List<TextField> res, Node node) {
	String name = node.getNodeName();
	NamedNodeMap attributes = node.getAttributes();
	if (null == attributes) {
	res.add(new TextField(node.getTextContent(), node));
	return;
	}
	if ("A".equals(name) \|\| "AREA".equals(name)) {
	Node n = attributes.getNamedItem("href");
	if (n != null) {
	res.add(new TextField(n.getNodeValue(), n));
	}
	} else if ("ABBR".equals(name)) {
	Node n = attributes.getNamedItem("title");
	if (n != null) {
	res.add(new TextField(n.getNodeValue(), n));
	}
	} else if ("IMG".equals(name)) {
	Node n = attributes.getNamedItem("src");
	if (n != null) {
	res.add(new TextField(n.getNodeValue(), n));
	} else {
	n = attributes.getNamedItem("srcset");
	if (n != null) {
	res.add(new TextField(n.getNodeValue().split("[\\s,]+")[0], n));
	}
	}
	} else if ("OBJECT".equals(name)) {
	Node n = attributes.getNamedItem("data");
	if (n != null) {
	res.add(new TextField(n.getNodeValue(), n));
	}
	} else {
	res.add(new TextField(extractHCardTextContent(node), node));
	}
	}

	private static String extractHCardTextContent(Node node) {
	StringBuilder sb = new StringBuilder();
	NodeList nodes = node.getChildNodes();
	// if at least one element with 'value' class, concatenate all text in value
	if (extractTextInValue(nodes, sb) == 0) {
	// otherwise, concatenate all text not in elements with 'type' class
	extractTextNotInType(nodes, sb);
	}
	return sb.toString();
	}

	private static int extractTextInValue(NodeList nodes, StringBuilder b) {
	int count = 0;
	for (int i = 0, len = nodes.getLength(); i < len; i++) {
	Node n = nodes.item(i);
	if (DomUtils.hasClassName(n, "value")) {
	count++;
	b.append(n.getTextContent().trim());
	} else {
	count += extractTextInValue(n.getChildNodes(), b);
	}
	}
	return count;
	}

	private static void extractTextNotInType(NodeList nodes, StringBuilder b) {
	for (int i = 0, len = nodes.getLength(); i < len; i++) {
	Node n = nodes.item(i);
	if (n.getNodeType() == Node.TEXT_NODE) {
	b.append(n.getNodeValue().trim());
	} else if (!DomUtils.hasClassName(n, "type")) {
	extractTextNotInType(n.getChildNodes(), b);
	}
	}
	}

	/**
	* Extracts the href specific rel-tag string. See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a>
	* specification.
	*
	* @param hrefAttributeContent
	* the content of the <i>href</i> attribute.
	*
	* @return the rel-tag specification.
	*/
	public static String extractRelTag(String hrefAttributeContent) {
	String[] all = hrefAttributeContent.split("[#?]");
	// Cleanup spurious segments.
	String path = all[0];
	int pathLenghtMin1 = path.length() - 1;
	if ('/' == path.charAt(pathLenghtMin1)) {
	path = path.substring(0, pathLenghtMin1);
	}
	return path;
	}

	/**
	* Extracts the href specific rel-tag string. See the <a href="http://microformats.org/wiki/rel-tag">rel-tag</a>
	* specification.
	*
	* @param attributes
	* the list of attributes of a node.
	*
	* @return the rel-tag specification.
	*/
	public static String extractRelTag(NamedNodeMap attributes) {
	return extractRelTag(attributes.getNamedItem("href").getNodeValue());
	}

	/**
	* Reads the text content of the given node and returns it. If the <code>prettify</code> flag is <code>true</code>
	* the text is cleaned up.
	*
	* @param node
	* node to read content.
	* @param prettify
	* if <code>true</code> blank chars will be removed.
	*
	* @return the read text.
	*/
	public static String readNodeContent(Node node, boolean prettify) {
	final String content = node.getTextContent();
	return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content;
	}

	/**
	* Constructor accepting the root node.
	*
	* @param document
	* a {@link org.w3c.dom.Node}
	*/
	public HTMLDocument(Node document) {
	if (null == document)
	throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument");
	this.document = document;
	}

	/**
	* @param uri
	* string to resolve to {@link org.eclipse.rdf4j.model.IRI}
	*
	* @return An absolute IRI, or null if the IRI is not fixable
	*
	* @throws org.apache.any23.extractor.ExtractionException
	* If the base IRI is invalid
	*/
	public IRI resolveIRI(String uri) throws ExtractionException {
	return valueFactory.resolveIRI(uri, getBaseIRI());
	}

	public String find(String xpath) {
	return DomUtils.find(getDocument(), xpath);
	}

	public Node findNodeById(String id) {
	return DomUtils.findNodeById(getDocument(), id);
	}

	public List<Node> findAll(String xpath) {
	return DomUtils.findAll(getDocument(), xpath);
	}

	public String findMicroformattedValue(String objectTag, String object, String fieldTag, String field, String key) {
	Node node = findMicroformattedObjectNode(objectTag, object);
	if (null == node)
	return "";
	// try to check if it is inline
	if (DomUtils.hasClassName(node, field))
	return node.getTextContent();

	// failed, try to find it in a child
	try {
	String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key;
	String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING);
	if (null == value) {
	return "";
	}
	return value;
	} catch (XPathExpressionException ex) {
	throw new RuntimeException("Should not happen, XPath expression is built locally", ex);
	}

	}

	public Node getDocument() {
	return document;
	}

	/**
	* Returns a singular text field.
	*
	* @param className
	* name of class containing text.
	*
	* @return if multiple values are found just the first is returned, if we want to check that there are no n-ary
	* values use plural finder
	*/
	public TextField getSingularTextField(String className) {
	TextField[] res = getPluralTextField(className);
	if (res.length == 0)
	return new TextField("", null);
	return res[0];
	}

	/**
	* Returns a plural text field.
	*
	* @param className
	* name of class node containing text.
	*
	* @return list of fields.
	*/
	public TextField[] getPluralTextField(String className) {
	List<TextField> res = new ArrayList<TextField>();
	List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
	for (Node node : nodes) {
	res.add(readTextField(node));
	}
	return res.toArray(new TextField[res.size()]);
	}

	/**
	* Returns the URL associated to the field marked with class <i>className</i>.
	*
	* @param className
	* name of node class containing the URL field.
	*
	* @return if multiple values are found just the first is returned, if we want to check that there are no n-ary
	* values use plural finder
	*/
	public TextField getSingularUrlField(String className) {
	TextField[] res = getPluralUrlField(className);
	if (res.length < 1)
	return new TextField("", null);
	return res[0];
	}

	/**
	* Returns the list of URLs associated to the fields marked with class <i>className</i>.
	*
	* @param className
	* name of node class containing the URL field.
	*
	* @return the list of {@link HTMLDocument.TextField} found.
	*/
	public TextField[] getPluralUrlField(String className) {
	List<TextField> res = new ArrayList<TextField>();
	List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className);
	for (Node node : nodes)
	readUrlField(res, node);
	return res.toArray(new TextField[res.size()]);
	}

	public Node findMicroformattedObjectNode(String objectTag, String name) {
	List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name);
	if (nodes.isEmpty())
	return null;
	return nodes.get(0);
	}

	/**
	* Read an attribute avoiding NullPointerExceptions, if the attr is missing it just returns an empty string.
	*
	* @param attribute
	* the attribute name.
	*
	* @return the string representing the attribute.
	*/
	public String readAttribute(String attribute) {
	return DomUtils.readAttribute(getDocument(), attribute);
	}

	/**
	* Finds all the nodes by class name.
	*
	* @param clazz
	* the class name.
	*
	* @return list of matching nodes.
	*/
	public List<Node> findAllByClassName(String clazz) {
	return DomUtils.findAllByClassName(getDocument(), clazz);
	}

	/**
	* Returns the text contained inside a node if leaf, <code>null</code> otherwise.
	*
	* @return the text of a leaf node.
	*/
	public String getText() {
	NodeList children = getDocument().getChildNodes();
	if (children.getLength() == 1 && children.item(0) instanceof Text) {
	return children.item(0).getTextContent();
	}
	return null;
	}

	/**
	* Returns the document default language.
	*
	* @return default language if any, <code>null</code> otherwise.
	*/
	public String getDefaultLanguage() {
	final String xpathLanguageSelector = "/HTML";
	Node html;
	try {
	html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE);
	} catch (XPathExpressionException xpeee) {
	throw new IllegalStateException();
	}
	if (html == null) {
	return null;
	}
	Node langAttribute = html.getAttributes().getNamedItem("xml:lang");
	return langAttribute == null ? null : langAttribute.getTextContent();
	}

	/**
	* Returns the sequence of ancestors from the document root to the local root (document).
	*
	* @return a sequence of node names.
	*/
	public String[] getPathToLocalRoot() {
	return DomUtils.getXPathListForNode(document);
	}

	/**
	* Extracts all the <code>rel</code> tag nodes.
	*
	* @return list of rel tag nodes.
	*/
	public TextField[] extractRelTagNodes() {
	final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel");
	final List<TextField> result = new ArrayList<TextField>();
	for (Node relTagNode : relTagNodes) {
	readUrlField(result, relTagNode);
	}
	return result.toArray(new TextField[result.size()]);
	}

	private java.net.URI getBaseIRI() throws ExtractionException {
	if (baseIRI == null) {
	// document.getBaseURI() returns null for document URIs with
	// special characters, e.g., http://semanticweb.org/wiki/Knud_Möller
	// It also does not take html "base" elements into account.
	// (But it does take into account urls specified by the attribute "xml:base".)

	// So, for now, let's use getDocumentURI() instead.
	// TODO: Make this approach better.

	Document doc = document instanceof Document ? (Document) document : document.getOwnerDocument();

	if (doc == null) {
	throw new ExtractionException(
	"Node " + document.getNodeName() + " was not associated with a document.");
	}

	String uri = doc.getDocumentURI();

	if (uri == null) {
	throw new ExtractionException("document URI is null, this should not happen");
	}

	try {
	baseIRI = new java.net.URI(RDFUtils.fixAbsoluteIRI(uri));
	} catch (IllegalArgumentException ex) {
	throw new ExtractionException("Error in base IRI: " + uri, ex);
	} catch (URISyntaxException ex) {
	throw new ExtractionException("Error in base IRI: " + uri, ex);
	}
	}
	return baseIRI;
	}

	/**
	* This class represents a text extracted from the <i>HTML</i> DOM related to the node from which such test has been
	* retrieved.
	*/
	public static class TextField {
	private String value;
	private Node source;

	public TextField(String value, Node source) {
	this.value = value;
	this.source = source;
	}

	public String value() {
	return value;
	}

	public Node source() {
	return source;
	}
	}

	}