// blob: a9aa0e43d7162a9a2384ea7fccd6d57f6702e347 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.html;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
import org.apache.nutch.util.URLUtil;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
/**
 * A collection of methods for extracting content from DOM trees.
 *
 * <p>
 * This class holds a few utility methods for pulling content out of DOM nodes:
 * plain text ({@link #getText(StringBuffer, Node)}), the document title
 * ({@link #getTitle(StringBuffer, Node)}), the base URL
 * ({@link #getBase(Node)}) and outlinks
 * ({@link #getOutlinks(URL, ArrayList, Node)}).
 *
 * <p>
 * Which elements produce outlinks, and which elements introduce line breaks in
 * the extracted text, is driven by the {@link Configuration} passed to
 * {@link #setConf(Configuration)}.
 */
public class DOMContentUtils {

  /** Matches a run of whitespace; compiled once instead of per text node. */
  private static final Pattern WHITESPACE = Pattern.compile("\\s+");

  /** Metadata key under which the source element name is stored on outlinks. */
  private String srcTagMetaName;

  /** True when {@link #srcTagMetaName} is configured with a non-empty value. */
  private boolean keepNodenames;

  /** Lower-case element names that cause a line break in extracted text. */
  private Set<String> blockNodes;

  /**
   * Describes one kind of link-bearing element: the element name, the
   * attribute holding the link target, and the expected number of children
   * (used by the empty-link heuristics: 0 means the element is accepted even
   * when it has no children).
   */
  public static class LinkParams {
    public String elName;
    public String attrName;
    public int childLen;

    public LinkParams(String elName, String attrName, int childLen) {
      this.elName = elName;
      this.attrName = attrName;
      this.childLen = childLen;
    }

    @Override
    public String toString() {
      return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
    }
  }

  /** Link-bearing elements keyed by lower-case element name. */
  private final HashMap<String, LinkParams> linkParams = new HashMap<String, LinkParams>();

  private Configuration conf;

  /**
   * Creates the utility and configures it via {@link #setConf(Configuration)}.
   *
   * @param conf
   *          configuration to read the parser settings from
   */
  public DOMContentUtils(Configuration conf) {
    setConf(conf);
  }

  /**
   * (Re)initialises the link element table, the outlink metadata setting and
   * the set of block elements from the given configuration.
   *
   * <p>
   * Properties read: <code>parser.html.form.use_action</code>,
   * <code>parser.html.outlinks.ignore_tags</code>,
   * <code>parser.html.outlinks.htmlnode_metadata_name</code> and
   * <code>parser.html.line.separators</code>.
   *
   * @param conf
   *          configuration to read the parser settings from
   */
  public void setConf(Configuration conf) {
    // forceTags is used to override configurable tag ignoring, later on
    Collection<String> forceTags = new ArrayList<String>(1);
    this.conf = conf;

    linkParams.clear();
    linkParams.put("a", new LinkParams("a", "href", 1));
    linkParams.put("area", new LinkParams("area", "href", 0));
    if (conf.getBoolean("parser.html.form.use_action", true)) {
      linkParams.put("form", new LinkParams("form", "action", 1));
      // only an explicitly set property protects "form" from ignore_tags
      if (conf.get("parser.html.form.use_action") != null) {
        forceTags.add("form");
      }
    }
    linkParams.put("frame", new LinkParams("frame", "src", 0));
    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
    linkParams.put("script", new LinkParams("script", "src", 0));
    linkParams.put("link", new LinkParams("link", "href", 0));
    linkParams.put("img", new LinkParams("img", "src", 0));
    linkParams.put("source", new LinkParams("source", "src", 0));

    // remove unwanted link tags from the linkParams map
    String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
    if (ignoreTags != null) {
      for (String ignoreTag : ignoreTags) {
        if (!forceTags.contains(ignoreTag)) {
          linkParams.remove(ignoreTag);
        }
      }
    }

    // NUTCH-2433 - Should we keep the html node where the outlinks are found?
    srcTagMetaName = this.conf
        .get("parser.html.outlinks.htmlnode_metadata_name");
    keepNodenames = (srcTagMetaName != null && srcTagMetaName.length() > 0);

    blockNodes = new HashSet<>(
        conf.getTrimmedStringCollection("parser.html.line.separators"));
  }

  /**
   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
   * append all the content text found beneath the DOM node to the
   * <code>StringBuffer</code>. Content of <code>script</code> and
   * <code>style</code> elements and of comments is skipped.
   *
   * <p>
   * If <code>abortOnNestedAnchors</code> is true, every <code>a</code> element
   * visited during the traversal is counted and the traversal is aborted as
   * soon as a second one is seen. Text collected before that point stays in
   * the <code>StringBuffer</code>; text after it is not added.
   *
   * @param sb
   *          buffer the text is appended to
   * @param node
   *          root of the DOM subtree to extract text from
   * @param abortOnNestedAnchors
   *          whether to stop at the second anchor element
   * @return true if the traversal was aborted because of nested anchors
   */
  public boolean getText(StringBuffer sb, Node node,
      boolean abortOnNestedAnchors) {
    return getTextHelper(sb, node, abortOnNestedAnchors, 0);
  }

  /**
   * This is a convenience method, equivalent to
   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
   *
   * @param sb
   *          buffer the text is appended to
   * @param node
   *          root of the DOM subtree to extract text from
   */
  public void getText(StringBuffer sb, Node node) {
    getText(sb, node, false);
  }

  /**
   * Walks the subtree below <code>node</code> and appends its text to
   * <code>sb</code>.
   *
   * @return true if abortOnNestedAnchors is true and nested anchors were found
   */
  private boolean getTextHelper(StringBuffer sb, Node node,
      boolean abortOnNestedAnchors, int anchorDepth) {
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      short nodeType = currentNode.getNodeType();

      // break the line when entering a block element or right after one
      Node previousSibling = currentNode.getPreviousSibling();
      if ((previousSibling != null && isBlockNode(previousSibling.getNodeName()))
          || isBlockNode(nodeName)) {
        appendParagraphSeparator(sb);
      }

      if ("script".equalsIgnoreCase(nodeName)
          || "style".equalsIgnoreCase(nodeName)) {
        walker.skipChildren();
      }

      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
        anchorDepth++;
        if (anchorDepth > 1) {
          return true;
        }
      }

      if (nodeType == Node.COMMENT_NODE) {
        walker.skipChildren();
      }

      if (nodeType == Node.TEXT_NODE) {
        // collapse whitespace runs and trim the value
        String text = WHITESPACE.matcher(currentNode.getNodeValue())
            .replaceAll(" ").trim();
        if (text.length() > 0) {
          appendSpace(sb);
          sb.append(text);
        } else {
          // whitespace-only text node is treated as a line break
          appendParagraphSeparator(sb);
        }
      }
    }
    return false;
  }

  /**
   * Tells whether the element name is one of the configured block elements.
   * The name is lower-cased with {@link Locale#ROOT} so the lookup does not
   * depend on the JVM default locale (e.g. Turkish dotless i).
   */
  private boolean isBlockNode(String nodeName) {
    return blockNodes.contains(nodeName.toLowerCase(Locale.ROOT));
  }

  /**
   * Conditionally append a paragraph/line break to StringBuffer unless the
   * last character already indicates a paragraph break. Also remove trailing
   * spaces before the paragraph break. Nothing is appended to a buffer that is
   * empty or consists only of spaces (the spaces are still removed).
   *
   * @param buffer
   *          StringBuffer to append paragraph break
   */
  private void appendParagraphSeparator(StringBuffer buffer) {
    int end = buffer.length();
    // remove white space before paragraph break; the bound check keeps a
    // buffer made only of spaces from running past index 0
    while (end > 0 && buffer.charAt(end - 1) == ' ') {
      end--;
    }
    buffer.setLength(end);
    if (end > 0 && buffer.charAt(end - 1) != '\n') {
      buffer.append('\n');
    }
  }

  /**
   * Conditionally append a space to StringBuffer unless last character is a
   * space or line/paragraph break. An empty buffer is left untouched.
   *
   * @param buffer
   *          StringBuffer to append space
   */
  private void appendSpace(StringBuffer buffer) {
    if (buffer.length() == 0) {
      return;
    }
    char lastChar = buffer.charAt(buffer.length() - 1);
    if (' ' != lastChar && '\n' != lastChar) {
      buffer.append(' ');
    }
  }

  /**
   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
   * append the content text found beneath the first <code>title</code> node to
   * the <code>StringBuffer</code>. The search stops at the first node named
   * <code>body</code>.
   *
   * @param sb
   *          buffer the title text is appended to
   * @param node
   *          root of the DOM subtree to search
   * @return true if a title node was found, false otherwise
   */
  public boolean getTitle(StringBuffer sb, Node node) {
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
      Node currentNode = walker.nextNode();
      String nodeName = currentNode.getNodeName();
      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
        return false;
      }
      if (currentNode.getNodeType() == Node.ELEMENT_NODE
          && "title".equalsIgnoreCase(nodeName)) {
        getText(sb, currentNode);
        return true;
      }
    }
    return false;
  }

  /**
   * If Node contains a BASE tag then its HREF is returned. The search stops at
   * the first <code>body</code> element.
   *
   * @param node
   *          root of the DOM subtree to search
   * @return value of the first <code>href</code> attribute found on a
   *         <code>base</code> element, or null if there is none
   */
  public String getBase(Node node) {
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
      Node currentNode = walker.nextNode();
      if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }
      String nodeName = currentNode.getNodeName();
      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
        return null;
      }
      // is this node a BASE tag?
      if ("base".equalsIgnoreCase(nodeName)) {
        NamedNodeMap attrs = currentNode.getAttributes();
        for (int i = 0; i < attrs.getLength(); i++) {
          Node attr = attrs.item(i);
          if ("href".equalsIgnoreCase(attr.getNodeName())) {
            return attr.getNodeValue();
          }
        }
      }
    }
    // no.
    return null;
  }

  /** Returns true if the node value consists solely of whitespace. */
  private boolean hasOnlyWhiteSpace(Node node) {
    String val = node.getNodeValue();
    for (int i = 0; i < val.length(); i++) {
      if (!Character.isWhitespace(val.charAt(i))) {
        return false;
      }
    }
    return true;
  }

  /** Returns true if the node is a text node holding only whitespace. */
  private boolean isBlankText(Node node) {
    return node.getNodeType() == Node.TEXT_NODE && hasOnlyWhiteSpace(node);
  }

  /** Returns true if the node is an element of the same kind as the link. */
  private boolean isSameLinkElement(Node node, LinkParams params) {
    return node.getNodeType() == Node.ELEMENT_NODE
        && params.elName.equalsIgnoreCase(node.getNodeName());
  }

  /**
   * Decides whether a link element should be discarded as an empty link.
   *
   * <p>
   * This only covers a few cases of empty links that are symptomatic of
   * nekohtml's DOM-fixup process: a link without children (unless its
   * {@link LinkParams#childLen} is 0), and a link whose only content is a
   * single nested link of the same kind, optionally preceded and/or followed
   * by a whitespace-only text node.
   */
  private boolean shouldThrowAwayLink(Node node, NodeList children,
      int childLen, LinkParams params) {
    switch (childLen) {
    case 0:
      // this has no inner structure
      return params.childLen != 0;
    case 1:
      // single nested link
      return isSameLinkElement(children.item(0), params);
    case 2: {
      Node c0 = children.item(0);
      Node c1 = children.item(1);
      // single link followed by whitespace node, or the other way round
      return (isSameLinkElement(c0, params) && isBlankText(c1))
          || (isSameLinkElement(c1, params) && isBlankText(c0));
    }
    case 3: {
      // single link surrounded by whitespace nodes
      return isSameLinkElement(children.item(1), params)
          && isBlankText(children.item(0)) && isBlankText(children.item(2));
    }
    default:
      return false;
    }
  }

  /**
   * This method finds all link elements (as configured via
   * {@link #setConf(Configuration)}) below the supplied DOM <code>node</code>,
   * creates appropriate {@link Outlink} records for each (relative to the
   * supplied <code>base</code> URL), and adds them to the
   * <code>outlinks</code> {@link ArrayList}.
   *
   * <p>
   * Links without inner structure (tags, text, etc) are discarded, as are links
   * which contain only single nested links and empty text nodes (this is a
   * common DOM-fixup artifact, at least with nekohtml). Links whose
   * <code>rel</code> attribute contains the token <code>nofollow</code>, links
   * with <code>method="post"</code> and links whose target cannot be turned
   * into a valid URL are skipped as well.
   *
   * @param base
   *          URL that relative link targets are resolved against
   * @param outlinks
   *          list the extracted outlinks are added to
   * @param node
   *          root of the DOM subtree to search for links
   */
  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
      Node currentNode = walker.nextNode();
      if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }

      // Locale.ROOT: the lookup must not depend on the JVM default locale
      String nodeName = currentNode.getNodeName().toLowerCase(Locale.ROOT);
      LinkParams params = linkParams.get(nodeName);
      if (params == null) {
        continue;
      }

      NodeList children = currentNode.getChildNodes();
      int childLen = (children != null) ? children.getLength() : 0;
      if (shouldThrowAwayLink(currentNode, children, childLen, params)) {
        continue;
      }

      StringBuffer linkText = new StringBuffer();
      getText(linkText, currentNode, true);
      if (linkText.toString().trim().length() == 0) {
        // try harder - use img alt if present
        appendFallbackLinkText(linkText, currentNode);
      }

      NamedNodeMap attrs = currentNode.getAttributes();
      String target = null;
      boolean noFollow = false;
      boolean post = false;
      for (int i = 0; i < attrs.getLength(); i++) {
        Node attr = attrs.item(i);
        String attrName = attr.getNodeName();
        if (params.attrName.equalsIgnoreCase(attrName)) {
          target = attr.getNodeValue();
        } else if ("rel".equalsIgnoreCase(attrName)) {
          if (hasNoFollowToken(attr.getNodeValue())) {
            noFollow = true;
          }
        } else if ("method".equalsIgnoreCase(attrName)
            && "post".equalsIgnoreCase(attr.getNodeValue())) {
          post = true;
        }
      }

      if (target != null && !noFollow && !post) {
        try {
          URL url = URLUtil.resolveURL(base, target);
          Outlink outlink = new Outlink(url.toString(), linkText.toString()
              .trim());
          outlinks.add(outlink);
          // NUTCH-2433 - Keep the node name where the URL was found into
          // the outlink metadata
          if (keepNodenames) {
            MapWritable metadata = new MapWritable();
            metadata.put(new Text(srcTagMetaName), new Text(nodeName));
            outlink.setMetadata(metadata);
          }
        } catch (MalformedURLException ignored) {
          // best effort: a link that cannot be resolved is simply not reported
        }
      }

      // NOTE: elements registered with childLen == 0 are not expected to have
      // children, but the walker is not told to skip them, so any children
      // that do exist are still visited by this loop.
    }
  }

  /**
   * Fallback used when a link has no regular text: appends the
   * <code>alt</code> text of every <code>img</code> element and the raw
   * content of every text node below <code>linkNode</code>, separated by
   * single spaces.
   */
  private void appendFallbackLinkText(StringBuffer linkText, Node linkNode) {
    NodeWalker subWalker = new NodeWalker(linkNode);
    while (subWalker.hasNext()) {
      Node subNode = subWalker.nextNode();
      short subType = subNode.getNodeType();
      String addition = null;
      if (subType == Node.ELEMENT_NODE) {
        // other types of elements are ignored
        if ("img".equalsIgnoreCase(subNode.getNodeName())) {
          Node alt = subNode.getAttributes().getNamedItem("alt");
          if (alt != null) {
            String altTxt = alt.getTextContent();
            if (altTxt != null && altTxt.trim().length() > 0) {
              addition = altTxt;
            }
          }
        }
      } else if (subType == Node.TEXT_NODE) {
        String txt = subNode.getTextContent();
        if (txt != null && txt.length() > 0) {
          addition = txt;
        }
      }
      if (addition != null) {
        if (linkText.length() > 0) {
          linkText.append(' ');
        }
        linkText.append(addition);
      }
    }
  }

  /**
   * Returns true if the <code>rel</code> attribute value contains the token
   * <code>nofollow</code> (case-insensitive). The value is treated as a
   * whitespace-separated list, so <code>rel="nofollow noopener"</code> is
   * recognised too.
   */
  private static boolean hasNoFollowToken(String rel) {
    if (rel == null) {
      return false;
    }
    for (String token : WHITESPACE.split(rel.trim())) {
      if ("nofollow".equalsIgnoreCase(token)) {
        return true;
      }
    }
    return false;
  }
}