src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMContentUtils.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.parse.html;

 import java.net.URL;
 import java.net.MalformedURLException;
 import java.util.Collection;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Set;

 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.util.NodeWalker;
 import org.apache.nutch.util.URLUtil;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.MapWritable;
 import org.apache.hadoop.io.Text;

 /**
  * A collection of methods for extracting content from DOM trees.
  *
  * This class holds a few utility methods for pulling content out of DOM nodes,
  * such as getOutlinks, getText, etc.
  *
  */
 public class DOMContentUtils {

   /**
    * Metadata key under which the name of the element an outlink was found in
    * is stored; read from
    * <code>parser.html.outlinks.htmlnode_metadata_name</code>.
    */
   private String srcTagMetaName;

   /** True when {@link #srcTagMetaName} is set to a non-empty value. */
   private boolean keepNodenames;

   /**
    * Node names read from <code>parser.html.line.separators</code>; a line
    * break is emitted around these nodes when extracting text.
    */
   private Set<String> blockNodes;

   /**
    * Describes an element type that can carry a link: the element name, the
    * attribute holding the link target, and whether child nodes are expected.
    */
   public static class LinkParams {
     /** Element name, e.g. <code>a</code>. */
     public String elName;
     /** Attribute holding the link target, e.g. <code>href</code>. */
     public String attrName;
     /**
      * If 0, an element without child nodes is still accepted as a link;
      * otherwise a childless element is thrown away.
      */
     public int childLen;

     public LinkParams(String elName, String attrName, int childLen) {
       this.elName = elName;
       this.attrName = attrName;
       this.childLen = childLen;
     }

     @Override
     public String toString() {
       return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
     }
   }

   /** Link-carrying elements, keyed by lower-case element name. */
   private final HashMap<String, LinkParams> linkParams = new HashMap<>();
   private Configuration conf;

   public DOMContentUtils(Configuration conf) {
     setConf(conf);
   }

   /**
    * (Re-)initializes this instance from the given configuration: rebuilds the
    * table of link-carrying elements and reads the outlink metadata key and
    * the line-separator node names.
    *
    * @param conf
    *          configuration to read the <code>parser.html.*</code> properties
    *          from
    */
   public void setConf(Configuration conf) {
     // forceTags is used to override configurable tag ignoring, later on
     Collection<String> forceTags = new ArrayList<String>(1);

     this.conf = conf;
     linkParams.clear();
     linkParams.put("a", new LinkParams("a", "href", 1));
     linkParams.put("area", new LinkParams("area", "href", 0));
     if (conf.getBoolean("parser.html.form.use_action", true)) {
       linkParams.put("form", new LinkParams("form", "action", 1));
       // an explicitly configured property wins over the ignore list below
       if (conf.get("parser.html.form.use_action") != null) {
         forceTags.add("form");
       }
     }
     linkParams.put("frame", new LinkParams("frame", "src", 0));
     linkParams.put("iframe", new LinkParams("iframe", "src", 0));
     linkParams.put("script", new LinkParams("script", "src", 0));
     linkParams.put("link", new LinkParams("link", "href", 0));
     linkParams.put("img", new LinkParams("img", "src", 0));
     linkParams.put("source", new LinkParams("source", "src", 0));

     // remove unwanted link tags from the linkParams map
     String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
     if (ignoreTags != null) {
       for (String ignoreTag : ignoreTags) {
         if (!forceTags.contains(ignoreTag)) {
           linkParams.remove(ignoreTag);
         }
       }
     }

     // NUTCH-2433 - Should we keep the html node where the outlinks are found?
     srcTagMetaName = this.conf
         .get("parser.html.outlinks.htmlnode_metadata_name");
     keepNodenames = (srcTagMetaName != null && srcTagMetaName.length() > 0);
     blockNodes = new HashSet<>(
         conf.getTrimmedStringCollection("parser.html.line.separators"));
   }

   /**
    * Lower-cases a node name independently of the JVM default locale, so that
    * e.g. <code>IMG</code> maps to <code>img</code> even under a Turkish
    * default locale (where the default lower-casing of 'I' is a dotless i).
    */
   private static String lowerCase(String name) {
     return name.toLowerCase(java.util.Locale.ROOT);
   }

   /**
    * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
    * append all the content text found beneath the DOM node to the
    * <code>StringBuffer</code>.
    *
    * <p>
    *
    * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
    * and the <code>StringBuffer</code> will not contain any text encountered
    * after a nested anchor is found.
    *
    * @param sb
    *          buffer the extracted text is appended to
    * @param node
    *          root of the DOM subtree to extract text from
    * @param abortOnNestedAnchors
    *          whether to stop at the second anchor element encountered
    * @return true if nested anchors were found
    */
   public boolean getText(StringBuffer sb, Node node,
       boolean abortOnNestedAnchors) {
     return getTextHelper(sb, node, abortOnNestedAnchors, 0);
   }

   /**
    * This is a convenience method, equivalent to
    * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
    *
    */
   public void getText(StringBuffer sb, Node node) {
     getText(sb, node, false);
   }

   /**
    * Walks the subtree below <code>node</code> and appends its text to
    * <code>sb</code>, skipping the content of <code>script</code> and
    * <code>style</code> elements and of comments.
    *
    * <p>
    * <code>anchorDepth</code> is incremented for every <code>a</code> node
    * visited and never decremented, i.e. it counts anchors seen so far during
    * this walk.
    *
    * @return true if abortOnNestedAnchors is true and we find nested anchors
    */
   private boolean getTextHelper(StringBuffer sb, Node node,
       boolean abortOnNestedAnchors, int anchorDepth) {
     NodeWalker walker = new NodeWalker(node);

     while (walker.hasNext()) {

       Node currentNode = walker.nextNode();
       String nodeName = currentNode.getNodeName();
       short nodeType = currentNode.getNodeType();

       // break the line when entering a block node or the node following one
       Node previousSibling = currentNode.getPreviousSibling();
       if ((previousSibling != null
           && blockNodes.contains(lowerCase(previousSibling.getNodeName())))
           || blockNodes.contains(lowerCase(nodeName))) {
         appendParagraphSeparator(sb);
       }

       if ("script".equalsIgnoreCase(nodeName)
           || "style".equalsIgnoreCase(nodeName)) {
         walker.skipChildren();
       }
       if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
         anchorDepth++;
         if (anchorDepth > 1) {
           return true;
         }
       }
       if (nodeType == Node.COMMENT_NODE) {
         walker.skipChildren();
       }
       if (nodeType == Node.TEXT_NODE) {
         // collapse all whitespace runs and trim the value
         String text = currentNode.getNodeValue().replaceAll("\\s+", " ")
             .trim();
         if (text.length() > 0) {
           appendSpace(sb);
           sb.append(text);
         } else {
           // a whitespace-only text node is treated as a line break
           appendParagraphSeparator(sb);
         }
       }
     }

     return false;
   }

   /**
    * Conditionally append a paragraph/line break to StringBuffer unless the
    * last character already indicates a paragraph break. Also remove trailing
    * spaces before the paragraph break. Nothing is appended to a buffer that is
    * empty or consists of spaces only (the spaces are still removed).
    *
    * @param buffer
    *          StringBuffer to append paragraph break
    */
   private void appendParagraphSeparator(StringBuffer buffer) {
     int end = buffer.length();
     if (end == 0 || buffer.charAt(end - 1) == '\n') {
       return;
     }
     // remove white space before paragraph break; the bound on 'end' keeps
     // us from reading before index 0 when the buffer holds only spaces
     while (end > 0 && buffer.charAt(end - 1) == ' ') {
       end--;
     }
     buffer.setLength(end);
     if (end > 0 && buffer.charAt(end - 1) != '\n') {
       buffer.append('\n');
     }
   }

   /**
    * Conditionally append a space to StringBuffer unless last character is a
    * space or line/paragraph break. An empty buffer is left untouched.
    *
    * @param buffer
    *          StringBuffer to append space
    */
   private void appendSpace(StringBuffer buffer) {
     if (buffer.length() == 0) {
       return;
     }
     char lastChar = buffer.charAt(buffer.length() - 1);
     if (' ' != lastChar && '\n' != lastChar) {
       buffer.append(' ');
     }
   }

   /**
    * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
    * append the content text found beneath the first <code>title</code> node to
    * the <code>StringBuffer</code>. The search stops at the first node named
    * <code>body</code>.
    *
    * @return true if a title node was found, false otherwise
    */
   public boolean getTitle(StringBuffer sb, Node node) {

     NodeWalker walker = new NodeWalker(node);

     while (walker.hasNext()) {

       Node currentNode = walker.nextNode();
       String nodeName = currentNode.getNodeName();

       if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
         return false;
       }

       if (currentNode.getNodeType() == Node.ELEMENT_NODE
           && "title".equalsIgnoreCase(nodeName)) {
         getText(sb, currentNode);
         return true;
       }
     }

     return false;
   }

   /**
    * If Node contains a BASE tag then its HREF is returned.
    *
    * @return the value of the <code>href</code> attribute of the first
    *         <code>base</code> element that has one and is found before a
    *         <code>body</code> element, or null if there is none
    */
   public String getBase(Node node) {

     NodeWalker walker = new NodeWalker(node);

     while (walker.hasNext()) {

       Node currentNode = walker.nextNode();
       if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
         continue;
       }
       String nodeName = currentNode.getNodeName();

       if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
         return null;
       }

       // is this node a BASE tag?
       if ("base".equalsIgnoreCase(nodeName)) {
         NamedNodeMap attrs = currentNode.getAttributes();
         for (int i = 0; i < attrs.getLength(); i++) {
           Node attr = attrs.item(i);
           if ("href".equalsIgnoreCase(attr.getNodeName())) {
             return attr.getNodeValue();
           }
         }
       }
     }

     // no.
     return null;
   }

   /** Returns true if the node's value consists of whitespace only. */
   private boolean hasOnlyWhiteSpace(Node node) {
     String val = node.getNodeValue();
     for (int i = 0; i < val.length(); i++) {
       if (!Character.isWhitespace(val.charAt(i))) {
         return false;
       }
     }
     return true;
   }

   /** Returns true if the node is an element of the same type as the link. */
   private boolean isSameLinkElement(Node child, LinkParams params) {
     return child.getNodeType() == Node.ELEMENT_NODE
         && params.elName.equalsIgnoreCase(child.getNodeName());
   }

   /** Returns true if the node is a text node holding only whitespace. */
   private boolean isBlankText(Node child) {
     return child.getNodeType() == Node.TEXT_NODE && hasOnlyWhiteSpace(child);
   }

   /**
    * Decides whether a link element should be discarded. This only covers a
    * few cases of empty links that are symptomatic of nekohtml's DOM-fixup
    * process: a childless element that is expected to have children, and an
    * element whose only children are a single nested element of the same type
    * optionally surrounded by whitespace-only text nodes.
    *
    * @param node
    *          the link element (currently unused)
    * @param children
    *          child nodes of the link element
    * @param childLen
    *          number of child nodes
    * @param params
    *          parameters of the link element type
    * @return true if the link should be thrown away
    */
   private boolean shouldThrowAwayLink(Node node, NodeList children,
       int childLen, LinkParams params) {
     if (childLen == 0) {
       // this has no inner structure
       return params.childLen != 0;
     }
     if (childLen == 1) {
       // single nested link
       return isSameLinkElement(children.item(0), params);
     }
     if (childLen == 2) {
       Node c0 = children.item(0);
       Node c1 = children.item(1);
       // single link followed or preceded by a whitespace node
       return (isSameLinkElement(c0, params) && isBlankText(c1))
           || (isSameLinkElement(c1, params) && isBlankText(c0));
     }
     if (childLen == 3) {
       // single link surrounded by whitespace nodes
       return isSameLinkElement(children.item(1), params)
           && isBlankText(children.item(0)) && isBlankText(children.item(2));
     }
     return false;
   }

   /**
    * Returns true if the value of a <code>rel</code> attribute contains the
    * token <code>nofollow</code>. The value is treated as a list of
    * whitespace-separated tokens, compared case-insensitively, so that e.g.
    * <code>rel="nofollow noopener"</code> is recognized as well.
    */
   private static boolean hasNoFollowToken(String rel) {
     if (rel == null) {
       return false;
     }
     for (String token : rel.trim().split("\\s+")) {
       if ("nofollow".equalsIgnoreCase(token)) {
         return true;
       }
     }
     return false;
   }

   /**
    * Fallback for links without regular anchor text: appends the
    * <code>alt</code> text of nested <code>img</code> elements and the raw
    * content of nested text nodes, separated by single spaces.
    */
   private void appendFallbackLinkText(StringBuffer linkText, Node linkNode) {
     NodeWalker subWalker = new NodeWalker(linkNode);
     while (subWalker.hasNext()) {
       Node subNode = subWalker.nextNode();
       String extra = null;
       if (subNode.getNodeType() == Node.ELEMENT_NODE) {
         // only img elements contribute; other element types are ignored
         if ("img".equalsIgnoreCase(subNode.getNodeName())) {
           Node alt = subNode.getAttributes().getNamedItem("alt");
           if (alt != null) {
             String altTxt = alt.getTextContent();
             if (altTxt != null && altTxt.trim().length() > 0) {
               extra = altTxt;
             }
           }
         }
       } else if (subNode.getNodeType() == Node.TEXT_NODE) {
         String txt = subNode.getTextContent();
         if (txt != null && txt.length() > 0) {
           extra = txt;
         }
       }
       if (extra != null) {
         if (linkText.length() > 0) {
           linkText.append(' ');
         }
         linkText.append(extra);
       }
     }
   }

   /**
    * This method finds all anchors below the supplied DOM <code>node</code>, and
    * creates appropriate {@link Outlink} records for each (relative to the
    * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
    * {@link ArrayList}.
    *
    * <p>
    *
    * Links without inner structure (tags, text, etc) are discarded, as are links
    * which contain only single nested links and empty text nodes (this is a
    * common DOM-fixup artifact, at least with nekohtml).
    *
    * <p>
    *
    * Elements whose <code>rel</code> attribute contains <code>nofollow</code>
    * or whose <code>method</code> attribute is <code>post</code> are skipped,
    * and so are links whose target cannot be resolved to a valid URL.
    *
    * @param base
    *          URL that relative link targets are resolved against
    * @param outlinks
    *          list the extracted outlinks are added to
    * @param node
    *          root of the DOM subtree to search
    */
   public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {

     NodeWalker walker = new NodeWalker(node);
     while (walker.hasNext()) {

       Node currentNode = walker.nextNode();
       if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
         continue;
       }

       String nodeName = lowerCase(currentNode.getNodeName());
       LinkParams params = linkParams.get(nodeName);
       if (params == null) {
         continue;
       }

       // NOTE: the walker is never asked to skip the children of a link
       // element, so elements nested below it are examined as well.
       NodeList children = currentNode.getChildNodes();
       int childLen = (children != null) ? children.getLength() : 0;
       if (shouldThrowAwayLink(currentNode, children, childLen, params)) {
         continue;
       }

       StringBuffer linkText = new StringBuffer();
       getText(linkText, currentNode, true);
       if (linkText.toString().trim().length() == 0) {
         // try harder - use img alt if present
         appendFallbackLinkText(linkText, currentNode);
       }

       NamedNodeMap attrs = currentNode.getAttributes();
       String target = null;
       boolean noFollow = false;
       boolean post = false;
       for (int i = 0; i < attrs.getLength(); i++) {
         Node attr = attrs.item(i);
         String attrName = attr.getNodeName();
         if (params.attrName.equalsIgnoreCase(attrName)) {
           target = attr.getNodeValue();
         } else if ("rel".equalsIgnoreCase(attrName)) {
           if (hasNoFollowToken(attr.getNodeValue())) {
             noFollow = true;
           }
         } else if ("method".equalsIgnoreCase(attrName)
             && "post".equalsIgnoreCase(attr.getNodeValue())) {
           post = true;
         }
       }
       if (target == null || noFollow || post) {
         continue;
       }

       try {
         URL url = URLUtil.resolveURL(base, target);
         Outlink outlink = new Outlink(url.toString(), linkText.toString()
             .trim());
         outlinks.add(outlink);

         // NUTCH-2433 - Keep the node name where the URL was found into
         // the outlink metadata
         if (keepNodenames) {
           MapWritable metadata = new MapWritable();
           metadata.put(new Text(srcTagMetaName), new Text(nodeName));
           outlink.setMetadata(metadata);
         }
       } catch (MalformedURLException ignored) {
         // best effort: a link that cannot be resolved is simply skipped
       }
     }
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.parse.html;

	import java.net.URL;
	import java.net.MalformedURLException;
	import java.util.Collection;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Set;

	import org.apache.nutch.parse.Outlink;
	import org.apache.nutch.util.NodeWalker;
	import org.apache.nutch.util.URLUtil;
	import org.w3c.dom.NamedNodeMap;
	import org.w3c.dom.Node;
	import org.w3c.dom.NodeList;
	import org.apache.hadoop.conf.Configuration;
	import org.apache.hadoop.io.MapWritable;
	import org.apache.hadoop.io.Text;

	/**
	* A collection of methods for extracting content from DOM trees.
	*
	* This class holds a few utility methods for pulling content out of DOM nodes,
	* such as getOutlinks, getText, etc.
	*
	*/
	public class DOMContentUtils {

	  /**
	   * Metadata key under which the name of the element an outlink was found in
	   * is stored; read from
	   * <code>parser.html.outlinks.htmlnode_metadata_name</code>.
	   */
	  private String srcTagMetaName;

	  /** True when {@link #srcTagMetaName} is set to a non-empty value. */
	  private boolean keepNodenames;

	  /**
	   * Node names read from <code>parser.html.line.separators</code>; a line
	   * break is emitted around these nodes when extracting text.
	   */
	  private Set<String> blockNodes;

	  /**
	   * Describes an element type that can carry a link: the element name, the
	   * attribute holding the link target, and whether child nodes are expected.
	   */
	  public static class LinkParams {
	    /** Element name, e.g. <code>a</code>. */
	    public String elName;
	    /** Attribute holding the link target, e.g. <code>href</code>. */
	    public String attrName;
	    /**
	     * If 0, an element without child nodes is still accepted as a link;
	     * otherwise a childless element is thrown away.
	     */
	    public int childLen;

	    public LinkParams(String elName, String attrName, int childLen) {
	      this.elName = elName;
	      this.attrName = attrName;
	      this.childLen = childLen;
	    }

	    @Override
	    public String toString() {
	      return "LP[el=" + elName + ",attr=" + attrName + ",len=" + childLen + "]";
	    }
	  }

	  /** Link-carrying elements, keyed by lower-case element name. */
	  private final HashMap<String, LinkParams> linkParams = new HashMap<>();
	  private Configuration conf;

	  public DOMContentUtils(Configuration conf) {
	    setConf(conf);
	  }

	  /**
	   * (Re-)initializes this instance from the given configuration: rebuilds the
	   * table of link-carrying elements and reads the outlink metadata key and
	   * the line-separator node names.
	   *
	   * @param conf
	   *          configuration to read the <code>parser.html.*</code> properties
	   *          from
	   */
	  public void setConf(Configuration conf) {
	    // forceTags is used to override configurable tag ignoring, later on
	    Collection<String> forceTags = new ArrayList<String>(1);

	    this.conf = conf;
	    linkParams.clear();
	    linkParams.put("a", new LinkParams("a", "href", 1));
	    linkParams.put("area", new LinkParams("area", "href", 0));
	    if (conf.getBoolean("parser.html.form.use_action", true)) {
	      linkParams.put("form", new LinkParams("form", "action", 1));
	      // an explicitly configured property wins over the ignore list below
	      if (conf.get("parser.html.form.use_action") != null) {
	        forceTags.add("form");
	      }
	    }
	    linkParams.put("frame", new LinkParams("frame", "src", 0));
	    linkParams.put("iframe", new LinkParams("iframe", "src", 0));
	    linkParams.put("script", new LinkParams("script", "src", 0));
	    linkParams.put("link", new LinkParams("link", "href", 0));
	    linkParams.put("img", new LinkParams("img", "src", 0));
	    linkParams.put("source", new LinkParams("source", "src", 0));

	    // remove unwanted link tags from the linkParams map
	    String[] ignoreTags = conf.getStrings("parser.html.outlinks.ignore_tags");
	    if (ignoreTags != null) {
	      for (String ignoreTag : ignoreTags) {
	        if (!forceTags.contains(ignoreTag)) {
	          linkParams.remove(ignoreTag);
	        }
	      }
	    }

	    // NUTCH-2433 - Should we keep the html node where the outlinks are found?
	    srcTagMetaName = this.conf
	        .get("parser.html.outlinks.htmlnode_metadata_name");
	    keepNodenames = (srcTagMetaName != null && srcTagMetaName.length() > 0);
	    blockNodes = new HashSet<>(
	        conf.getTrimmedStringCollection("parser.html.line.separators"));
	  }

	  /**
	   * Lower-cases a node name independently of the JVM default locale, so that
	   * e.g. <code>IMG</code> maps to <code>img</code> even under a Turkish
	   * default locale (where the default lower-casing of 'I' is a dotless i).
	   */
	  private static String lowerCase(String name) {
	    return name.toLowerCase(java.util.Locale.ROOT);
	  }

	  /**
	   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
	   * append all the content text found beneath the DOM node to the
	   * <code>StringBuffer</code>.
	   *
	   * <p>
	   *
	   * If <code>abortOnNestedAnchors</code> is true, DOM traversal will be aborted
	   * and the <code>StringBuffer</code> will not contain any text encountered
	   * after a nested anchor is found.
	   *
	   * @param sb
	   *          buffer the extracted text is appended to
	   * @param node
	   *          root of the DOM subtree to extract text from
	   * @param abortOnNestedAnchors
	   *          whether to stop at the second anchor element encountered
	   * @return true if nested anchors were found
	   */
	  public boolean getText(StringBuffer sb, Node node,
	      boolean abortOnNestedAnchors) {
	    return getTextHelper(sb, node, abortOnNestedAnchors, 0);
	  }

	  /**
	   * This is a convenience method, equivalent to
	   * {@link #getText(StringBuffer,Node,boolean) getText(sb, node, false)}.
	   *
	   */
	  public void getText(StringBuffer sb, Node node) {
	    getText(sb, node, false);
	  }

	  /**
	   * Walks the subtree below <code>node</code> and appends its text to
	   * <code>sb</code>, skipping the content of <code>script</code> and
	   * <code>style</code> elements and of comments.
	   *
	   * <p>
	   * <code>anchorDepth</code> is incremented for every <code>a</code> node
	   * visited and never decremented, i.e. it counts anchors seen so far during
	   * this walk.
	   *
	   * @return true if abortOnNestedAnchors is true and we find nested anchors
	   */
	  private boolean getTextHelper(StringBuffer sb, Node node,
	      boolean abortOnNestedAnchors, int anchorDepth) {
	    NodeWalker walker = new NodeWalker(node);

	    while (walker.hasNext()) {

	      Node currentNode = walker.nextNode();
	      String nodeName = currentNode.getNodeName();
	      short nodeType = currentNode.getNodeType();

	      // break the line when entering a block node or the node following one
	      Node previousSibling = currentNode.getPreviousSibling();
	      if ((previousSibling != null
	          && blockNodes.contains(lowerCase(previousSibling.getNodeName())))
	          || blockNodes.contains(lowerCase(nodeName))) {
	        appendParagraphSeparator(sb);
	      }

	      if ("script".equalsIgnoreCase(nodeName)
	          || "style".equalsIgnoreCase(nodeName)) {
	        walker.skipChildren();
	      }
	      if (abortOnNestedAnchors && "a".equalsIgnoreCase(nodeName)) {
	        anchorDepth++;
	        if (anchorDepth > 1) {
	          return true;
	        }
	      }
	      if (nodeType == Node.COMMENT_NODE) {
	        walker.skipChildren();
	      }
	      if (nodeType == Node.TEXT_NODE) {
	        // collapse all whitespace runs and trim the value
	        String text = currentNode.getNodeValue().replaceAll("\\s+", " ")
	            .trim();
	        if (text.length() > 0) {
	          appendSpace(sb);
	          sb.append(text);
	        } else {
	          // a whitespace-only text node is treated as a line break
	          appendParagraphSeparator(sb);
	        }
	      }
	    }

	    return false;
	  }

	  /**
	   * Conditionally append a paragraph/line break to StringBuffer unless the
	   * last character already indicates a paragraph break. Also remove trailing
	   * spaces before the paragraph break. Nothing is appended to a buffer that is
	   * empty or consists of spaces only (the spaces are still removed).
	   *
	   * @param buffer
	   *          StringBuffer to append paragraph break
	   */
	  private void appendParagraphSeparator(StringBuffer buffer) {
	    int end = buffer.length();
	    if (end == 0 || buffer.charAt(end - 1) == '\n') {
	      return;
	    }
	    // remove white space before paragraph break; the bound on 'end' keeps
	    // us from reading before index 0 when the buffer holds only spaces
	    while (end > 0 && buffer.charAt(end - 1) == ' ') {
	      end--;
	    }
	    buffer.setLength(end);
	    if (end > 0 && buffer.charAt(end - 1) != '\n') {
	      buffer.append('\n');
	    }
	  }

	  /**
	   * Conditionally append a space to StringBuffer unless last character is a
	   * space or line/paragraph break. An empty buffer is left untouched.
	   *
	   * @param buffer
	   *          StringBuffer to append space
	   */
	  private void appendSpace(StringBuffer buffer) {
	    if (buffer.length() == 0) {
	      return;
	    }
	    char lastChar = buffer.charAt(buffer.length() - 1);
	    if (' ' != lastChar && '\n' != lastChar) {
	      buffer.append(' ');
	    }
	  }

	  /**
	   * This method takes a {@link StringBuffer} and a DOM {@link Node}, and will
	   * append the content text found beneath the first <code>title</code> node to
	   * the <code>StringBuffer</code>. The search stops at the first node named
	   * <code>body</code>.
	   *
	   * @return true if a title node was found, false otherwise
	   */
	  public boolean getTitle(StringBuffer sb, Node node) {

	    NodeWalker walker = new NodeWalker(node);

	    while (walker.hasNext()) {

	      Node currentNode = walker.nextNode();
	      String nodeName = currentNode.getNodeName();

	      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
	        return false;
	      }

	      if (currentNode.getNodeType() == Node.ELEMENT_NODE
	          && "title".equalsIgnoreCase(nodeName)) {
	        getText(sb, currentNode);
	        return true;
	      }
	    }

	    return false;
	  }

	  /**
	   * If Node contains a BASE tag then its HREF is returned.
	   *
	   * @return the value of the <code>href</code> attribute of the first
	   *         <code>base</code> element that has one and is found before a
	   *         <code>body</code> element, or null if there is none
	   */
	  public String getBase(Node node) {

	    NodeWalker walker = new NodeWalker(node);

	    while (walker.hasNext()) {

	      Node currentNode = walker.nextNode();
	      if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
	        continue;
	      }
	      String nodeName = currentNode.getNodeName();

	      if ("body".equalsIgnoreCase(nodeName)) { // stop after HEAD
	        return null;
	      }

	      // is this node a BASE tag?
	      if ("base".equalsIgnoreCase(nodeName)) {
	        NamedNodeMap attrs = currentNode.getAttributes();
	        for (int i = 0; i < attrs.getLength(); i++) {
	          Node attr = attrs.item(i);
	          if ("href".equalsIgnoreCase(attr.getNodeName())) {
	            return attr.getNodeValue();
	          }
	        }
	      }
	    }

	    // no.
	    return null;
	  }

	  /** Returns true if the node's value consists of whitespace only. */
	  private boolean hasOnlyWhiteSpace(Node node) {
	    String val = node.getNodeValue();
	    for (int i = 0; i < val.length(); i++) {
	      if (!Character.isWhitespace(val.charAt(i))) {
	        return false;
	      }
	    }
	    return true;
	  }

	  /** Returns true if the node is an element of the same type as the link. */
	  private boolean isSameLinkElement(Node child, LinkParams params) {
	    return child.getNodeType() == Node.ELEMENT_NODE
	        && params.elName.equalsIgnoreCase(child.getNodeName());
	  }

	  /** Returns true if the node is a text node holding only whitespace. */
	  private boolean isBlankText(Node child) {
	    return child.getNodeType() == Node.TEXT_NODE && hasOnlyWhiteSpace(child);
	  }

	  /**
	   * Decides whether a link element should be discarded. This only covers a
	   * few cases of empty links that are symptomatic of nekohtml's DOM-fixup
	   * process: a childless element that is expected to have children, and an
	   * element whose only children are a single nested element of the same type
	   * optionally surrounded by whitespace-only text nodes.
	   *
	   * @param node
	   *          the link element (currently unused)
	   * @param children
	   *          child nodes of the link element
	   * @param childLen
	   *          number of child nodes
	   * @param params
	   *          parameters of the link element type
	   * @return true if the link should be thrown away
	   */
	  private boolean shouldThrowAwayLink(Node node, NodeList children,
	      int childLen, LinkParams params) {
	    if (childLen == 0) {
	      // this has no inner structure
	      return params.childLen != 0;
	    }
	    if (childLen == 1) {
	      // single nested link
	      return isSameLinkElement(children.item(0), params);
	    }
	    if (childLen == 2) {
	      Node c0 = children.item(0);
	      Node c1 = children.item(1);
	      // single link followed or preceded by a whitespace node
	      return (isSameLinkElement(c0, params) && isBlankText(c1))
	          || (isSameLinkElement(c1, params) && isBlankText(c0));
	    }
	    if (childLen == 3) {
	      // single link surrounded by whitespace nodes
	      return isSameLinkElement(children.item(1), params)
	          && isBlankText(children.item(0)) && isBlankText(children.item(2));
	    }
	    return false;
	  }

	  /**
	   * Returns true if the value of a <code>rel</code> attribute contains the
	   * token <code>nofollow</code>. The value is treated as a list of
	   * whitespace-separated tokens, compared case-insensitively, so that e.g.
	   * <code>rel="nofollow noopener"</code> is recognized as well.
	   */
	  private static boolean hasNoFollowToken(String rel) {
	    if (rel == null) {
	      return false;
	    }
	    for (String token : rel.trim().split("\\s+")) {
	      if ("nofollow".equalsIgnoreCase(token)) {
	        return true;
	      }
	    }
	    return false;
	  }

	  /**
	   * Fallback for links without regular anchor text: appends the
	   * <code>alt</code> text of nested <code>img</code> elements and the raw
	   * content of nested text nodes, separated by single spaces.
	   */
	  private void appendFallbackLinkText(StringBuffer linkText, Node linkNode) {
	    NodeWalker subWalker = new NodeWalker(linkNode);
	    while (subWalker.hasNext()) {
	      Node subNode = subWalker.nextNode();
	      String extra = null;
	      if (subNode.getNodeType() == Node.ELEMENT_NODE) {
	        // only img elements contribute; other element types are ignored
	        if ("img".equalsIgnoreCase(subNode.getNodeName())) {
	          Node alt = subNode.getAttributes().getNamedItem("alt");
	          if (alt != null) {
	            String altTxt = alt.getTextContent();
	            if (altTxt != null && altTxt.trim().length() > 0) {
	              extra = altTxt;
	            }
	          }
	        }
	      } else if (subNode.getNodeType() == Node.TEXT_NODE) {
	        String txt = subNode.getTextContent();
	        if (txt != null && txt.length() > 0) {
	          extra = txt;
	        }
	      }
	      if (extra != null) {
	        if (linkText.length() > 0) {
	          linkText.append(' ');
	        }
	        linkText.append(extra);
	      }
	    }
	  }

	  /**
	   * This method finds all anchors below the supplied DOM <code>node</code>, and
	   * creates appropriate {@link Outlink} records for each (relative to the
	   * supplied <code>base</code> URL), and adds them to the <code>outlinks</code>
	   * {@link ArrayList}.
	   *
	   * <p>
	   *
	   * Links without inner structure (tags, text, etc) are discarded, as are links
	   * which contain only single nested links and empty text nodes (this is a
	   * common DOM-fixup artifact, at least with nekohtml).
	   *
	   * <p>
	   *
	   * Elements whose <code>rel</code> attribute contains <code>nofollow</code>
	   * or whose <code>method</code> attribute is <code>post</code> are skipped,
	   * and so are links whose target cannot be resolved to a valid URL.
	   *
	   * @param base
	   *          URL that relative link targets are resolved against
	   * @param outlinks
	   *          list the extracted outlinks are added to
	   * @param node
	   *          root of the DOM subtree to search
	   */
	  public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {

	    NodeWalker walker = new NodeWalker(node);
	    while (walker.hasNext()) {

	      Node currentNode = walker.nextNode();
	      if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
	        continue;
	      }

	      String nodeName = lowerCase(currentNode.getNodeName());
	      LinkParams params = linkParams.get(nodeName);
	      if (params == null) {
	        continue;
	      }

	      // NOTE: the walker is never asked to skip the children of a link
	      // element, so elements nested below it are examined as well.
	      NodeList children = currentNode.getChildNodes();
	      int childLen = (children != null) ? children.getLength() : 0;
	      if (shouldThrowAwayLink(currentNode, children, childLen, params)) {
	        continue;
	      }

	      StringBuffer linkText = new StringBuffer();
	      getText(linkText, currentNode, true);
	      if (linkText.toString().trim().length() == 0) {
	        // try harder - use img alt if present
	        appendFallbackLinkText(linkText, currentNode);
	      }

	      NamedNodeMap attrs = currentNode.getAttributes();
	      String target = null;
	      boolean noFollow = false;
	      boolean post = false;
	      for (int i = 0; i < attrs.getLength(); i++) {
	        Node attr = attrs.item(i);
	        String attrName = attr.getNodeName();
	        if (params.attrName.equalsIgnoreCase(attrName)) {
	          target = attr.getNodeValue();
	        } else if ("rel".equalsIgnoreCase(attrName)) {
	          if (hasNoFollowToken(attr.getNodeValue())) {
	            noFollow = true;
	          }
	        } else if ("method".equalsIgnoreCase(attrName)
	            && "post".equalsIgnoreCase(attr.getNodeValue())) {
	          post = true;
	        }
	      }
	      if (target == null || noFollow || post) {
	        continue;
	      }

	      try {
	        URL url = URLUtil.resolveURL(base, target);
	        Outlink outlink = new Outlink(url.toString(), linkText.toString()
	            .trim());
	        outlinks.add(outlink);

	        // NUTCH-2433 - Keep the node name where the URL was found into
	        // the outlink metadata
	        if (keepNodenames) {
	          MapWritable metadata = new MapWritable();
	          metadata.put(new Text(srcTagMetaName), new Text(nodeName));
	          outlink.setMetadata(metadata);
	        }
	      } catch (MalformedURLException ignored) {
	        // best effort: a link that cannot be resolved is simply skipped
	      }
	    }
	  }

	}