src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java - nutch - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.nutch.parse.js;

 import java.lang.invoke.MethodHandles;
 import java.io.BufferedReader;
 import java.io.FileInputStream;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.HtmlParseFilter;
 import org.apache.nutch.parse.Outlink;
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseData;
 import org.apache.nutch.parse.ParseImpl;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.parse.ParseText;
 import org.apache.nutch.parse.ParseStatus;
 import org.apache.nutch.parse.Parser;
 import org.apache.nutch.protocol.Content;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.hadoop.conf.Configuration;
 import org.w3c.dom.DocumentFragment;
 import org.w3c.dom.Element;
 import org.w3c.dom.NamedNodeMap;
 import org.w3c.dom.Node;
 import org.w3c.dom.NodeList;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 /**
  * Heuristic link extractor for JavaScript files and code snippets. The general
  * idea of a two-pass regex matching comes from Heritrix. Parts of the code come
  * from OutlinkExtractor.java
  *
  * <p>
  * As an {@link HtmlParseFilter} it scans the {@code <script>} elements, the
  * {@code on*} attributes and the {@code javascript:} hrefs of an already parsed
  * HTML page; as a {@link Parser} it handles a stand-alone script. In both cases
  * the extraction is purely textual: quoted literals are located first and are
  * then tested against a loose URL pattern.
  */
 public class JSParseFilter implements HtmlParseFilter, Parser {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());

   /** Maximum number of characters taken from a script's first line as title. */
   private static final int MAX_TITLE_LEN = 80;

   /**
    * First pass: a run of characters free of whitespace and quotes (group 2)
    * enclosed by the same delimiter on both sides (group 1: a single or double
    * quote, optionally preceded by backslashes).
    */
   private static final Pattern STRING_PATTERN = Pattern.compile(
       "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)",
       Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

   /**
    * Second pass: a simple pattern accepting any literal that has a '/' or '.'
    * with non-whitespace characters on both sides. This allows also invalid URL
    * characters.
    */
   private static final Pattern URI_PATTERN = Pattern.compile(
       "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)",
       Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);

   // Alternative pattern, which limits valid url characters.
   // private static final String URI_PATTERN =
   // "(^|\\s*?)[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]*))?($|\\s*)";

   /** Configuration handed over through {@link #setConf(Configuration)}. */
   private Configuration conf;

   /**
    * Scan the JavaScript fragments of a HTML page looking for possible
    * {@link Outlink}'s.
    *
    * @param content
    *          page content; supplies the URL of the parse entry and the base
    *          URL used to resolve relative links
    * @param parseResult
    *          parsed content, result of running the HTML parser
    * @param metaTags
    *          within the {@link HTMLMetaTags}
    * @param doc
    *          The {@link DocumentFragment} object
    * @return the given {@link ParseResult}; when JavaScript outlinks were found
    *         its entry for the content URL is replaced by one listing them
    *         ahead of the previously known outlinks
    */
   @Override
   public ParseResult filter(Content content, ParseResult parseResult,
       HTMLMetaTags metaTags, DocumentFragment doc) {

     Parse parse = parseResult.get(content.getUrl());
     if (parse == null) {
       // No entry to augment for this URL; without this guard the getData()
       // call below would fail as soon as a JavaScript link is found.
       return parseResult;
     }

     List<Outlink> outlinks = new ArrayList<>();
     walk(doc, parse, metaTags, content.getBaseUrl(), outlinks);
     if (outlinks.isEmpty()) {
       return parseResult;
     }

     ParseData oldData = parse.getData();
     // JavaScript links come first, the links already present follow.
     outlinks.addAll(Arrays.asList(oldData.getOutlinks()));
     Outlink[] merged = outlinks.toArray(new Outlink[0]);
     ParseData parseData = new ParseData(oldData.getStatus(),
         oldData.getTitle(), merged, oldData.getContentMeta(),
         oldData.getParseMeta());

     // replace original parse obj with new one
     parseResult.put(content.getUrl(), new ParseText(parse.getText()),
         parseData);
     return parseResult;
   }

   /**
    * Recursively visits the DOM below {@code n} and collects the links found in
    * script bodies, in attributes whose name starts with "on" and in hrefs
    * containing "javascript:".
    *
    * @param n
    *          node to inspect
    * @param parse
    *          not used here; handed on unchanged to the recursive calls
    * @param metaTags
    *          not used here; handed on unchanged to the recursive calls
    * @param base
    *          base URL used to resolve relative links
    * @param outlinks
    *          list the extracted links are appended to
    */
   private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
       List<Outlink> outlinks) {
     if (n instanceof Element) {
       if ("script".equalsIgnoreCase(n.getNodeName())) {
         NodeList scriptChildren = n.getChildNodes();
         int count = scriptChildren.getLength();
         if (count > 0) {
           StringBuilder script = new StringBuilder();
           for (int i = 0; i < count; i++) {
             if (i > 0) {
               script.append('\n');
             }
             // Children without a value would otherwise be appended as the
             // literal text "null".
             String value = scriptChildren.item(i).getNodeValue();
             if (value != null) {
               script.append(value);
             }
           }
           outlinks.addAll(Arrays.asList(getJSLinks(script.toString(), "",
               base)));
           // no other children of interest here, go one level up.
           return;
         }
       } else {
         // process all HTML 4.0 events, if present...
         // Window: onload,onunload
         // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
         // Keyboard: onkeydown,onkeypress,onkeyup
         // Mouse:
         // onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
         NamedNodeMap attrs = n.getAttributes();
         for (int i = 0, len = attrs.getLength(); i < len; i++) {
           Node attr = attrs.item(i);
           String attrName = attr.getNodeName();
           String attrValue = attr.getNodeValue();
           if (attrValue == null) {
             continue;
           }
           boolean candidate;
           if (attrName.regionMatches(true, 0, "on", 0, 2)) {
             // Case-insensitive prefix test, in line with the href test below.
             candidate = true;
           } else {
             // Locale.ROOT keeps the lower-casing independent of the default
             // locale (e.g. the Turkish dotless i).
             candidate = "href".equalsIgnoreCase(attrName)
                 && attrValue.toLowerCase(java.util.Locale.ROOT).contains(
                     "javascript:");
           }
           if (candidate) {
             outlinks.addAll(Arrays.asList(getJSLinks(attrValue, "", base)));
           }
         }
       }
     }
     NodeList children = n.getChildNodes();
     for (int i = 0; i < children.getLength(); i++) {
       walk(children.item(i), parse, metaTags, base, outlinks);
     }
   }

   /**
    * Parse a JavaScript file and extract outlinks.
    *
    * <p>
    * The bytes are decoded as UTF-8 so that the result does not depend on the
    * platform default charset. The title is the first line of the script, cut
    * to at most {@link #MAX_TITLE_LEN} characters.
    *
    * @param c
    *          page content
    * @return a {@link ParseResult} with a single successful {@link Parse} whose
    *         text is the whole script
    */
   @Override
   public ParseResult getParse(Content c) {
     String script = new String(c.getContent(),
         java.nio.charset.StandardCharsets.UTF_8);
     Outlink[] outlinks = getJSLinks(script, "", c.getUrl());

     // Title? use the first line of the script...
     int lineEnd = script.indexOf('\n');
     if (lineEnd == -1) {
       lineEnd = script.length();
     }
     String title = script.substring(0, Math.min(MAX_TITLE_LEN, lineEnd));

     ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
         c.getMetadata());
     return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
   }

   /**
    * This method extracts URLs from literals embedded in JavaScript.
    *
    * <p>
    * Every quoted literal that looks like a URL is either prefixed with
    * "http://" (when it starts with "www.") or resolved against {@code base}.
    * A candidate that cannot be turned into an outlink is skipped; the
    * remaining candidates are still processed.
    *
    * @param plainText
    *          script text to scan; {@code null} yields an empty result
    * @param anchor
    *          anchor text given to every extracted outlink
    * @param base
    *          base URL for relative links; if it cannot be parsed, literals
    *          that need it for resolution are dropped
    * @return the extracted outlinks, never {@code null}
    */
   private Outlink[] getJSLinks(String plainText, String anchor, String base) {
     if (plainText == null) {
       return new Outlink[0];
     }

     URL baseURL = null;
     try {
       baseURL = new URL(base);
     } catch (MalformedURLException e) {
       // Keep going without a base; resolution below then fails per candidate.
       LOG.error("error assigning base URL", e);
     }

     List<Outlink> outlinks = new ArrayList<>();
     Matcher matcher = STRING_PATTERN.matcher(plainText);
     while (matcher.find()) {
       String url = matcher.group(2);
       if (!URI_PATTERN.matcher(url).matches()) {
         continue;
       }
       if (url.startsWith("www.")) {
         url = "http://" + url;
       } else {
         // See if candidate URL is parseable. If not, pass and move on to
         // the next match.
         try {
           url = new URL(baseURL, url).toString();
         } catch (MalformedURLException ex) {
           LOG.trace(" - failed URL parse '{}' and baseURL '{}'", url, baseURL,
               ex);
           continue;
         }
       }
       // Literal replacement, no regex needed.
       url = url.replace("&amp;", "&");
       LOG.trace(" - outlink from JS: '{}'", url);
       try {
         outlinks.add(new Outlink(url, anchor));
       } catch (Exception ex) {
         // if it is a malformed URL we just throw it away and continue with
         // extraction.
         LOG.error(" - invalid or malformed URL", ex);
       }
     }

     return outlinks.toArray(new Outlink[0]);
   }

   /**
    * Main method which can be run from command line with the plugin option. The
    * method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js
    * baseURL
    *
    * @param args
    *          path of the script file (read as UTF-8) followed by the base URL
    * @throws Exception
    *           if the file cannot be opened or read
    */
   public static void main(String[] args) throws Exception {
     if (args.length < 2) {
       System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
       return;
     }

     StringBuilder sb = new StringBuilder();
     // try-with-resources closes the file even when reading fails.
     try (InputStream in = new FileInputStream(args[0]);
         BufferedReader br = new BufferedReader(
             new InputStreamReader(in, "UTF-8"))) {
       String line;
       while ((line = br.readLine()) != null) {
         sb.append(line).append('\n');
       }
     }

     JSParseFilter parseFilter = new JSParseFilter();
     parseFilter.setConf(NutchConfiguration.create());
     Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
     System.out.println("Outlinks extracted: " + links.length);
     for (Outlink link : links) {
       System.out.println(" - " + link);
     }
   }

   /**
    * Stores the configuration.
    *
    * @param conf
    *          configuration to keep
    */
   public void setConf(Configuration conf) {
     this.conf = conf;
   }

   /**
    * @return the configuration last passed to {@link #setConf(Configuration)},
    *         or {@code null} if none was set
    */
   public Configuration getConf() {
     return this.conf;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.nutch.parse.js;

	import java.lang.invoke.MethodHandles;
	import java.io.BufferedReader;
	import java.io.FileInputStream;
	import java.io.InputStream;
	import java.io.InputStreamReader;
	import java.net.MalformedURLException;
	import java.net.URL;
	import java.util.ArrayList;
	import java.util.Arrays;
	import java.util.List;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import org.apache.nutch.parse.HTMLMetaTags;
	import org.apache.nutch.parse.HtmlParseFilter;
	import org.apache.nutch.parse.Outlink;
	import org.apache.nutch.parse.Parse;
	import org.apache.nutch.parse.ParseData;
	import org.apache.nutch.parse.ParseImpl;
	import org.apache.nutch.parse.ParseResult;
	import org.apache.nutch.parse.ParseText;
	import org.apache.nutch.parse.ParseStatus;
	import org.apache.nutch.parse.Parser;
	import org.apache.nutch.protocol.Content;
	import org.apache.nutch.util.NutchConfiguration;
	import org.apache.hadoop.conf.Configuration;
	import org.w3c.dom.DocumentFragment;
	import org.w3c.dom.Element;
	import org.w3c.dom.NamedNodeMap;
	import org.w3c.dom.Node;
	import org.w3c.dom.NodeList;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	/**
	* This class is a heuristic link extractor for JavaScript files and code
	* snippets. The general idea of a two-pass regex matching comes from Heritrix.
	* Parts of the code come from OutlinkExtractor.java
	*/
	public class JSParseFilter implements HtmlParseFilter, Parser {
	private static final Logger LOG = LoggerFactory
	.getLogger(MethodHandles.lookup().lookupClass());

	private static final int MAX_TITLE_LEN = 80;

	private Configuration conf;

	/**
	* Scan the JavaScript fragments of a HTML page looking for possible {@link Outlink}'s
	*
	* @param content
	* page content
	* @param parseResult
	* parsed content, result of running the HTML parser
	* @param metaTags
	* within the {@link HTMLMetaTags}
	* @param doc
	* The {@link DocumentFragment} object
	* @return parse the actual {@link ParseResult} object with additional outlinks from JavaScript
	*/
	@Override
	public ParseResult filter(Content content, ParseResult parseResult,
	HTMLMetaTags metaTags, DocumentFragment doc) {

	Parse parse = parseResult.get(content.getUrl());

	String url = content.getBaseUrl();
	ArrayList<Outlink> outlinks = new ArrayList<Outlink>();
	walk(doc, parse, metaTags, url, outlinks);
	if (outlinks.size() > 0) {
	Outlink[] old = parse.getData().getOutlinks();
	String title = parse.getData().getTitle();
	List<Outlink> list = Arrays.asList(old);
	outlinks.addAll(list);
	ParseStatus status = parse.getData().getStatus();
	String text = parse.getText();
	Outlink[] newlinks = (Outlink[]) outlinks.toArray(new Outlink[outlinks
	.size()]);
	ParseData parseData = new ParseData(status, title, newlinks, parse
	.getData().getContentMeta(), parse.getData().getParseMeta());

	// replace original parse obj with new one
	parseResult.put(content.getUrl(), new ParseText(text), parseData);
	}
	return parseResult;
	}

	private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
	List<Outlink> outlinks) {
	if (n instanceof Element) {
	String name = n.getNodeName();
	if (name.equalsIgnoreCase("script")) {
	/*
	* String lang = null; Node lNode =
	* n.getAttributes().getNamedItem("language"); if (lNode == null) lang =
	* "javascript"; else lang = lNode.getNodeValue();
	*/
	StringBuffer script = new StringBuffer();
	NodeList nn = n.getChildNodes();
	if (nn.getLength() > 0) {
	for (int i = 0; i < nn.getLength(); i++) {
	if (i > 0)
	script.append('\n');
	script.append(nn.item(i).getNodeValue());
	}
	// if (LOG.isInfoEnabled()) {
	// LOG.info("script: language=" + lang + ", text: " +
	// script.toString());
	// }
	Outlink[] links = getJSLinks(script.toString(), "", base);
	if (links != null && links.length > 0)
	outlinks.addAll(Arrays.asList(links));
	// no other children of interest here, go one level up.
	return;
	}
	} else {
	// process all HTML 4.0 events, if present...
	NamedNodeMap attrs = n.getAttributes();
	int len = attrs.getLength();
	for (int i = 0; i < len; i++) {
	// Window: onload,onunload
	// Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
	// Keyboard: onkeydown,onkeypress,onkeyup
	// Mouse:
	// onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
	Node anode = attrs.item(i);
	Outlink[] links = null;
	if (anode.getNodeName().startsWith("on")) {
	links = getJSLinks(anode.getNodeValue(), "", base);
	} else if (anode.getNodeName().equalsIgnoreCase("href")) {
	String val = anode.getNodeValue();
	if (val != null && val.toLowerCase().indexOf("javascript:") != -1) {
	links = getJSLinks(val, "", base);
	}
	}
	if (links != null && links.length > 0)
	outlinks.addAll(Arrays.asList(links));
	}
	}
	}
	NodeList nl = n.getChildNodes();
	for (int i = 0; i < nl.getLength(); i++) {
	walk(nl.item(i), parse, metaTags, base, outlinks);
	}
	}

	/**
	* Parse a JavaScript file and extract outlinks
	*
	* @param c
	* page content
	* @return parse the actual {@link Parse} object
	*/
	@Override
	public ParseResult getParse(Content c) {
	String script = new String(c.getContent());
	Outlink[] outlinks = getJSLinks(script, "", c.getUrl());
	if (outlinks == null)
	outlinks = new Outlink[0];
	// Title? use the first line of the script...
	String title;
	int idx = script.indexOf('\n');
	if (idx != -1) {
	if (idx > MAX_TITLE_LEN)
	idx = MAX_TITLE_LEN;
	title = script.substring(0, idx);
	} else {
	idx = Math.min(MAX_TITLE_LEN, script.length());
	title = script.substring(0, idx);
	}
	ParseData pd = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks,
	c.getMetadata());
	return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
	}

	private static final Pattern STRING_PATTERN = Pattern.compile(
	"(\\\\*(?:\"\|\'))([^\\s\"\']+?)(?:\\1)",
	Pattern.CASE_INSENSITIVE \| Pattern.MULTILINE);
	// A simple pattern. This allows also invalid URL characters.
	private static final Pattern URI_PATTERN = Pattern.compile(
	"(^\|\\s?)/?\\S+?[/\\.]\\S+($\|\\s)",
	Pattern.CASE_INSENSITIVE \| Pattern.MULTILINE);

	// Alternative pattern, which limits valid url characters.
	// private static final String URI_PATTERN =
	// "(^\|\\s?)[A-Za-z0-9/](([A-Za-z0-9$_.+!,;/?:@&~=-])\|%[A-Fa-f0-9]{2})+[/.](([A-Za-z0-9$_.+!,;/?:@&~=-])\|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!,;/?:@&~=%-]))?($\|\\s)";

	/**
	* This method extracts URLs from literals embedded in JavaScript.
	*/
	private Outlink[] getJSLinks(String plainText, String anchor, String base) {

	final List<Outlink> outlinks = new ArrayList<Outlink>();
	URL baseURL = null;

	try {
	baseURL = new URL(base);
	} catch (Exception e) {
	if (LOG.isErrorEnabled()) {
	LOG.error("error assigning base URL", e);
	}
	}

	try {

	Matcher matcher = STRING_PATTERN.matcher(plainText);

	String url;

	while (matcher.find()) {
	url = matcher.group(2);
	Matcher matcherUri = URI_PATTERN.matcher(url);
	if (!matcherUri.matches()) {
	continue;
	}
	if (url.startsWith("www.")) {
	url = "http://" + url;
	} else {
	// See if candidate URL is parseable. If not, pass and move on to
	// the next match.
	try {
	url = new URL(baseURL, url).toString();
	} catch (MalformedURLException ex) {
	if (LOG.isTraceEnabled()) {
	LOG.trace(" - failed URL parse '" + url + "' and baseURL '"
	+ baseURL + "'", ex);
	}
	continue;
	}
	}
	url = url.replaceAll("&", "&");
	if (LOG.isTraceEnabled()) {
	LOG.trace(" - outlink from JS: '" + url + "'");
	}
	outlinks.add(new Outlink(url, anchor));
	}
	} catch (Exception ex) {
	// if it is a malformed URL we just throw it away and continue with
	// extraction.
	if (LOG.isErrorEnabled()) {
	LOG.error(" - invalid or malformed URL", ex);
	}
	}

	final Outlink[] retval;

	// create array of the Outlinks
	if (outlinks != null && outlinks.size() > 0) {
	retval = outlinks.toArray(new Outlink[0]);
	} else {
	retval = new Outlink[0];
	}

	return retval;
	}

	/**
	* Main method which can be run from command line with the plugin option. The
	* method takes two arguments e.g. o.a.n.parse.js.JSParseFilter file.js
	* baseURL
	*
	* @param args
	* @throws Exception
	*/
	public static void main(String[] args) throws Exception {
	if (args.length < 2) {
	System.err.println(JSParseFilter.class.getName() + " file.js baseURL");
	return;
	}
	InputStream in = new FileInputStream(args[0]);
	BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));
	StringBuffer sb = new StringBuffer();
	String line = null;
	while ((line = br.readLine()) != null)
	sb.append(line + "\n");
	br.close();

	JSParseFilter parseFilter = new JSParseFilter();
	parseFilter.setConf(NutchConfiguration.create());
	Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);
	System.out.println("Outlinks extracted: " + links.length);
	for (int i = 0; i < links.length; i++)
	System.out.println(" - " + links[i]);
	}

	public void setConf(Configuration conf) {
	this.conf = conf;
	}

	public Configuration getConf() {
	return this.conf;
	}
	}