// nutch-core/src/main/java/org/apache/nutch/parse/OutlinkExtractor.java - nutch - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 package org.apache.nutch.parse;

 import java.lang.invoke.MethodHandles;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.List;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import org.apache.hadoop.conf.Configuration;
 import org.apache.oro.text.regex.MatchResult;
 import org.apache.oro.text.regex.Pattern;
 import org.apache.oro.text.regex.PatternCompiler;
 import org.apache.oro.text.regex.PatternMatcher;
 import org.apache.oro.text.regex.PatternMatcherInput;
 import org.apache.oro.text.regex.Perl5Compiler;
 import org.apache.oro.text.regex.Perl5Matcher;

 /**
  * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
  * plain text using Regular Expressions.
  *
  * @see <a
  *      href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
  *      of different regexp-Implementations </a>
  * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs
  *      </a>
  *
  * @author Stephan Strittmatter - http://www.sybit.de
  * @version 1.0
  * @since 0.7
  */
 public class OutlinkExtractor {
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());

   /**
    * Regex pattern to get URLs within a plain text.
    *
    * @see <a
    *      href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html</a>
    */
   private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)";

   /**
    * Wall-clock budget, in milliseconds, for a single extraction run. Once it
    * is used up no further matches are collected.
    */
   private static final long TIME_LIMIT_MS = 60000L;

   /**
    * {@link #URL_PATTERN} compiled once and shared by all calls. It is compiled
    * with <code>READ_ONLY_MASK</code>, so the matcher does not write to it and
    * it can be used from several threads. <code>null</code> if compilation
    * failed; extraction then yields no outlinks.
    */
   private static final Pattern COMPILED_URL_PATTERN = compileUrlPattern();

   /**
    * Compiles {@link #URL_PATTERN} as a case-insensitive, multi-line, read-only
    * pattern.
    *
    * @return the compiled pattern, or <code>null</code> if compilation failed
    *         (the failure is logged)
    */
   private static Pattern compileUrlPattern() {
     try {
       final PatternCompiler compiler = new Perl5Compiler();
       return compiler.compile(URL_PATTERN,
           Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK
               | Perl5Compiler.MULTILINE_MASK);
     } catch (Exception ex) {
       // Stay best-effort: a broken pattern must not prevent class loading.
       LOG.error("Failed to compile URL pattern", ex);
       return null;
     }
   }

   /**
    * Extracts <code>Outlink</code>s from the given plain text, using an empty
    * anchor for each of them. Applying this method to non-plain-text can result
    * in extremely lengthy runtimes for parasitic cases (postscript is a known
    * example).
    *
    * @param plainText
    *          the plain text from which URLs should be extracted.
    * @param conf
    *          the configuration; passed through to
    *          {@link #getOutlinks(String, String, Configuration)}.
    *
    * @return Array of <code>Outlink</code>s found in plainText, never
    *         <code>null</code>
    */
   public static Outlink[] getOutlinks(final String plainText, Configuration conf) {
     return OutlinkExtractor.getOutlinks(plainText, "", conf);
   }

   /**
    * Extracts <code>Outlink</code>s from the given plain text and assigns the
    * given anchor to every extracted <code>Outlink</code>.
    * <p>
    * Extraction is best-effort: matches that cannot be turned into an
    * <code>Outlink</code> are skipped with a warning, any other failure is
    * logged, and the outlinks collected up to that point are returned.
    *
    * @param plainText
    *          the plain text from which URLs should be extracted; a
    *          <code>null</code> or empty text yields an empty array.
    * @param anchor
    *          the anchor of the url
    * @param conf
    *          the configuration; currently not read by the extraction.
    *
    * @return Array of <code>Outlink</code>s found in plainText, never
    *         <code>null</code>
    */
   public static Outlink[] getOutlinks(final String plainText, String anchor,
       Configuration conf) {
     if (plainText == null || plainText.isEmpty()
         || COMPILED_URL_PATTERN == null) {
       return new Outlink[0];
     }

     final long start = System.currentTimeMillis();
     final List<Outlink> outlinks = new ArrayList<>();

     try {
       // The matcher keeps per-match state, so each call gets its own.
       final PatternMatcher matcher = new Perl5Matcher();
       final PatternMatcherInput input = new PatternMatcherInput(plainText);

       while (matcher.contains(input, COMPILED_URL_PATTERN)) {
         // The budget is only checked between matches, and it measures
         // wall-clock time rather than CPU time, so a heavily loaded system
         // may hit the limit earlier than necessary.
         if (System.currentTimeMillis() - start >= TIME_LIMIT_MS) {
           LOG.warn("Time limit exceeded for getOutLinks");
           break;
         }

         final MatchResult result = matcher.getMatch();
         final String url = result.group(0);
         try {
           outlinks.add(new Outlink(url, anchor));
         } catch (MalformedURLException mue) {
           LOG.warn("Invalid url: '{}', skipping.", url);
         }
       }
     } catch (Exception ex) {
       // if the matcher fails we just log it and return what was collected
       LOG.error("getOutlinks", ex);
     }

     return outlinks.toArray(new Outlink[0]);
   }

 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	package org.apache.nutch.parse;

	import java.lang.invoke.MethodHandles;
	import java.net.MalformedURLException;
	import java.util.ArrayList;
	import java.util.List;

	import org.slf4j.Logger;
	import org.slf4j.LoggerFactory;

	import org.apache.hadoop.conf.Configuration;
	import org.apache.oro.text.regex.MatchResult;
	import org.apache.oro.text.regex.Pattern;
	import org.apache.oro.text.regex.PatternCompiler;
	import org.apache.oro.text.regex.PatternMatcher;
	import org.apache.oro.text.regex.PatternMatcherInput;
	import org.apache.oro.text.regex.Perl5Compiler;
	import org.apache.oro.text.regex.Perl5Matcher;

	/**
	* Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from
	* plain text using Regular Expressions.
	*
	* @see <a
	* href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison
	* of different regexp-Implementations </a>
	* @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs
	* </a>
	*
	* @author Stephan Strittmatter - http://www.sybit.de
	* @version 1.0
	* @since 0.7
	*/
	public class OutlinkExtractor {
	private static final Logger LOG = LoggerFactory
	.getLogger(MethodHandles.lookup().lookupClass());

	/**
	* Regex pattern to get URLs within a plain text.
	*
	* @see <a
	* href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html

	* </a>
	*/
	private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!,;/?:@&~=-])\|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!,;/?:@&~=%-]{0,1000}))?)";

	/**
	* Extracts <code>Outlink</code> from given plain text. Applying this method
	* to non-plain-text can result in extremely lengthy runtimes for parasitic
	* cases (postscript is a known example).
	*
	* @param plainText
	* the plain text from which URLs should be extracted.
	*
	* @return Array of <code>Outlink</code>s within found in plainText
	*/
	public static Outlink[] getOutlinks(final String plainText, Configuration conf) {
	return OutlinkExtractor.getOutlinks(plainText, "", conf);
	}

	/**
	* Extracts <code>Outlink</code> from given plain text and adds anchor to the
	* extracted <code>Outlink</code>s
	*
	* @param plainText
	* the plain text from wich URLs should be extracted.
	* @param anchor
	* the anchor of the url
	*
	* @return Array of <code>Outlink</code>s within found in plainText
	*/
	public static Outlink[] getOutlinks(final String plainText, String anchor,
	Configuration conf) {
	long start = System.currentTimeMillis();
	final List<Outlink> outlinks = new ArrayList<>();

	try {
	final PatternCompiler cp = new Perl5Compiler();
	final Pattern pattern = cp.compile(URL_PATTERN,
	Perl5Compiler.CASE_INSENSITIVE_MASK \| Perl5Compiler.READ_ONLY_MASK
	\| Perl5Compiler.MULTILINE_MASK);
	final PatternMatcher matcher = new Perl5Matcher();

	final PatternMatcherInput input = new PatternMatcherInput(plainText);

	MatchResult result;
	String url;

	// loop the matches
	while (matcher.contains(input, pattern)) {
	// if this is taking too long, stop matching
	// (SHOULD really check cpu time used so that heavily loaded systems
	// do not unnecessarily hit this limit.)
	if (System.currentTimeMillis() - start >= 60000L) {
	if (LOG.isWarnEnabled()) {
	LOG.warn("Time limit exceeded for getOutLinks");
	}
	break;
	}
	result = matcher.getMatch();
	url = result.group(0);
	try {
	outlinks.add(new Outlink(url, anchor));
	} catch (MalformedURLException mue) {
	LOG.warn("Invalid url: '" + url + "', skipping.");
	}
	}
	} catch (Exception ex) {
	// if the matcher fails (perhaps a malformed URL) we just log it and move
	// on
	if (LOG.isErrorEnabled()) {
	LOG.error("getOutlinks", ex);
	}
	}

	final Outlink[] retval;

	// create array of the Outlinks
	if (outlinks != null && outlinks.size() > 0) {
	retval = outlinks.toArray(new Outlink[0]);
	} else {
	retval = new Outlink[0];
	}

	return retval;
	}

	}