| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.nutch.parse; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.net.MalformedURLException; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.oro.text.regex.MatchResult; |
| import org.apache.oro.text.regex.Pattern; |
| import org.apache.oro.text.regex.PatternCompiler; |
| import org.apache.oro.text.regex.PatternMatcher; |
| import org.apache.oro.text.regex.PatternMatcherInput; |
| import org.apache.oro.text.regex.Perl5Compiler; |
| import org.apache.oro.text.regex.Perl5Matcher; |
| |
| /** |
| * Extractor to extract {@link org.apache.nutch.parse.Outlink}s / URLs from |
| * plain text using Regular Expressions. |
| * |
| * @see <a |
| * href="http://wiki.java.net/bin/view/Javapedia/RegularExpressions">Comparison |
| * of different regexp-Implementations </a> |
| * @see <a href="http://regex.info/java.html">Overview about Java Regexp APIs |
| * </a> |
| * |
| * @author Stephan Strittmatter - http://www.sybit.de |
| * @version 1.0 |
| * @since 0.7 |
| */ |
| public class OutlinkExtractor { |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| /** |
| * Regex pattern to get URLs within a plain text. |
| * |
| * @see <a |
| * href="http://www.truerwords.net/articles/ut/urlactivation.html">http://www.truerwords.net/articles/ut/urlactivation.html |
| |
| * </a> |
| */ |
| private static final String URL_PATTERN = "([A-Za-z][A-Za-z0-9+.-]{1,120}:[A-Za-z0-9/](([A-Za-z0-9$_.+!*,;/?:@&~=-])|%[A-Fa-f0-9]{2}){1,333}(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*,;/?:@&~=%-]{0,1000}))?)"; |
| |
| /** |
| * Extracts <code>Outlink</code> from given plain text. Applying this method |
| * to non-plain-text can result in extremely lengthy runtimes for parasitic |
| * cases (postscript is a known example). |
| * |
| * @param plainText |
| * the plain text from which URLs should be extracted. |
| * |
| * @return Array of <code>Outlink</code>s within found in plainText |
| */ |
| public static Outlink[] getOutlinks(final String plainText, Configuration conf) { |
| return OutlinkExtractor.getOutlinks(plainText, "", conf); |
| } |
| |
| /** |
| * Extracts <code>Outlink</code> from given plain text and adds anchor to the |
| * extracted <code>Outlink</code>s |
| * |
| * @param plainText |
| * the plain text from wich URLs should be extracted. |
| * @param anchor |
| * the anchor of the url |
| * |
| * @return Array of <code>Outlink</code>s within found in plainText |
| */ |
| public static Outlink[] getOutlinks(final String plainText, String anchor, |
| Configuration conf) { |
| long start = System.currentTimeMillis(); |
| final List<Outlink> outlinks = new ArrayList<>(); |
| |
| try { |
| final PatternCompiler cp = new Perl5Compiler(); |
| final Pattern pattern = cp.compile(URL_PATTERN, |
| Perl5Compiler.CASE_INSENSITIVE_MASK | Perl5Compiler.READ_ONLY_MASK |
| | Perl5Compiler.MULTILINE_MASK); |
| final PatternMatcher matcher = new Perl5Matcher(); |
| |
| final PatternMatcherInput input = new PatternMatcherInput(plainText); |
| |
| MatchResult result; |
| String url; |
| |
| // loop the matches |
| while (matcher.contains(input, pattern)) { |
| // if this is taking too long, stop matching |
| // (SHOULD really check cpu time used so that heavily loaded systems |
| // do not unnecessarily hit this limit.) |
| if (System.currentTimeMillis() - start >= 60000L) { |
| if (LOG.isWarnEnabled()) { |
| LOG.warn("Time limit exceeded for getOutLinks"); |
| } |
| break; |
| } |
| result = matcher.getMatch(); |
| url = result.group(0); |
| try { |
| outlinks.add(new Outlink(url, anchor)); |
| } catch (MalformedURLException mue) { |
| LOG.warn("Invalid url: '" + url + "', skipping."); |
| } |
| } |
| } catch (Exception ex) { |
| // if the matcher fails (perhaps a malformed URL) we just log it and move |
| // on |
| if (LOG.isErrorEnabled()) { |
| LOG.error("getOutlinks", ex); |
| } |
| } |
| |
| final Outlink[] retval; |
| |
| // create array of the Outlinks |
| if (outlinks != null && outlinks.size() > 0) { |
| retval = outlinks.toArray(new Outlink[0]); |
| } else { |
| retval = new Outlink[0]; |
| } |
| |
| return retval; |
| } |
| |
| } |