| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.nutch.indexer.links; |
| |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.io.Text; |
| import org.apache.nutch.crawl.CrawlDatum; |
| import org.apache.nutch.crawl.Inlink; |
| import org.apache.nutch.crawl.Inlinks; |
| import org.apache.nutch.indexer.IndexingException; |
| import org.apache.nutch.indexer.IndexingFilter; |
| import org.apache.nutch.indexer.NutchDocument; |
| import org.apache.nutch.parse.Outlink; |
| import org.apache.nutch.parse.Parse; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import java.lang.invoke.MethodHandles; |
| import java.net.MalformedURLException; |
| import java.net.URL; |
| import java.util.HashSet; |
| import java.util.Iterator; |
| import java.util.Set; |
| |
| /** |
| * An {@link org.apache.nutch.indexer.IndexingFilter} that adds |
| * <code>outlinks</code> and <code>inlinks</code> field(s) to the document. |
| * |
| * In case that you want to ignore the outlinks that point to the same host |
| * as the URL being indexed use the following settings in your configuration |
| * file: |
| * |
| * <property> |
| * <name>index.links.outlinks.host.ignore</name> |
| * <value>true</value> |
| * </property> |
| * |
| * The same configuration is available for inlinks: |
| * |
| * <property> |
| * <name>index.links.inlinks.host.ignore</name> |
| * <value>true</value> |
| * </property> |
| * |
| * To store only the host portion of each inlink URL or outlink URL add the |
| * following to your configuration file. |
| * |
| * <property> |
| * <name>index.links.hosts.only</name> |
| * <value>false</value> |
| * </property> |
| * |
| */ |
| public class LinksIndexingFilter implements IndexingFilter { |
| |
| public final static String LINKS_OUTLINKS_HOST = "index.links.outlinks.host.ignore"; |
| public final static String LINKS_INLINKS_HOST = "index.links.inlinks.host.ignore"; |
| public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only"; |
| |
| private static final Logger LOG = LoggerFactory |
| .getLogger(MethodHandles.lookup().lookupClass()); |
| |
| private Configuration conf; |
| private boolean filterOutlinks; |
| private boolean filterInlinks; |
| private boolean indexHost; |
| |
| @Override |
| public NutchDocument filter(NutchDocument doc, Parse parse, Text url, |
| CrawlDatum datum, Inlinks inlinks) throws IndexingException { |
| |
| // Add the outlinks |
| Outlink[] outlinks = parse.getData().getOutlinks(); |
| |
| if (outlinks != null) { |
| Set<String> hosts = new HashSet<String>(); |
| |
| for (Outlink outlink : outlinks) { |
| try { |
| String linkUrl = outlink.getToUrl(); |
| String outHost = new URL(linkUrl).getHost().toLowerCase(); |
| |
| if (indexHost) { |
| linkUrl = outHost; |
| |
| if (hosts.contains(linkUrl)) |
| continue; |
| |
| hosts.add(linkUrl); |
| } |
| |
| addFilteredLink("outlinks", url.toString(), linkUrl, outHost, |
| filterOutlinks, doc); |
| } catch (MalformedURLException e) { |
| LOG.error("Malformed URL in {}: {}", url, e.getMessage()); |
| } |
| } |
| } |
| |
| // Add the inlinks |
| if (null != inlinks) { |
| Iterator<Inlink> iterator = inlinks.iterator(); |
| Set<String> inlinkHosts = new HashSet<String>(); |
| |
| while (iterator.hasNext()) { |
| try { |
| Inlink link = iterator.next(); |
| String linkUrl = link.getFromUrl(); |
| String inHost = new URL(linkUrl).getHost().toLowerCase(); |
| |
| if (indexHost) { |
| linkUrl = inHost; |
| |
| if (inlinkHosts.contains(linkUrl)) |
| continue; |
| |
| inlinkHosts.add(linkUrl); |
| } |
| |
| addFilteredLink("inlinks", url.toString(), linkUrl, inHost, |
| filterInlinks, doc); |
| } catch (MalformedURLException e) { |
| LOG.error("Malformed URL in {}: {}", url, e.getMessage()); |
| } |
| } |
| } |
| |
| return doc; |
| } |
| |
| private void addFilteredLink(String fieldName, String url, String linkUrl, |
| String urlHost, boolean filter, NutchDocument doc) throws MalformedURLException { |
| if (filter) { |
| String host = new URL(url.toString()).getHost().toLowerCase(); |
| |
| if (!host.equalsIgnoreCase(urlHost)) { |
| doc.add(fieldName, linkUrl); |
| } |
| } else { |
| doc.add(fieldName, linkUrl); |
| } |
| } |
| |
| public void setConf(Configuration conf) { |
| this.conf = conf; |
| filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false); |
| filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false); |
| |
| indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false); |
| } |
| |
| public Configuration getConf() { |
| return this.conf; |
| } |
| } |