blob: 35370f240ceeb4cf226260ff32c911a2d209d236 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.indexer.links;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlink;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.Parse;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Set;
/**
 * An {@link org.apache.nutch.indexer.IndexingFilter} that adds
 * <code>outlinks</code> and <code>inlinks</code> field(s) to the document.
 *
 * <p>
 * To ignore the outlinks that point to the same host as the URL being
 * indexed, use the following setting in your configuration file:
 * </p>
 *
 * <pre>
 * &lt;property&gt;
 *   &lt;name&gt;index.links.outlinks.host.ignore&lt;/name&gt;
 *   &lt;value&gt;true&lt;/value&gt;
 * &lt;/property&gt;
 * </pre>
 *
 * <p>
 * The same configuration is available for inlinks:
 * </p>
 *
 * <pre>
 * &lt;property&gt;
 *   &lt;name&gt;index.links.inlinks.host.ignore&lt;/name&gt;
 *   &lt;value&gt;true&lt;/value&gt;
 * &lt;/property&gt;
 * </pre>
 *
 * <p>
 * To store only the host portion of each inlink URL or outlink URL (each
 * distinct host is then added at most once per field), add the following to
 * your configuration file:
 * </p>
 *
 * <pre>
 * &lt;property&gt;
 *   &lt;name&gt;index.links.hosts.only&lt;/name&gt;
 *   &lt;value&gt;true&lt;/value&gt;
 * &lt;/property&gt;
 * </pre>
 *
 * <p>
 * All three properties default to <code>false</code>. Links whose URL cannot
 * be parsed are logged and skipped.
 * </p>
 */
public class LinksIndexingFilter implements IndexingFilter {

  /** Property name: skip outlinks pointing to the host of the indexed URL. */
  public final static String LINKS_OUTLINKS_HOST = "index.links.outlinks.host.ignore";

  /** Property name: skip inlinks coming from the host of the indexed URL. */
  public final static String LINKS_INLINKS_HOST = "index.links.inlinks.host.ignore";

  /** Property name: index only the host of each link instead of the full URL. */
  public final static String LINKS_ONLY_HOSTS = "index.links.hosts.only";

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  private Configuration conf;
  private boolean filterOutlinks;
  private boolean filterInlinks;
  private boolean indexHost;

  /**
   * Adds the outlinks of the parsed page and the known inlinks to the
   * document, honouring the configured same-host and hosts-only options.
   *
   * @param doc
   *          document the <code>outlinks</code> / <code>inlinks</code> fields
   *          are added to
   * @param parse
   *          parse result providing the outlinks
   * @param url
   *          URL of the page being indexed
   * @param datum
   *          crawl datum of the page (not used by this filter)
   * @param inlinks
   *          inlinks of the page, may be <code>null</code>
   * @return the same <code>doc</code> instance
   */
  @Override
  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
      CrawlDatum datum, Inlinks inlinks) throws IndexingException {

    // The host of the indexed page is only needed for same-host filtering and
    // is the same for every link, so resolve it once instead of once per link.
    String pageHost = null;
    if (filterOutlinks || filterInlinks) {
      try {
        pageHost = hostOf(url.toString());
      } catch (MalformedURLException e) {
        LOG.error("Malformed URL in {}: {}", url, e.getMessage());
      }
    }

    // Add the outlinks. If same-host filtering is requested but the page host
    // is unknown, no link can be compared and none is added.
    Outlink[] outlinks = parse.getData().getOutlinks();
    if (outlinks != null && (!filterOutlinks || pageHost != null)) {
      Set<String> seenHosts = new HashSet<>();
      for (Outlink outlink : outlinks) {
        addLink("outlinks", outlink.getToUrl(), url, pageHost, filterOutlinks,
            seenHosts, doc);
      }
    }

    // Add the inlinks, under the same rules.
    if (inlinks != null && (!filterInlinks || pageHost != null)) {
      Set<String> seenHosts = new HashSet<>();
      Iterator<Inlink> iterator = inlinks.iterator();
      while (iterator.hasNext()) {
        addLink("inlinks", iterator.next().getFromUrl(), url, pageHost,
            filterInlinks, seenHosts, doc);
      }
    }

    return doc;
  }

  /**
   * Adds a single link to <code>fieldName</code> unless it is malformed, a
   * host that was already added (hosts-only mode), or on the same host as the
   * indexed page (when <code>filter</code> is set).
   *
   * @param fieldName
   *          document field receiving the value
   * @param linkUrl
   *          URL of the inlink or outlink
   * @param url
   *          URL of the page being indexed, used for log messages only
   * @param pageHost
   *          lower-cased host of the indexed page; must be non-null when
   *          <code>filter</code> is <code>true</code>
   * @param filter
   *          whether links on <code>pageHost</code> are dropped
   * @param seenHosts
   *          hosts already handled for this field (used in hosts-only mode)
   * @param doc
   *          document to add the value to
   */
  private void addLink(String fieldName, String linkUrl, Text url,
      String pageHost, boolean filter, Set<String> seenHosts,
      NutchDocument doc) {
    final String linkHost;
    try {
      linkHost = hostOf(linkUrl);
    } catch (MalformedURLException e) {
      LOG.error("Malformed URL in {}: {}", url, e.getMessage());
      return;
    }

    String value = linkUrl;
    if (indexHost) {
      // Set.add returns false for a host that has been seen before.
      if (!seenHosts.add(linkHost)) {
        return;
      }
      value = linkHost;
    }

    if (filter && linkHost.equalsIgnoreCase(pageHost)) {
      return;
    }
    doc.add(fieldName, value);
  }

  /**
   * Returns the host of <code>url</code> in lower case. Lower-casing uses
   * {@link Locale#ROOT} so the result does not depend on the JVM's default
   * locale (e.g. the Turkish dotless i).
   *
   * @throws MalformedURLException
   *           if <code>url</code> cannot be parsed
   */
  private static String hostOf(String url) throws MalformedURLException {
    return new URL(url).getHost().toLowerCase(Locale.ROOT);
  }

  /**
   * Stores the configuration and reads the three filter options from it; each
   * option defaults to <code>false</code>.
   */
  public void setConf(Configuration conf) {
    this.conf = conf;
    filterOutlinks = conf.getBoolean(LINKS_OUTLINKS_HOST, false);
    filterInlinks = conf.getBoolean(LINKS_INLINKS_HOST, false);
    indexHost = conf.getBoolean(LINKS_ONLY_HOSTS, false);
  }

  /** Returns the configuration last passed to {@link #setConf(Configuration)}. */
  public Configuration getConf() {
    return this.conf;
  }
}