/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.crawl;

import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Iterator;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* This class provides a way to separate the URL normalization and filtering
* steps from the rest of LinkDb manipulation code.
*
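 * <p>
 * Filtering and normalization are enabled independently through the
 * <code>linkdb.url.filters</code> and <code>linkdb.url.normalizer</code>
 * job properties, which are read in {@code setup()}.
 * </p>
 *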
* @author Andrzej Bialecki
*/
public class LinkDbFilter extends Mapper<Text, Inlinks, Text, Inlinks> {

  /** Boolean property: if true, pass URLs through the configured {@link URLFilters}. */
  public static final String URL_FILTERING = "linkdb.url.filters";

  /** Boolean property: if true, pass URLs through the configured {@link URLNormalizers}. */
  public static final String URL_NORMALIZING = "linkdb.url.normalizer";

  /** Property naming the normalizer scope; defaults to {@link URLNormalizers#SCOPE_LINKDB}. */
  public static final String URL_NORMALIZING_SCOPE = "linkdb.url.normalizer.scope";

  private static final Logger LOG = LoggerFactory
      .getLogger(MethodHandles.lookup().lookupClass());

  private boolean filter;
  private boolean normalize;
  private URLFilters filters;
  private URLNormalizers normalizers;
  private String scope;
  private Text newKey = new Text();
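
  /**
   * Reads the job configuration and instantiates the {@link URLFilters} and
   * {@link URLNormalizers} only when the corresponding flags are enabled.
   */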
  @Override
  public void setup(Context context) {
    Configuration conf = context.getConfiguration();
    filter = conf.getBoolean(URL_FILTERING, false);
    normalize = conf.getBoolean(URL_NORMALIZING, false);
    if (filter) {
      filters = new URLFilters(conf);
    }
    if (normalize) {
      scope = conf.get(URL_NORMALIZING_SCOPE, URLNormalizers.SCOPE_LINKDB);
      normalizers = new URLNormalizers(conf, scope);
    }
  }
  /** No-op; retained from the old mapred API, never invoked by the mapreduce framework. */
  public void close() {
  }
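
  /**
   * Normalizes and filters both the target URL (the key) and each inlink URL.
   * URLs that fail normalization or are rejected by a filter are logged and
   * dropped; records whose inlink list ends up empty are not emitted.
   */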
  @Override
  public void map(Text key, Inlinks value, Context context)
      throws IOException, InterruptedException {
    String url = key.toString();
    Inlinks result = new Inlinks();
    if (normalize) {
      try {
        url = normalizers.normalize(url, scope); // normalize the target url
      } catch (Exception e) {
        LOG.warn("Skipping {}: {}", url, e.toString());
        url = null;
      }
    }
    if (url != null && filter) {
      try {
        url = filters.filter(url); // filter the target url
      } catch (Exception e) {
        LOG.warn("Skipping {}: {}", url, e.toString());
        url = null;
      }
    }
    if (url == null)
      return; // didn't pass the filters
    Iterator<Inlink> it = value.iterator();
    while (it.hasNext()) {
      Inlink inlink = it.next();
      String fromUrl = inlink.getFromUrl();
      if (normalize) {
        try {
          fromUrl = normalizers.normalize(fromUrl, scope); // normalize the source url
        } catch (Exception e) {
          LOG.warn("Skipping {}: {}", fromUrl, e.toString());
          fromUrl = null;
        }
      }
      if (fromUrl != null && filter) {
        try {
          fromUrl = filters.filter(fromUrl); // filter the source url
        } catch (Exception e) {
          LOG.warn("Skipping {}: {}", fromUrl, e.toString());
          fromUrl = null;
        }
      }
      if (fromUrl != null) {
        result.add(new Inlink(fromUrl, inlink.getAnchor()));
      }
    }
    if (result.size() > 0) { // don't collect empty inlinks
      newKey.set(url);
      context.write(newKey, result);
    }
  }
}