blob: 675d85756744143dba07ca12785a7b969c7cf002 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.urlfilter.ignoreexempt;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.URLExemptionFilter;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.urlfilter.regex.RegexURLFilter;
import java.io.IOException;
import java.io.Reader;
import java.util.regex.Pattern;
import java.util.List;
/**
* This implementation of {@link org.apache.nutch.net.URLExemptionFilter} uses regex configuration
* to check whether a URL is eligible for exemption from 'db.ignore.external'.
* When this filter is enabled, external URLs are checked against the configured sequence of regex rules.
*<p>
* The exemption rule file defaults to db-ignore-external-exemptions.txt in the classpath but can be
* overridden using the property <code>"db.ignore.external.exemptions.file"</code> in <code>./conf/nutch-*.xml</code>
*</p>
*
* The exemption rules are specified in a plain text file where each line is a rule.
* The format is the same as {@code regex-urlfilter.txt}.
* Each non-comment, non-blank line contains a regular expression
* prefixed by '+' or '-'. The first matching pattern in the file
* determines whether a URL is exempted or ignored. If no pattern
* matches, the URL is ignored.
*
* @since Feb 10, 2016
* @version 1
* @see org.apache.nutch.net.URLExemptionFilter
* @see org.apache.nutch.urlfilter.regex.RegexURLFilter
*/
public class ExemptionUrlFilter extends RegexURLFilter
    implements URLExemptionFilter {

  /** Configuration property that names the resource holding the exemption rules. */
  public static final String DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE
      = "db.ignore.external.exemptions.file";

  /**
   * Rules resource used when {@link #DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE} is
   * not present in the configuration.
   */
  private static final String DEFAULT_EXEMPTIONS_FILE
      = "db-ignore-external-exemptions.txt";

  // NOTE(review): nothing in this class ever assigns this field, so
  // getExemptions() currently returns null -- confirm whether it is still needed.
  private List<Pattern> exemptions;

  /**
   * Returns the exemption pattern list held by this filter.
   *
   * @return the stored list; {@code null} for as long as the field stays unassigned
   */
  public List<Pattern> getExemptions() {
    return exemptions;
  }

  /**
   * Decides whether the link target is exempted.
   *
   * @param fromUrl the source URL of the link; not consulted by this implementation
   * @param toUrl the target URL of the link; the regex rules are applied to it
   * @return {@code true} when the inherited single-argument {@code filter(String)}
   *         returns a non-null value for {@code toUrl}, {@code false} otherwise
   */
  @Override
  public boolean filter(String fromUrl, String toUrl) {
    // Only the target URL takes part in the decision; fromUrl is ignored.
    return this.filter(toUrl) != null;
  }

  /**
   * Gets the reader for the regex exemption rules.
   *
   * <p>The resource name is read from {@link #DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE};
   * when that property is not set, {@code db-ignore-external-exemptions.txt}
   * is used instead of handing a {@code null} name to the configuration.
   *
   * @param conf the configuration that supplies the property and resolves the resource
   * @return whatever {@code conf.getConfResourceAsReader} yields for the rules resource
   * @throws IOException declared for the benefit of callers; not thrown directly here
   */
  protected Reader getRulesReader(Configuration conf)
      throws IOException {
    String fileRules =
        conf.get(DB_IGNORE_EXTERNAL_EXEMPTIONS_FILE, DEFAULT_EXEMPTIONS_FILE);
    return conf.getConfResourceAsReader(fileRules);
  }

  /**
   * Command-line check: prints whether the given URL is exempted.
   *
   * @param args exactly one element, the URL to test; any other argument
   *             count prints a usage message and returns
   */
  public static void main(String[] args) {
    if (args.length != 1) {
      System.out.println("Error: Invalid Args");
      System.out.println("Usage: " +
          ExemptionUrlFilter.class.getName() + " <url>");
      return;
    }
    String url = args[0];
    ExemptionUrlFilter instance = new ExemptionUrlFilter();
    instance.setConf(NutchConfiguration.create());
    // There is no source URL on the command line; filter() ignores it anyway.
    System.out.println(instance.filter(null, url));
  }
}