Merge pull request #526 from sebastian-nagel/NUTCH-2419-urlfilter-rule-file-precedence
NUTCH-2419 Some URL filters and normalizers do not respect command-line override for rule file
diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
index 3c43cf5..6e86fc6 100644
--- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
+++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -51,20 +51,11 @@
private static final Logger LOG = LoggerFactory
.getLogger(MethodHandles.lookup().lookupClass());
private static String attributeFile = null;
- private String regexFile = null;
private Configuration conf;
private static final Map<String,RegexRule> rules = new HashMap<>();
- public RegexParseFilter() {
- //default constructor
- }
-
- public RegexParseFilter(String regexFile) {
- this.regexFile = regexFile;
- }
-
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
Parse parse = parseResult.get(content.getUrl());
String html = new String(content.getContent());
@@ -129,15 +120,8 @@
}
}
- // domain file and attribute "file" take precedence if defined
- String file = conf.get("parsefilter.regex.file");
+ String file = conf.get("parsefilter.regex.file", attributeFile);
String stringRules = conf.get("parsefilter.regex.rules");
- if (regexFile != null) {
- file = regexFile;
- }
- else if (attributeFile != null) {
- file = attributeFile;
- }
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
diff --git a/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java b/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
index 238d300..64fa7f6 100644
--- a/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
+++ b/src/plugin/parsefilter-regex/src/test/org/apache/nutch/parsefilter/regex/TestRegexParseFilter.java
@@ -35,7 +35,8 @@
Configuration conf = NutchConfiguration.create();
String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
- RegexParseFilter filter = new RegexParseFilter(file);
+ conf.set("parsefilter.regex.file", file);
+ RegexParseFilter filter = new RegexParseFilter();
filter.setConf(conf);
String url = "http://nutch.apache.org/";
@@ -56,7 +57,8 @@
Configuration conf = NutchConfiguration.create();
String file = SAMPLES + SEPARATOR + "regex-parsefilter.txt";
- RegexParseFilter filter = new RegexParseFilter(file);
+ conf.set("parsefilter.regex.file", file);
+ RegexParseFilter filter = new RegexParseFilter();
filter.setConf(conf);
String url = "http://nutch.apache.org/";
diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
index 9e2e2e7..f629262 100644
--- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
+++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
@@ -38,26 +38,28 @@
/**
* <p>
* Filters URLs based on a file containing domain suffixes, domain names, and
- * hostnames. Only a url that matches one of the suffixes, domains, or hosts
+ * hostnames. Only a URL that matches one of the suffixes, domains, or hosts
* present in the file is allowed.
* </p>
*
* <p>
- * Urls are checked in order of domain suffix, domain name, and hostname against
+ * URLs are checked in order of domain suffix, domain name, and hostname against
* entries in the domain file. The domain file would be setup as follows with
* one entry per line:
*
* <pre>
- * com apache.org www.apache.org
+ * com
+ * apache.org
+ * www.apache.org
* </pre>
*
* <p>
* The first line is an example of a filter that would allow all .com domains.
- * The second line allows all urls from apache.org and all of its subdomains
+ * The second line allows all URLs from apache.org and all of its subdomains
* such as lucene.apache.org and hadoop.apache.org. The third line would allow
- * only urls from www.apache.org. There is no specific ordering to entries. The
+ * only URLs from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
- * overridding the more specific.
+ * overriding the more specific.
* </p>
*
* The domain file defaults to domain-urlfilter.txt in the classpath but can be
@@ -72,7 +74,6 @@
* </li>
* </ul>
*
- * the attribute "file" has higher precedence if defined.
*/
public class DomainURLFilter implements URLFilter {
@@ -82,7 +83,6 @@
// read in attribute "file" of this plugin.
private static String attributeFile = null;
private Configuration conf;
- private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();
private void readConfiguration(Reader configReader) throws IOException {
@@ -99,23 +99,6 @@
}
/**
- * Default constructor.
- */
- public DomainURLFilter() {
-
- }
-
- /**
- * Constructor that specifies the domain file to use.
- *
- * @param domainFile
- * The domain file, overrides domain-urlfilter.text default.
- */
- public DomainURLFilter(String domainFile) {
- this.domainFile = domainFile;
- }
-
- /**
* Sets the configuration.
*/
public void setConf(Configuration conf) {
@@ -133,44 +116,36 @@
}
}
- // handle blank non empty input
- if (attributeFile != null && attributeFile.trim().equals("")) {
+ if (attributeFile != null && attributeFile.trim().isEmpty()) {
attributeFile = null;
}
if (attributeFile != null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Attribute \"file\" is defined for plugin " + pluginName
- + " as " + attributeFile);
- }
- } else {
- if (LOG.isWarnEnabled()) {
- LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
- + pluginName);
- }
+ LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, attributeFile);
}
- // domain file and attribute "file" take precedence if defined
- String file = conf.get("urlfilter.domain.file");
+ // precedence hierarchy for definition of filter rules
+ // (first non-empty definition takes precedence):
+ // 1. string rules defined by `urlfilter.domain.rules`
+ // 2. rule file name defined by `urlfilter.domain.file`
+ // 3. rule file name defined in plugin.xml (`attributeFile`)
String stringRules = conf.get("urlfilter.domain.rules");
- if (domainFile != null) {
- file = domainFile;
- } else if (attributeFile != null) {
- file = attributeFile;
- }
+ String file = conf.get("urlfilter.domain.file", attributeFile);
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
+ LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
if (reader == null) {
+ // read local file
reader = new FileReader(file);
}
readConfiguration(reader);
} catch (IOException e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}
diff --git a/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java b/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
index 0be1e31..7878aa1 100644
--- a/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
+++ b/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java
@@ -31,7 +31,8 @@
String domainFile = SAMPLES + SEPARATOR + "hosts.txt";
Configuration conf = NutchConfiguration.create();
- DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+ conf.set("urlfilter.domain.file", domainFile);
+ DomainURLFilter domainFilter = new DomainURLFilter();
domainFilter.setConf(conf);
Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
@@ -50,7 +51,8 @@
// https://issues.apache.org/jira/browse/NUTCH-2189
String domainFile = SAMPLES + SEPARATOR + "this-file-does-not-exist.txt";
Configuration conf = NutchConfiguration.create();
- DomainURLFilter domainFilter = new DomainURLFilter(domainFile);
+ conf.set("urlfilter.domain.file", domainFile);
+ DomainURLFilter domainFilter = new DomainURLFilter();
domainFilter.setConf(conf);
Assert.assertNotNull(domainFilter.filter("http://lucene.apache.org"));
Assert.assertNotNull(domainFilter.filter("http://hadoop.apache.org"));
diff --git a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
index 452f6d4..77c238b 100644
--- a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
+++ b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
@@ -38,26 +38,28 @@
/**
* <p>
* Filters URLs based on a file containing domain suffixes, domain names, and
- * hostnames. A url that matches one of the suffixes, domains, or hosts present
+ * hostnames. A URL that matches one of the suffixes, domains, or hosts present
* in the file is filtered out.
* </p>
*
* <p>
- * Urls are checked in order of domain suffix, domain name, and hostname against
+ * URLs are checked in order of domain suffix, domain name, and hostname against
* entries in the domain file. The domain file would be setup as follows with
* one entry per line:
*
* <pre>
- * com apache.org www.apache.org
+ * com
+ * apache.org
+ * www.apache.org
* </pre>
*
* <p>
- * The first line is an example of a filter that would allow all .com domains.
- * The second line allows all urls from apache.org and all of its subdomains
- * such as lucene.apache.org and hadoop.apache.org. The third line would allow
- * only urls from www.apache.org. There is no specific ordering to entries. The
+ * The first line is an example of a filter that would exclude all .com domains.
+ * The second line excludes all URLs from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would exclude
+ * only URLs from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
- * overridding the more specific.
+ * overriding the more specific.
* </p>
*
* The domain file defaults to domainblacklist-urlfilter.txt in the classpath
@@ -72,7 +74,6 @@
* </li>
* </ul>
*
- * the attribute "file" has higher precedence if defined.
*/
public class DomainBlacklistURLFilter implements URLFilter {
@@ -82,7 +83,6 @@
// read in attribute "file" of this plugin.
private static String attributeFile = null;
private Configuration conf;
- private String domainFile = null;
private Set<String> domainSet = new LinkedHashSet<String>();
private void readConfiguration(Reader configReader) throws IOException {
@@ -99,23 +99,6 @@
}
/**
- * Default constructor.
- */
- public DomainBlacklistURLFilter() {
-
- }
-
- /**
- * Constructor that specifies the domain file to use.
- *
- * @param domainFile
- * The domain file, overrides domainblacklist-urlfilter.text default.
- */
- public DomainBlacklistURLFilter(String domainFile) {
- this.domainFile = domainFile;
- }
-
- /**
* Sets the configuration.
*/
public void setConf(Configuration conf) {
@@ -133,44 +116,37 @@
}
}
- // handle blank non empty input
- if (attributeFile != null && attributeFile.trim().equals("")) {
+ if (attributeFile != null && attributeFile.trim().isEmpty()) {
attributeFile = null;
}
if (attributeFile != null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Attribute \"file\" is defined for plugin " + pluginName
- + " as " + attributeFile);
- }
- } else {
- if (LOG.isWarnEnabled()) {
- LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
- + pluginName);
- }
+ LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName,
+ attributeFile);
}
- // domain file and attribute "file" take precedence if defined
- String file = conf.get("urlfilter.domainblacklist.file");
+ // precedence hierarchy for definition of filter rules
+ // (first non-empty definition takes precedence):
+ // 1. string rules defined by `urlfilter.domainblacklist.rules`
+ // 2. rule file name defined by `urlfilter.domainblacklist.file`
+ // 3. rule file name defined in plugin.xml (`attributeFile`)
String stringRules = conf.get("urlfilter.domainblacklist.rules");
- if (domainFile != null) {
- file = domainFile;
- } else if (attributeFile != null) {
- file = attributeFile;
- }
+ String file = conf.get("urlfilter.domainblacklist.file", attributeFile);
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
+ LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
if (reader == null) {
+ // read local file
reader = new FileReader(file);
}
readConfiguration(reader);
} catch (IOException e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}
diff --git a/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java b/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
index d253867..9ab207a 100644
--- a/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
+++ b/src/plugin/urlfilter-domainblacklist/src/test/org/apache/nutch/urlfilter/domainblacklist/TestDomainBlacklistURLFilter.java
@@ -31,8 +31,8 @@
String domainBlacklistFile = SAMPLES + SEPARATOR + "hosts.txt";
Configuration conf = NutchConfiguration.create();
- DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter(
- domainBlacklistFile);
+ conf.set("urlfilter.domainblacklist.file", domainBlacklistFile);
+ DomainBlacklistURLFilter domainBlacklistFilter = new DomainBlacklistURLFilter();
domainBlacklistFilter.setConf(conf);
Assert.assertNull(domainBlacklistFilter.filter("http://lucene.apache.org"));
Assert.assertNull(domainBlacklistFilter.filter("http://hadoop.apache.org"));
diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
index d1d5caa..61c6f17 100644
--- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
+++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
@@ -39,9 +39,8 @@
/**
* Filters URLs based on a file of URL prefixes. The file is named by (1)
- * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, and (2)
- * attribute "file" in plugin.xml of this plugin Attribute "file" has higher
- * precedence if defined.
+ * property "urlfilter.prefix.file" in ./conf/nutch-default.xml, or (2)
+ * the attribute "file" in plugin.xml of this plugin.
*
* <p>
* The format of this file is one URL prefix per line.
@@ -129,43 +128,39 @@
break;
}
}
- if (attributeFile != null && attributeFile.trim().equals(""))
+
+ if (attributeFile != null && attributeFile.trim().isEmpty()) {
attributeFile = null;
- if (attributeFile != null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Attribute \"file\" is defined for plugin " + pluginName
- + " as " + attributeFile);
- }
- } else {
- // if (LOG.isWarnEnabled()) {
- // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
- // plugin "+pluginName);
- // }
}
- String file = conf.get("urlfilter.prefix.file");
+ if (attributeFile != null) {
+ LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, attributeFile);
+ }
+
+ // precedence hierarchy for definition of filter rules
+ // (first non-empty definition takes precedence):
+ // 1. string rules defined by `urlfilter.prefix.rules`
+ // 2. rule file name defined by `urlfilter.prefix.file`
+ // 3. rule file name defined in plugin.xml (`attributeFile`)
+ String file = conf.get("urlfilter.prefix.file", attributeFile);
String stringRules = conf.get("urlfilter.prefix.rules");
- // attribute "file" takes precedence if defined
- if (attributeFile != null)
- file = attributeFile;
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
+ LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
if (reader == null) {
+ LOG.warn("Missing {} rule file '{}': all URLs will be rejected!",
+ pluginName, file);
trie = new PrefixStringMatcher(new String[0]);
} else {
try {
trie = readConfiguration(reader);
} catch (IOException e) {
- if (LOG.isErrorEnabled()) {
- LOG.error(e.getMessage());
- }
- // TODO mb@media-style.com: throw Exception? Because broken api.
- throw new RuntimeException(e.getMessage(), e);
+ LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}
}
diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
index df5a5d8..3833f3c 100644
--- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
+++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
@@ -47,8 +47,7 @@
* <li>property "urlfilter.suffix.file" in ./conf/nutch-default.xml, and</li>
* <li>attribute "file" in plugin.xml of this plugin</li>
* </ol>
- * Attribute "file" has higher precedence if defined. If the config file is
- * missing, all URLs will be rejected.
+ * If the config file is missing, all URLs will be rejected.
*
* <p>
* This filter can be configured to work in one of two modes:
@@ -177,9 +176,7 @@
// handle missing config file
if (reader == null) {
- if (LOG.isWarnEnabled()) {
- LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!");
- }
+ LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!");
suffixes = new SuffixStringMatcher(new String[0]);
modeAccept = false;
ignoreCase = false;
@@ -265,39 +262,34 @@
break;
}
}
- if (attributeFile != null && attributeFile.trim().equals(""))
+
+ if (attributeFile != null && attributeFile.trim().isEmpty()) {
attributeFile = null;
- if (attributeFile != null) {
- if (LOG.isInfoEnabled()) {
- LOG.info("Attribute \"file\" is defined for plugin " + pluginName
- + " as " + attributeFile);
- }
- } else {
- // if (LOG.isWarnEnabled()) {
- // LOG.warn("Attribute \"file\" is not defined in plugin.xml for
- // plugin "+pluginName);
- // }
}
- String file = conf.get("urlfilter.suffix.file");
+ if (attributeFile != null) {
+ LOG.info("Attribute \"file\" is defined for plugin {} as {}", pluginName, attributeFile);
+ }
+
+ // precedence hierarchy for definition of filter rules
+ // (first non-empty definition takes precedence):
+ // 1. string rules defined by `urlfilter.suffix.rules`
+ // 2. rule file name defined by `urlfilter.suffix.file`
+ // 3. rule file name defined in plugin.xml (`attributeFile`)
+ String file = conf.get("urlfilter.suffix.file", attributeFile);
String stringRules = conf.get("urlfilter.suffix.rules");
- // attribute "file" takes precedence if defined
- if (attributeFile != null)
- file = attributeFile;
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
+ LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
readConfiguration(reader);
} catch (IOException e) {
- if (LOG.isErrorEnabled()) {
- LOG.error(e.getMessage());
- }
- throw new RuntimeException(e.getMessage(), e);
+ LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}
diff --git a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
index ef83284..3a3c8a4 100644
--- a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
+++ b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
@@ -53,16 +53,8 @@
.getLogger(MethodHandles.lookup().lookupClass());
private static String attributeFile = null;
- private String hostsFile = null;
private static final HashMap<String, String> hostsMap = new HashMap<String, String>();
- public HostURLNormalizer() {
- }
-
- public HostURLNormalizer(String hostsFile) {
- this.hostsFile = hostsFile;
- }
-
private synchronized void readConfiguration(Reader configReader)
throws IOException {
if (hostsMap.size() > 0) {
@@ -121,18 +113,18 @@
}
}
- // domain file and attribute "file" take precedence if defined
- String file = conf.get("urlnormalizer.hosts.file");
+ // precedence hierarchy for definition of normalizer rules
+ // (first non-empty definition takes precedence):
+ // 1. string rules defined by `urlnormalizer.hosts.rules`
+ // 2. rule file name defined by `urlnormalizer.hosts.file`
+ // 3. rule file name defined in plugin.xml (`attributeFile`)
+ String file = conf.get("urlnormalizer.hosts.file", attributeFile);
String stringRules = conf.get("urlnormalizer.hosts.rules");
- if (hostsFile != null) {
- file = hostsFile;
- } else if (attributeFile != null) {
- file = attributeFile;
- }
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
+ LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
@@ -141,7 +133,7 @@
}
readConfiguration(reader);
} catch (IOException e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}
diff --git a/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
index c9e1a2c..68cb50a 100644
--- a/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
+++ b/src/plugin/urlnormalizer-host/src/test/org/apache/nutch/net/urlnormalizer/host/TestHostURLNormalizer.java
@@ -32,7 +32,8 @@
Configuration conf = NutchConfiguration.create();
String hostsFile = SAMPLES + SEPARATOR + "hosts.txt";
- HostURLNormalizer normalizer = new HostURLNormalizer(hostsFile);
+ conf.set("urlnormalizer.hosts.file", hostsFile);
+ HostURLNormalizer normalizer = new HostURLNormalizer();
normalizer.setConf(conf);
// Force www. sub domain when hitting link without sub domain
diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index 12ecbf4..f60c291 100644
--- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -50,18 +50,11 @@
private static final String PROTOCOL_DELIMITER = "://";
private static String attributeFile = null;
- private String protocolsFile = null;
// We record a map of hosts and boolean, the boolean denotes whether the host should
// have slashes after URL paths. True means slash, false means remove the slash
private static final Map<String,String> protocolsMap = new HashMap<String,String>();
- public ProtocolURLNormalizer() {}
-
- public ProtocolURLNormalizer(String protocolsFile) {
- this.protocolsFile = protocolsFile;
- }
-
private synchronized void readConfiguration(Reader configReader) throws IOException {
if (protocolsMap.size() > 0) {
return;
@@ -126,19 +119,18 @@
}
}
- // domain file and attribute "file" take precedence if defined
- String file = conf.get("urlnormalizer.protocols.file");
+ // precedence hierarchy for definition of normalizer rules
+ // (first non-empty definition takes precedence):
+ // 1. string rules defined by `urlnormalizer.protocols.rules`
+ // 2. rule file name defined by `urlnormalizer.protocols.file`
+ // 3. rule file name defined in plugin.xml (`attributeFile`)
+ String file = conf.get("urlnormalizer.protocols.file", attributeFile);
String stringRules = conf.get("urlnormalizer.protocols.rules");
- if (protocolsFile != null) {
- file = protocolsFile;
- }
- else if (attributeFile != null) {
- file = attributeFile;
- }
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
+ LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
@@ -148,7 +140,7 @@
readConfiguration(reader);
}
catch (IOException e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}
diff --git a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
index 22005ce..1b9760b 100644
--- a/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/test/org/apache/nutch/net/urlnormalizer/protocol/TestProtocolURLNormalizer.java
@@ -31,7 +31,8 @@
Configuration conf = NutchConfiguration.create();
String protocolsFile = SAMPLES + SEPARATOR + "protocols.txt";
- ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer(protocolsFile);
+ conf.set("urlnormalizer.protocols.file", protocolsFile);
+ ProtocolURLNormalizer normalizer = new ProtocolURLNormalizer();
normalizer.setConf(conf);
// No change
diff --git a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
index 8d05f5e..2570427 100644
--- a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
+++ b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
@@ -52,20 +52,11 @@
private static final String PROTOCOL_DELIMITER = "://";
private static String attributeFile = null;
- private String slashesFile = null;
// We record a map of hosts and boolean, the boolean denotes whether the host should
// have slashes after URL paths. True means slash, false means remove the slash
private static final Map<String,Boolean> slashesMap = new HashMap<>();
- public SlashURLNormalizer() {
- //default constructor
- }
-
- public SlashURLNormalizer(String slashesFile) {
- this.slashesFile = slashesFile;
- }
-
private synchronized void readConfiguration(Reader configReader) throws IOException {
if (slashesMap.size() > 0) {
return;
@@ -134,19 +125,18 @@
}
}
- // domain file and attribute "file" take precedence if defined
- String file = conf.get("urlnormalizer.slashes.file");
+ // precedence hierarchy for definition of normalizer rules
+ // (first non-empty definition takes precedence):
+ // 1. string rules defined by `urlnormalizer.slashes.rules`
+ // 2. rule file name defined by `urlnormalizer.slashes.file`
+ // 3. rule file name defined in plugin.xml (`attributeFile`)
+ String file = conf.get("urlnormalizer.slashes.file", attributeFile);
String stringRules = conf.get("urlnormalizer.slashes.rules");
- if (slashesFile != null) {
- file = slashesFile;
- }
- else if (attributeFile != null) {
- file = attributeFile;
- }
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
+ LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
try {
@@ -156,7 +146,7 @@
readConfiguration(reader);
}
catch (IOException e) {
- LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+ LOG.error("Error reading " + pluginName + " rule file " + file, e);
}
}
diff --git a/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java b/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
index c5b3897..54af2bf 100644
--- a/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
+++ b/src/plugin/urlnormalizer-slash/src/test/org/apache/nutch/net/urlnormalizer/slash/TestSlashURLNormalizer.java
@@ -31,7 +31,8 @@
Configuration conf = NutchConfiguration.create();
String slashesFile = SAMPLES + SEPARATOR + "slashes.txt";
- SlashURLNormalizer normalizer = new SlashURLNormalizer(slashesFile);
+ conf.set("urlnormalizer.slashes.file", slashesFile);
+ SlashURLNormalizer normalizer = new SlashURLNormalizer();
normalizer.setConf(conf);
// No change