NUTCH-2419 Some URL filters and normalizers do not respect command-line override for rule file
- simplify selection of rule file (from property or attribute in plugin.xml)
diff --git a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
index c84f27c..6e86fc6 100644
--- a/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
+++ b/src/plugin/parsefilter-regex/src/java/org/apache/nutch/parsefilter/regex/RegexParseFilter.java
@@ -120,14 +120,8 @@
}
}
- // domain file and attribute "file" take precedence if defined
- String file = conf.get("parsefilter.regex.file");
+ String file = conf.get("parsefilter.regex.file", attributeFile);
String stringRules = conf.get("parsefilter.regex.rules");
- if (file != null) {
- // take file
- } else if (attributeFile != null) {
- file = attributeFile;
- }
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
diff --git a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
index fac02af..f629262 100644
--- a/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
+++ b/src/plugin/urlfilter-domain/src/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
@@ -59,7 +59,7 @@
* such as lucene.apache.org and hadoop.apache.org. The third line would allow
* only URLs from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
- * overridding the more specific.
+ * overriding the more specific.
* </p>
*
* The domain file defaults to domain-urlfilter.txt in the classpath but can be
@@ -130,16 +130,11 @@
// 2. rule file name defined by `urlfilter.domain.file`
// 3. rule file name defined in plugin.xml (`attributeFile`)
String stringRules = conf.get("urlfilter.domain.rules");
- String file = conf.get("urlfilter.domain.file");
+ String file = conf.get("urlfilter.domain.file", attributeFile);
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
- if (file != null) {
- // take file
- } else if (attributeFile != null) {
- file = attributeFile;
- }
LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
diff --git a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
index 56b11e9..77c238b 100644
--- a/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
+++ b/src/plugin/urlfilter-domainblacklist/src/java/org/apache/nutch/urlfilter/domainblacklist/DomainBlacklistURLFilter.java
@@ -59,7 +59,7 @@
* such as lucene.apache.org and hadoop.apache.org. The third line would exclude
* only URLs from www.apache.org. There is no specific ordering to entries. The
* entries are from more general to more specific with the more general
- * overridding the more specific.
+ * overriding the more specific.
* </p>
*
* The domain file defaults to domainblacklist-urlfilter.txt in the classpath
@@ -131,16 +131,11 @@
// 2. rule file name defined by `urlfilter.domainblacklist.file`
// 3. rule file name defined in plugin.xml (`attributeFile`)
String stringRules = conf.get("urlfilter.domainblacklist.rules");
- String file = conf.get("urlfilter.domainblacklist.file");
+ String file = conf.get("urlfilter.domainblacklist.file", attributeFile);
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
- if (file != null) {
- // take file
- } else if (attributeFile != null) {
- file = attributeFile;
- }
LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
diff --git a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
index eeef9cf..61c6f17 100644
--- a/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
+++ b/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java
@@ -142,17 +142,12 @@
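- // 1. string rules defined by `urlfilter.domainblacklist.rules`
- // 2. rule file name defined by `urlfilter.domainblacklist.file`
+ // 1. string rules defined by `urlfilter.prefix.rules`
+ // 2. rule file name defined by `urlfilter.prefix.file`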
// 3. rule file name defined in plugin.xml (`attributeFile`)
- String file = conf.get("urlfilter.prefix.file");
+ String file = conf.get("urlfilter.prefix.file", attributeFile);
String stringRules = conf.get("urlfilter.prefix.rules");
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
- if (file != null) {
- // take file
- } else if (attributeFile != null) {
- file = attributeFile;
- }
LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
diff --git a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
index 55382cc..3833f3c 100644
--- a/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
+++ b/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
@@ -276,17 +276,12 @@
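- // 1. string rules defined by `urlfilter.domainblacklist.rules`
- // 2. rule file name defined by `urlfilter.domainblacklist.file`
+ // 1. string rules defined by `urlfilter.suffix.rules`
+ // 2. rule file name defined by `urlfilter.suffix.file`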
// 3. rule file name defined in plugin.xml (`attributeFile`)
- String file = conf.get("urlfilter.suffix.file");
+ String file = conf.get("urlfilter.suffix.file", attributeFile);
String stringRules = conf.get("urlfilter.suffix.rules");
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
} else {
- if (file != null) {
- // take file
- } else if (attributeFile != null) {
- file = attributeFile;
- }
LOG.info("Reading {} rules file {}", pluginName, file);
reader = conf.getConfResourceAsReader(file);
}
diff --git a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
index 4506c85..3a3c8a4 100644
--- a/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
+++ b/src/plugin/urlnormalizer-host/src/java/org/apache/nutch/net/urlnormalizer/host/HostURLNormalizer.java
@@ -118,13 +118,8 @@
// 1. string rules defined by `urlnormalizer.hosts.rules`
- // 2. rule file name defined by `urlnormalizer.hosts.file"`
+ // 2. rule file name defined by `urlnormalizer.hosts.file`
// 3. rule file name defined in plugin.xml (`attributeFile`)
- String file = conf.get("urlnormalizer.hosts.file");
+ String file = conf.get("urlnormalizer.hosts.file", attributeFile);
String stringRules = conf.get("urlnormalizer.hosts.rules");
- if (file != null) {
- // take file
- } else if (attributeFile != null) {
- file = attributeFile;
- }
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
diff --git a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
index f18ac65..f60c291 100644
--- a/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
+++ b/src/plugin/urlnormalizer-protocol/src/java/org/apache/nutch/net/urlnormalizer/protocol/ProtocolURLNormalizer.java
@@ -124,13 +124,8 @@
// 1. string rules defined by `urlnormalizer.protocols.rules`
- // 2. rule file name defined by `urlnormalizer.protocols.file"`
+ // 2. rule file name defined by `urlnormalizer.protocols.file`
// 3. rule file name defined in plugin.xml (`attributeFile`)
- String file = conf.get("urlnormalizer.protocols.file");
+ String file = conf.get("urlnormalizer.protocols.file", attributeFile);
String stringRules = conf.get("urlnormalizer.protocols.rules");
- if (file != null) {
- // take file
- } else if (attributeFile != null) {
- file = attributeFile;
- }
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);
diff --git a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
index 6e8b7b9..2570427 100644
--- a/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
+++ b/src/plugin/urlnormalizer-slash/src/java/org/apache/nutch/net/urlnormalizer/slash/SlashURLNormalizer.java
@@ -130,13 +130,8 @@
// 1. string rules defined by `urlnormalizer.slashes.rules`
- // 2. rule file name defined by `urlnormalizer.slashes.file"`
+ // 2. rule file name defined by `urlnormalizer.slashes.file`
// 3. rule file name defined in plugin.xml (`attributeFile`)
- String file = conf.get("urlnormalizer.slashes.file");
+ String file = conf.get("urlnormalizer.slashes.file", attributeFile);
String stringRules = conf.get("urlnormalizer.slashes.rules");
- if (file != null) {
- // take file
- } else if (attributeFile != null) {
- file = attributeFile;
- }
Reader reader = null;
if (stringRules != null) { // takes precedence over files
reader = new StringReader(stringRules);