NUTCH-3043 Generator: count URLs rejected by URL filters (#814)
- add counters URL_FILTERS_REJECTED and URL_FILTER_EXCEPTION
- simplify logging statement
- remove unnecessary cast
- use parameterized logging
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 33f743a..f57642a 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -224,9 +224,12 @@
// If filtering is on don't generate URLs that don't pass
// URLFilters
try {
- if (filters.filter(url.toString()) == null)
+ if (filters.filter(url.toString()) == null) {
+ context.getCounter("Generator", "URL_FILTERS_REJECTED").increment(1);
return;
+ }
} catch (URLFilterException e) {
+ context.getCounter("Generator", "URL_FILTER_EXCEPTION").increment(1);
LOG.warn("Couldn't filter url: {} ({})", url, e.getMessage());
}
}
@@ -253,10 +256,7 @@
try {
sort = scfilters.generatorSortValue(key, crawlDatum, sort);
} catch (ScoringFilterException sfe) {
- if (LOG.isWarnEnabled()) {
- LOG.warn(
- "Couldn't filter generatorSortValue for " + key + ": " + sfe);
- }
+ LOG.warn("Couldn't filter generatorSortValue for {}: {}", key, sfe);
}
// check expr
@@ -625,7 +625,7 @@
// make later bytes more significant in hash code, so that sorting
// by hashcode correlates less with by-host ordering.
for (int i = length - 1; i >= 0; i--)
- hash = (31 * hash) + (int) bytes[start + i];
+ hash = (31 * hash) + bytes[start + i];
return hash;
}
}