NUTCH-2737 Generator: count and log reason of rejections during selection
- add counters for rejections in Generator's SelectorMapper
- parameterize log messages to simplify code
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index bc6a3aa..3aa6dd6 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -20,12 +20,14 @@
 import java.io.DataOutput;
 import java.io.IOException;
 import java.lang.invoke.MethodHandles;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Random;
 
@@ -35,6 +37,7 @@
 import org.apache.commons.jexl2.Expression;
 import org.apache.commons.jexl2.JexlContext;
 import org.apache.commons.jexl2.MapContext;
+import org.apache.hadoop.mapreduce.Counter;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.Reducer;
@@ -211,18 +214,16 @@
             if (filters.filter(url.toString()) == null)
               return;
           } catch (URLFilterException e) {
-            if (LOG.isWarnEnabled()) {
-              LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage()
-                  + ")");
-            }
+            LOG.warn("Couldn't filter url: {} ({})", url, e.getMessage());
           }
         }
         CrawlDatum crawlDatum = value;
 
         // check fetch schedule
         if (!schedule.shouldFetch(url, crawlDatum, curTime)) {
-          LOG.debug("-shouldFetch rejected '" + url + "', fetchTime="
-              + crawlDatum.getFetchTime() + ", curTime=" + curTime);
+          LOG.debug("-shouldFetch rejected '{}', fetchTime={}, curTime={}", url,
+              crawlDatum.getFetchTime(), curTime);
+          context.getCounter("Generator", "SCHEDULE_REJECTED").increment(1);
           return;
         }
 
@@ -231,6 +232,8 @@
         if (oldGenTime != null) { // awaiting fetch & update
-          if (oldGenTime.get() + genDelay > curTime) // still wait for
+          if (oldGenTime.get() + genDelay > curTime) { // still wait for
             // update
+            context.getCounter("Generator", "WAIT_FOR_UPDATE").increment(1);
             return;
+          }
         }
         float sort = 1.0f;
@@ -245,24 +247,31 @@
         // check expr
         if (expr != null) {
           if (!crawlDatum.evaluate(expr, key.toString())) {
+            context.getCounter("Generator", "EXPR_REJECTED").increment(1);
             return;
           }
         }
 
         if (restrictStatus != null
-            && !restrictStatus.equalsIgnoreCase(CrawlDatum
-                .getStatusName(crawlDatum.getStatus())))
+            && !restrictStatus
+                .equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) {
+          context.getCounter("Generator", "STATUS_REJECTED").increment(1);
           return;
+        }
 
         // consider only entries with a score superior to the threshold
-        if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold)
+        if (!Float.isNaN(scoreThreshold) && sort < scoreThreshold) {
+          context.getCounter("Generator", "SCORE_TOO_LOW").increment(1);
           return;
+        }
 
         // consider only entries with a retry (or fetch) interval lower than
         // threshold
         if (intervalThreshold != -1
-            && crawlDatum.getFetchInterval() > intervalThreshold)
+            && crawlDatum.getFetchInterval() > intervalThreshold) {
+          context.getCounter("Generator", "INTERVAL_REJECTED").increment(1);
           return;
+        }
 
         // sort by decreasing score, using DecreasingFloatComparator
         sortValue.set(sort);
@@ -456,11 +465,11 @@
             if (byDomain) {
               hostordomain = URLUtil.getDomainName(u);
             } else {
-              hostordomain = new URL(urlString).getHost();
+              hostordomain = u.getHost();
             }
-          } catch (Exception e) {
-            LOG.warn("Malformed URL: '" + urlString + "', skipping ("
-                + StringUtils.stringifyException(e) + ")");
+          } catch (MalformedURLException e) {
+            LOG.warn("Malformed URL: '{}', skipping ({})", urlString,
+                StringUtils.stringifyException(e));
             context.getCounter("Generator", "MALFORMED_URL").increment(1);
             continue;
           }
@@ -493,13 +502,9 @@
                 hostCount[1] = 1;
               } else {
                 if (hostCount[1] == maxCount && LOG.isInfoEnabled()) {
-                  LOG.info("Host or domain "
-                      + hostordomain
-                      + " has more than "
-                      + maxCount
-                      + " URLs for all "
-                      + maxNumSegments
-                      + " segments. Additional URLs won't be included in the fetchlist.");
+                  LOG.info(
+                      "Host or domain {} has more than {} URLs for all {} segments. Additional URLs won't be included in the fetchlist.",
+                      hostordomain, maxCount, maxNumSegments);
                 }
                 // skip this entry
                 continue;
@@ -804,6 +809,13 @@
       throw e;
     }
 
+    LOG.info("Generator: number of items rejected during selection:");
+    for (Counter counter : job.getCounters().getGroup("Generator")) {
+      LOG.info("Generator: {}  {}",
+          String.format(Locale.ROOT, "%6d", counter.getValue()),
+          counter.getName());
+    }
+
     // read the subdirectories generated in the temp
     // output and turn them into segments
     List<Path> generatedSegments = new ArrayList<>();