NUTCH-2738 Generator: document property generate.restrict.status
- add generate.restrict.status to nutch-default.xml
- get status (byte) from status name in setConf()
to speed up comparison in SelectorMapper
diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index 6bbf7dd..ca3a949 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -904,6 +904,13 @@
See https://issues.apache.org/jira/browse/NUTCH-2368</description>
</property>
+<property>
+ <name>generate.restrict.status</name>
+ <value></value>
+ <description>Select only entries whose status name equals this value
+ (case-insensitive); leave empty to disable the status restriction. See https://issues.apache.org/jira/browse/NUTCH-1248</description>
+</property>
+
<!-- urlpartitioner properties -->
<property>
diff --git a/src/java/org/apache/nutch/crawl/CrawlDatum.java b/src/java/org/apache/nutch/crawl/CrawlDatum.java
index 66a6fff..e05d7fd 100644
--- a/src/java/org/apache/nutch/crawl/CrawlDatum.java
+++ b/src/java/org/apache/nutch/crawl/CrawlDatum.java
@@ -185,6 +185,27 @@
return res;
}
+ /**
+ * Returns the status code registered under the given status name.
+ * Names are compared case-insensitively against the entries of statNames.
+ *
+ * @param name status name to look up, may be null
+ * @return the matching status code, or -1 if name is null or matches no
+ * registered status name
+ */
+ public static byte getStatusByName(String name) {
+ if (name == null) {
+ // Nothing can match; avoids a NullPointerException in the loop below
+ return -1;
+ }
+ for (Entry<Byte, String> status : statNames.entrySet()) {
+ if (name.equalsIgnoreCase(status.getValue())) {
+ return status.getKey();
+ }
+ }
+ return -1;
+ }
+
public void setStatus(int status) {
this.status = (byte) status;
}
diff --git a/src/java/org/apache/nutch/crawl/Generator.java b/src/java/org/apache/nutch/crawl/Generator.java
index 3aa6dd6..555c42e 100644
--- a/src/java/org/apache/nutch/crawl/Generator.java
+++ b/src/java/org/apache/nutch/crawl/Generator.java
@@ -181,7 +181,8 @@
private FetchSchedule schedule;
private float scoreThreshold = 0f;
private int intervalThreshold = -1;
- private String restrictStatus = null;
+ // Status code to select; -1 means no status restriction is configured
+ private byte restrictStatus = -1;
private Expression expr = null;
@Override
@@ -198,7 +199,17 @@
schedule = FetchScheduleFactory.getFetchSchedule(conf);
scoreThreshold = conf.getFloat(GENERATOR_MIN_SCORE, Float.NaN);
intervalThreshold = conf.getInt(GENERATOR_MIN_INTERVAL, -1);
- restrictStatus = conf.get(GENERATOR_RESTRICT_STATUS, null);
+ String restrictStatusString = conf.getTrimmed(GENERATOR_RESTRICT_STATUS, "");
+ if (!restrictStatusString.isEmpty()) {
+ restrictStatus = CrawlDatum.getStatusByName(restrictStatusString);
+ if (restrictStatus == -1) {
+ // Fail fast: -1 doubles as the "no restriction" marker, so an
+ // unknown name would otherwise silently select every status
+ throw new IllegalArgumentException(String.format(
+ "Unknown status name '%s' in %s", restrictStatusString,
+ GENERATOR_RESTRICT_STATUS));
+ }
+ }
expr = JexlUtil.parseExpression(conf.get(GENERATOR_EXPR, null));
}
@@ -252,9 +263,7 @@
}
}
- if (restrictStatus != null
- && !restrictStatus
- .equalsIgnoreCase(CrawlDatum.getStatusName(crawlDatum.getStatus()))) {
+ if (restrictStatus != -1 && restrictStatus != crawlDatum.getStatus()) {
context.getCounter("Generator", "STATUS_REJECTED").increment(1);
return;
}