NUTCH-2068 Allow subcollection overrides via metadata
diff --git a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
index c4b8b31..df12e4f 100644
--- a/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
+++ b/src/plugin/subcollection/src/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -57,6 +57,7 @@
public void setConf(Configuration conf) {
this.conf = conf;
fieldName = conf.get("subcollection.default.fieldname", "subcollection"); // fieldName is static, so this assignment is visible to every instance
+ metadataSource = conf.get("subcollection.metadata.source", "subcollection"); // also static; filter() passes it to parse.getData().getMeta() as the lookup key
}
/**
@@ -70,6 +71,11 @@
* Doc field name
*/
public static String fieldName = "subcollection";
+
+ /**
+ * Parse metadata key whose trimmed, non-empty value is indexed as the subcollection instead of the URL-based one
+ */
+ public static String metadataSource = "subcollection";
/**
* Logger
@@ -96,6 +102,17 @@
public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+ // Check for subcollection override in HTML metadata
+ String subcollection = parse.getData().getMeta(metadataSource);
+ if (subcollection != null) {
+ subcollection = subcollection.trim();
+
+ if (subcollection.length() > 0) {
+ doc.add(fieldName, subcollection);
+ return doc;
+ }
+ }
+
String sUrl = url.toString();
addSubCollectionField(doc, sUrl);
return doc;