NUTCH-2720 ROBOTS metatag ignored when capitalized
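- parse-tika: when a Tika metadata name matches "robots" case-insensitively, also add its value under the lowercase key "robots" (only if that key is not yet set)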
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index f2461fe..4d9495c 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -218,8 +218,14 @@
if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
continue;
String[] values = tikamd.getValues(tikaMDName);
- for (String v : values)
+ for (String v : values) {
nutchMetadata.add(tikaMDName, v);
+ if (tikaMDName.equalsIgnoreCase("robots")
+ && nutchMetadata.get("robots") == null) {
+ // NUTCH-2720 force lowercase robots directive
+ nutchMetadata.add("robots", v);
+ }
+ }
}
// no outlinks? try OutlinkExtractor e.g works for mime types where no