Merge pull request #481 from sebastian-nagel/NUTCH-1559-dupl-metatags

NUTCH-1559 parse-metatags duplicates extracted metatags
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
index 51bf25c..8deaf18 100644
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -23,8 +23,6 @@
 import java.util.Properties;
 import java.util.Set;
 
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.HTMLMetaTags;
@@ -32,6 +30,8 @@
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.protocol.Content;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
 
 /**
@@ -42,7 +42,7 @@
 public class MetaTagsParser implements HtmlParseFilter {
 
   private static final Logger LOG = LoggerFactory
-		      .getLogger(MethodHandles.lookup().lookupClass());
+      .getLogger(MethodHandles.lookup().lookupClass());
 
   private Configuration conf;
 
@@ -71,7 +71,7 @@
     String lcMetatag = metatag.toLowerCase(Locale.ROOT);
     if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
       if (LOG.isDebugEnabled()) {
-        LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+        LOG.debug("Found meta tag: {}\t{}", lcMetatag, value);
       }
       metadata.add("metatag." + lcMetatag, value);
     }
@@ -85,11 +85,12 @@
       String[] values) {
     String lcMetatag = metatag.toLowerCase(Locale.ROOT);
     if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+      String key = "metatag." + lcMetatag;
       for (String value : values) {
         if (LOG.isDebugEnabled()) {
-          LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+          LOG.debug("Found meta tag: {}\t{}", lcMetatag, value);
         }
-        metadata.add("metatag." + lcMetatag, value);
+        metadata.add(key, value);
       }
     }
   }
@@ -100,11 +101,10 @@
     Parse parse = parseResult.get(content.getUrl());
     Metadata metadata = parse.getData().getParseMeta();
 
-    // check in the metadata first : the tika-parser
-    // might have stored the values there already
-    for (String mdName : metadata.names()) {
-      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
-    }
+    /*
+     * NUTCH-1559: do not extract meta values from ParseData's metadata to avoid
+     * duplicate metatag values
+     */
 
     Metadata generalMetaTags = metaTags.getGeneralTags();
     for (String tagName : generalMetaTags.names()) {
diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
index 3f03af9..5702c10 100644
--- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
+++ b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
@@ -16,6 +16,8 @@
  */
 package org.apache.nutch.parse.metatags;
 
+import java.lang.invoke.MethodHandles;
+import java.util.Arrays;
 import java.util.Set;
 import java.util.TreeSet;
 
@@ -31,6 +33,8 @@
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Assert;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class TestMetatagParser {
 
@@ -41,6 +45,9 @@
   private String description = "This is a test of description";
   private String keywords = "This is a test of keywords";
 
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
   public Metadata parseMeta(String fileName, Configuration conf) {
     Metadata metadata = null;
     try {
@@ -100,4 +107,25 @@
     }
   }
 
+  /** For every listed parser plugin, metatag.keywords must hold exactly one value (NUTCH-1559). */
+  @Test
+  public void testDuplicatedMetatags() {
+    final String metatagKey = "metatag.keywords";
+    for (String parsePlugin : new String[] { "parse-html", "parse-tika" }) {
+      // Build a fresh configuration that pairs parse-metatags with this parser.
+      Configuration conf = NutchConfiguration.create();
+      conf.set("plugin.includes",
+          "protocol-file|parse-metatags|" + parsePlugin);
+      conf.set("metatags.names", "keywords");
+      conf.set("index.parse.md", metatagKey);
+
+      // Fetch the values once; they are both logged and asserted on.
+      String[] keywordValues = parseMeta(sampleFile, conf).getValues(metatagKey);
+      LOG.info("metatags ({}): {}", parsePlugin,
+          Arrays.toString(keywordValues));
+      Assert.assertEquals(
+          "Test document contains a single value of <meta name=keywords>, metatag.keywords should be also single-valued",
+          1, keywordValues.length);
+    }
+  }
 }