Merge pull request #481 from sebastian-nagel/NUTCH-1559-dupl-metatags
NUTCH-1559 parse-metatags duplicates extracted metatags
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
index 51bf25c..8deaf18 100644
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -23,8 +23,6 @@
import java.util.Properties;
import java.util.Set;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
@@ -32,6 +30,8 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
/**
@@ -42,7 +42,7 @@
public class MetaTagsParser implements HtmlParseFilter {
private static final Logger LOG = LoggerFactory
- .getLogger(MethodHandles.lookup().lookupClass());
+ .getLogger(MethodHandles.lookup().lookupClass());
private Configuration conf;
@@ -71,7 +71,7 @@
String lcMetatag = metatag.toLowerCase(Locale.ROOT);
if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+ LOG.debug("Found meta tag: {}\t{}", lcMetatag, value);
}
metadata.add("metatag." + lcMetatag, value);
}
@@ -85,11 +85,12 @@
String[] values) {
String lcMetatag = metatag.toLowerCase(Locale.ROOT);
if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+ String key = "metatag." + lcMetatag;
for (String value : values) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+ LOG.debug("Found meta tag: {}\t{}", lcMetatag, value);
}
- metadata.add("metatag." + lcMetatag, value);
+ metadata.add(key, value);
}
}
}
@@ -100,11 +101,10 @@
Parse parse = parseResult.get(content.getUrl());
Metadata metadata = parse.getData().getParseMeta();
- // check in the metadata first : the tika-parser
- // might have stored the values there already
- for (String mdName : metadata.names()) {
- addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
- }
+ /*
+ * NUTCH-1559: do not extract meta values from ParseData's metadata to avoid
+ * duplicate metatag values
+ */
Metadata generalMetaTags = metaTags.getGeneralTags();
for (String tagName : generalMetaTags.names()) {
diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
index 3f03af9..5702c10 100644
--- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
+++ b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
@@ -16,6 +16,8 @@
*/
package org.apache.nutch.parse.metatags;
+import java.lang.invoke.MethodHandles;
+import java.util.Arrays;
import java.util.Set;
import java.util.TreeSet;
@@ -31,6 +33,8 @@
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class TestMetatagParser {
@@ -41,6 +45,9 @@
private String description = "This is a test of description";
private String keywords = "This is a test of keywords";
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
public Metadata parseMeta(String fileName, Configuration conf) {
Metadata metadata = null;
try {
@@ -100,4 +107,25 @@
}
}
+ @Test
+ public void testDuplicatedMetatags() {
+ String[] parsePlugins = { "parse-html", "parse-tika" };
+ // NUTCH-1559: run the check once per parser plugin listed above; neither may yield duplicated values
+ for (String parsePlugin : parsePlugins) {
+ // fresh configuration per iteration: parse-metatags plus only the parser plugin under test
+ Configuration conf = NutchConfiguration.create();
+ conf.set("plugin.includes",
+ "protocol-file|parse-metatags|" + parsePlugin);
+ conf.set("metatags.names", "keywords");
+ conf.set("index.parse.md", "metatag.keywords");
+ // parse the sample file; NOTE(review): parseMeta starts from a null result - confirm it cannot return null here
+ Metadata parseMeta = parseMeta(sampleFile, conf);
+ // log the extracted values for diagnosis, then require exactly one value for metatag.keywords
+ LOG.info("metatags ({}): {}", parsePlugin,
+ Arrays.toString(parseMeta.getValues("metatag.keywords")));
+ Assert.assertEquals(
+ "Test document contains a single value of <meta name=keywords>, metatag.keywords should be also single-valued",
+ 1, parseMeta.getValues("metatag.keywords").length);
+ }
+ }
}