Merge pull request #483 from sju/NUTCH-2750

Fix for NUTCH-2750: Improve CrawlDbReader & LinkDbReader reader handling
- re-open readers only if the CrawlDb/LinkDb has changed (do not reopen them for every query/URL)
diff --git a/src/java/org/apache/nutch/tools/Benchmark.java b/src/java/org/apache/nutch/tools/Benchmark.java
index df57e9c..203496b 100644
--- a/src/java/org/apache/nutch/tools/Benchmark.java
+++ b/src/java/org/apache/nutch/tools/Benchmark.java
@@ -17,6 +17,7 @@
 package org.apache.nutch.tools;
 
 import java.io.OutputStream;
+import java.lang.invoke.MethodHandles;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
 import java.util.Date;
@@ -24,8 +25,6 @@
 import java.util.List;
 import java.util.Map;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.fs.FileSystem;
@@ -42,9 +41,13 @@
 import org.apache.nutch.parse.ParseSegment;
 import org.apache.nutch.util.NutchConfiguration;
 import org.apache.nutch.util.NutchJob;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class Benchmark extends Configured implements Tool {
-  private static final Log LOG = LogFactory.getLog(Benchmark.class);
+  
+  private static final Logger LOG = LoggerFactory
+	      .getLogger(MethodHandles.lookup().lookupClass());
 
   public static void main(String[] args) throws Exception {
     Configuration conf = NutchConfiguration.create();
@@ -170,7 +173,7 @@
       } else if (args[i].equalsIgnoreCase("-maxPerHost")) {
         maxPerHost = Integer.parseInt(args[++i]);
       } else {
-        LOG.fatal("Invalid argument: '" + args[i] + "'");
+        LOG.error("Invalid argument: '" + args[i] + "'");
         return -1;
       }
     }
diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
index 4592de9..503310a 100644
--- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
+++ b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
@@ -16,6 +16,7 @@
  */
 package org.apache.nutch.indexer.replace;
 
+import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.LinkedHashMap;
@@ -25,8 +26,9 @@
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.io.Text;
 import org.apache.nutch.crawl.CrawlDatum;
@@ -83,8 +85,8 @@
  */
 public class ReplaceIndexer implements IndexingFilter {
 
-  private static final Log LOG = LogFactory.getLog(ReplaceIndexer.class
-      .getName());
+  private static final Logger LOG = LoggerFactory
+		      .getLogger(MethodHandles.lookup().lookupClass());
 
   /** Special field name signifying the start of a host-specific match set */
   private static final String HOSTMATCH = "hostmatch";
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
index 96c56fc..8deaf18 100644
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -16,14 +16,13 @@
  */
 package org.apache.nutch.parse.metatags;
 
+import java.lang.invoke.MethodHandles;
 import java.util.Enumeration;
 import java.util.HashSet;
 import java.util.Locale;
 import java.util.Properties;
 import java.util.Set;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.parse.HTMLMetaTags;
@@ -31,6 +30,8 @@
 import org.apache.nutch.parse.Parse;
 import org.apache.nutch.parse.ParseResult;
 import org.apache.nutch.protocol.Content;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.w3c.dom.DocumentFragment;
 
 /**
@@ -40,8 +41,8 @@
  */
 public class MetaTagsParser implements HtmlParseFilter {
 
-  private static final Log LOG = LogFactory.getLog(MetaTagsParser.class
-      .getName());
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
 
   private Configuration conf;
 
@@ -70,7 +71,7 @@
     String lcMetatag = metatag.toLowerCase(Locale.ROOT);
     if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
       if (LOG.isDebugEnabled()) {
-        LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+        LOG.debug("Found meta tag: {}\t{}", lcMetatag, value);
       }
       metadata.add("metatag." + lcMetatag, value);
     }
@@ -84,11 +85,12 @@
       String[] values) {
     String lcMetatag = metatag.toLowerCase(Locale.ROOT);
     if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+      String key = "metatag." + lcMetatag;
       for (String value : values) {
         if (LOG.isDebugEnabled()) {
-          LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+          LOG.debug("Found meta tag: {}\t{}", lcMetatag, value);
         }
-        metadata.add("metatag." + lcMetatag, value);
+        metadata.add(key, value);
       }
     }
   }
@@ -99,11 +101,10 @@
     Parse parse = parseResult.get(content.getUrl());
     Metadata metadata = parse.getData().getParseMeta();
 
-    // check in the metadata first : the tika-parser
-    // might have stored the values there already
-    for (String mdName : metadata.names()) {
-      addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
-    }
+    /*
+     * NUTCH-1559: do not extract meta values from ParseData's metadata to avoid
+     * duplicate metatag values
+     */
 
     Metadata generalMetaTags = metaTags.getGeneralTags();
     for (String tagName : generalMetaTags.names()) {
diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
index 3f03af9..5702c10 100644
--- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
+++ b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
@@ -16,6 +16,8 @@
  */
 package org.apache.nutch.parse.metatags;
 
+import java.lang.invoke.MethodHandles;
+import java.util.Arrays;
 import java.util.Set;
 import java.util.TreeSet;
 
@@ -31,6 +33,8 @@
 import org.apache.nutch.util.NutchConfiguration;
 import org.junit.Assert;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 public class TestMetatagParser {
 
@@ -41,6 +45,9 @@
   private String description = "This is a test of description";
   private String keywords = "This is a test of keywords";
 
+  private static final Logger LOG = LoggerFactory
+      .getLogger(MethodHandles.lookup().lookupClass());
+
   public Metadata parseMeta(String fileName, Configuration conf) {
     Metadata metadata = null;
     try {
@@ -100,4 +107,25 @@
     }
   }
 
+  @Test
+  public void testDuplicatedMetatags() {
+    String[] parsePlugins = { "parse-html", "parse-tika" };
+
+    for (String parsePlugin : parsePlugins) {
+
+      Configuration conf = NutchConfiguration.create();
+      conf.set("plugin.includes",
+          "protocol-file|parse-metatags|" + parsePlugin);
+      conf.set("metatags.names", "keywords");
+      conf.set("index.parse.md", "metatag.keywords");
+
+      Metadata parseMeta = parseMeta(sampleFile, conf);
+
+      LOG.info("metatags ({}): {}", parsePlugin,
+          Arrays.toString(parseMeta.getValues("metatag.keywords")));
+      Assert.assertEquals(
+          "Test document contains a single value of <meta name=keywords>, metatag.keywords should be also single-valued",
+          1, parseMeta.getValues("metatag.keywords").length);
+    }
+  }
 }
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
index 8b6108d..23e8ddb 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
@@ -16,43 +16,46 @@
  */
 package org.apache.nutch.parse.tika;
 
+import java.lang.invoke.MethodHandles;
 import java.util.HashMap;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 import de.l3s.boilerpipe.BoilerpipeExtractor;
 
 class BoilerpipeExtractorRepository {
 
-    public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class);
-    public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<>();
+  private static final Logger LOG = LoggerFactory
+		      .getLogger(MethodHandles.lookup().lookupClass());
+  public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<>();
  
     /**
      * Returns an instance of the specified extractor
      */
-    public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) {
+  public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) {
       // Check if there's no instance of this extractor
-      if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
+    if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
         // FQCN
-        boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName;
+      boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName;
 
         // Attempt to load the class
-        try {
-          ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
-          Class extractorClass = loader.loadClass(boilerpipeExtractorName);
+      try {
+        ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+        Class extractorClass = loader.loadClass(boilerpipeExtractorName);
 
           // Add an instance to the repository
-          extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.getConstructor().newInstance());
+        extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.getConstructor().newInstance());
 
-        } catch (ClassNotFoundException e) {
-          LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!");
-        } catch (InstantiationException e) {
-          LOG.error("Could not instantiate " + boilerpipeExtractorName);
-        } catch (Exception e) {
-          LOG.error(e);
-        }
+      } catch (ClassNotFoundException e) {
+        LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!");
+      } catch (InstantiationException e) {
+        LOG.error("Could not instantiate " + boilerpipeExtractorName);
+      } catch (Exception e) {
+        LOG.error(e.getLocalizedMessage());
       }
-
-      return extractorRepository.get(boilerpipeExtractorName);
     }
 
+    return extractorRepository.get(boilerpipeExtractorName);
+  }
+
 }
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
index 0d32e19..29b119b 100644
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
+++ b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
@@ -16,13 +16,14 @@
  */
 package org.apache.nutch.scoring.depth;
 
+import java.lang.invoke.MethodHandles;
 import java.util.Collection;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map.Entry;
 
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.conf.Configured;
 import org.apache.hadoop.io.IntWritable;
@@ -43,7 +44,9 @@
  * effectively stopping further crawling along this path.
  */
 public class DepthScoringFilter extends Configured implements ScoringFilter {
-  private static final Log LOG = LogFactory.getLog(DepthScoringFilter.class);
+
+  private static final Logger LOG = LoggerFactory
+		      .getLogger(MethodHandles.lookup().lookupClass());
 
   public static final String DEPTH_KEY = "_depth_";
   public static final Text DEPTH_KEY_W = new Text(DEPTH_KEY);