Merge pull request #483 from sju/NUTCH-2750
Fix for NUTCH-2750 Improve CrawlDbReader & LinkDbReader reader handling
- re-opens readers only if CrawlDb/LinkDb has changed (do not reopen for every query/URL)
diff --git a/src/java/org/apache/nutch/tools/Benchmark.java b/src/java/org/apache/nutch/tools/Benchmark.java
index df57e9c..203496b 100644
--- a/src/java/org/apache/nutch/tools/Benchmark.java
+++ b/src/java/org/apache/nutch/tools/Benchmark.java
@@ -17,6 +17,7 @@
package org.apache.nutch.tools;
import java.io.OutputStream;
+import java.lang.invoke.MethodHandles;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
@@ -24,8 +25,6 @@
import java.util.List;
import java.util.Map;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
@@ -42,9 +41,13 @@
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class Benchmark extends Configured implements Tool {
- private static final Log LOG = LogFactory.getLog(Benchmark.class);
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
public static void main(String[] args) throws Exception {
Configuration conf = NutchConfiguration.create();
@@ -170,7 +173,7 @@
} else if (args[i].equalsIgnoreCase("-maxPerHost")) {
maxPerHost = Integer.parseInt(args[++i]);
} else {
- LOG.fatal("Invalid argument: '" + args[i] + "'");
+ LOG.error("Invalid argument: '" + args[i] + "'");
return -1;
}
}
diff --git a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
index 4592de9..503310a 100644
--- a/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
+++ b/src/plugin/index-replace/src/java/org/apache/nutch/indexer/replace/ReplaceIndexer.java
@@ -16,6 +16,7 @@
*/
package org.apache.nutch.indexer.replace;
+import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedHashMap;
@@ -25,8 +26,9 @@
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
@@ -83,8 +85,8 @@
*/
public class ReplaceIndexer implements IndexingFilter {
- private static final Log LOG = LogFactory.getLog(ReplaceIndexer.class
- .getName());
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
/** Special field name signifying the start of a host-specific match set */
private static final String HOSTMATCH = "hostmatch";
diff --git a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
index 96c56fc..8deaf18 100644
--- a/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
+++ b/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/metatags/MetaTagsParser.java
@@ -16,14 +16,13 @@
*/
package org.apache.nutch.parse.metatags;
+import java.lang.invoke.MethodHandles;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
@@ -31,6 +30,8 @@
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.w3c.dom.DocumentFragment;
/**
@@ -40,8 +41,8 @@
*/
public class MetaTagsParser implements HtmlParseFilter {
- private static final Log LOG = LogFactory.getLog(MetaTagsParser.class
- .getName());
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
private Configuration conf;
@@ -70,7 +71,7 @@
String lcMetatag = metatag.toLowerCase(Locale.ROOT);
if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+ LOG.debug("Found meta tag: {}\t{}", lcMetatag, value);
}
metadata.add("metatag." + lcMetatag, value);
}
@@ -84,11 +85,12 @@
String[] values) {
String lcMetatag = metatag.toLowerCase(Locale.ROOT);
if (metatagset.contains("*") || metatagset.contains(lcMetatag)) {
+ String key = "metatag." + lcMetatag;
for (String value : values) {
if (LOG.isDebugEnabled()) {
- LOG.debug("Found meta tag: " + lcMetatag + "\t" + value);
+ LOG.debug("Found meta tag: {}\t{}", lcMetatag, value);
}
- metadata.add("metatag." + lcMetatag, value);
+ metadata.add(key, value);
}
}
}
@@ -99,11 +101,10 @@
Parse parse = parseResult.get(content.getUrl());
Metadata metadata = parse.getData().getParseMeta();
- // check in the metadata first : the tika-parser
- // might have stored the values there already
- for (String mdName : metadata.names()) {
- addIndexedMetatags(metadata, mdName, metadata.getValues(mdName));
- }
+ /*
+ * NUTCH-1559: do not extract meta values from ParseData's metadata to avoid
+ * duplicate metatag values
+ */
Metadata generalMetaTags = metaTags.getGeneralTags();
for (String tagName : generalMetaTags.names()) {
diff --git a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
index 3f03af9..5702c10 100644
--- a/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
+++ b/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/metatags/TestMetatagParser.java
@@ -16,6 +16,8 @@
*/
package org.apache.nutch.parse.metatags;
+import java.lang.invoke.MethodHandles;
+import java.util.Arrays;
import java.util.Set;
import java.util.TreeSet;
@@ -31,6 +33,8 @@
import org.apache.nutch.util.NutchConfiguration;
import org.junit.Assert;
import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
public class TestMetatagParser {
@@ -41,6 +45,9 @@
private String description = "This is a test of description";
private String keywords = "This is a test of keywords";
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+
public Metadata parseMeta(String fileName, Configuration conf) {
Metadata metadata = null;
try {
@@ -100,4 +107,25 @@
}
}
+ @Test
+ public void testDuplicatedMetatags() {
+ String[] parsePlugins = { "parse-html", "parse-tika" };
+ // Repeat the same check once for each parser plugin listed above.
+ for (String parsePlugin : parsePlugins) {
+ // Fresh configuration per iteration; only "keywords" is requested as a metatag.
+ Configuration conf = NutchConfiguration.create();
+ conf.set("plugin.includes",
+ "protocol-file|parse-metatags|" + parsePlugin);
+ conf.set("metatags.names", "keywords");
+ conf.set("index.parse.md", "metatag.keywords");
+ // Parse sampleFile with this configuration.
+ Metadata parseMeta = parseMeta(sampleFile, conf);
+ // Log the extracted values, then require exactly one metatag.keywords value.
+ LOG.info("metatags ({}): {}", parsePlugin,
+ Arrays.toString(parseMeta.getValues("metatag.keywords")));
+ Assert.assertEquals(
+ "Test document contains a single value of <meta name=keywords>, metatag.keywords should be also single-valued",
+ 1, parseMeta.getValues("metatag.keywords").length);
+ }
+ }
}
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
index 8b6108d..23e8ddb 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/BoilerpipeExtractorRepository.java
@@ -16,43 +16,46 @@
*/
package org.apache.nutch.parse.tika;
+import java.lang.invoke.MethodHandles;
import java.util.HashMap;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import de.l3s.boilerpipe.BoilerpipeExtractor;
class BoilerpipeExtractorRepository {
- public static final Log LOG = LogFactory.getLog(BoilerpipeExtractorRepository.class);
- public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<>();
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
+ public static final HashMap<String, BoilerpipeExtractor> extractorRepository = new HashMap<>();
/**
* Returns an instance of the specified extractor
*/
- public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) {
+ public static synchronized BoilerpipeExtractor getExtractor(String boilerpipeExtractorName) {
// Check if there's no instance of this extractor
- if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
+ if (!extractorRepository.containsKey(boilerpipeExtractorName)) {
// FQCN
- boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName;
+ boilerpipeExtractorName = "de.l3s.boilerpipe.extractors." + boilerpipeExtractorName;
// Attempt to load the class
- try {
- ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
- Class extractorClass = loader.loadClass(boilerpipeExtractorName);
+ try {
+ ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
+ Class extractorClass = loader.loadClass(boilerpipeExtractorName);
// Add an instance to the repository
- extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.getConstructor().newInstance());
+ extractorRepository.put(boilerpipeExtractorName, (BoilerpipeExtractor)extractorClass.getConstructor().newInstance());
- } catch (ClassNotFoundException e) {
- LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!");
- } catch (InstantiationException e) {
- LOG.error("Could not instantiate " + boilerpipeExtractorName);
- } catch (Exception e) {
- LOG.error(e);
- }
+ } catch (ClassNotFoundException e) {
+ LOG.error("BoilerpipeExtractor " + boilerpipeExtractorName + " not found!");
+ } catch (InstantiationException e) {
+ LOG.error("Could not instantiate " + boilerpipeExtractorName);
+ } catch (Exception e) {
+ LOG.error("Failed to load BoilerpipeExtractor " + boilerpipeExtractorName, e);
}
-
- return extractorRepository.get(boilerpipeExtractorName);
}
+ return extractorRepository.get(boilerpipeExtractorName);
+ }
+
}
diff --git a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
index 0d32e19..29b119b 100644
--- a/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
+++ b/src/plugin/scoring-depth/src/java/org/apache/nutch/scoring/depth/DepthScoringFilter.java
@@ -16,13 +16,14 @@
*/
package org.apache.nutch.scoring.depth;
+import java.lang.invoke.MethodHandles;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
-import org.apache.commons.logging.Log;
-import org.apache.commons.logging.LogFactory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.IntWritable;
@@ -43,7 +44,9 @@
* effectively stopping further crawling along this path.
*/
public class DepthScoringFilter extends Configured implements ScoringFilter {
- private static final Log LOG = LogFactory.getLog(DepthScoringFilter.class);
+
+ private static final Logger LOG = LoggerFactory
+ .getLogger(MethodHandles.lookup().lookupClass());
public static final String DEPTH_KEY = "_depth_";
public static final Text DEPTH_KEY_W = new Text(DEPTH_KEY);