Merge pull request #532 from pmezard/NUTCH-2790
NUTCH-2790 indexer-csv: escape field leading quote character
diff --git a/src/bin/crawl b/src/bin/crawl
index 9b77ce4..23a2940 100755
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -370,10 +370,19 @@
echo "CrawlDB update"
__bin_nutch updatedb "${commonOptions[@]}" "$CRAWL_PATH"/crawldb "$CRAWL_PATH"/segments/$SEGMENT
- # note that the link inversion - indexing routine can be done within the main loop
- # on a per segment basis
+ echo "HostDB update"
+ if $HOSTDBUPDATE; then
+ __update_hostdb
+ fi
+
+ # Note that all steps below in this loop (link inversion, deduplication, indexing)
+ # can be done either
+ # - inside the loop, on a per-segment basis, or
+ # - after the loop, over all segments created in all loop iterations
+ #   (both invertlinks and index accept multiple segments as input).
+ # The latter is more efficient, but the index is then updated later.
echo "Link inversion"
- __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT
+ __bin_nutch invertlinks "${commonOptions[@]}" "$CRAWL_PATH"/linkdb "$CRAWL_PATH"/segments/$SEGMENT -noNormalize -noFilter
echo "Dedup on crawldb"
__bin_nutch dedup "${commonOptions[@]}" "$CRAWL_PATH"/crawldb
@@ -385,30 +394,25 @@
echo "Skipping indexing ..."
fi
- echo "HostDB update"
- if $HOSTDBUPDATE; then
- __update_hostdb
- fi
-
- #######################################################
- # The following commands fall into WebGraph territory
- # and should be uncommented based on your requirements
- #######################################################
- #echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
- #__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
-
- #echo "Running Loops Job on WebGraph within $CRAWL_PATH"
- #__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
-
- #echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
- #__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
-
- #echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
- #__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
-
- #echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
- #__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
-
done
+#######################################################
+# The following commands fall into WebGraph territory
+# and should be uncommented based on your requirements
+#######################################################
+#echo "Building WebGraph within $CRAWL_PATH on all segments in $CRAWL_PATH/segments/"
+#__bin_nutch webgraph "${commonOptions[@]}" -filter -normalize -segmentDir "$CRAWL_PATH"/segments/ -webgraphdb "$CRAWL_PATH"
+
+#echo "Running Loops Job on WebGraph within $CRAWL_PATH"
+#__bin_nutch org.apache.nutch.scoring.webgraph.Loops "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
+
+#echo "Running LinkRank Algorithm on WebGraph within $CRAWL_PATH"
+#__bin_nutch linkrank "${commonOptions[@]}" -webgraphdb "$CRAWL_PATH"
+
+#echo "Running ScoreUpdater Job with $CRAWL_PATH/crawldb and WebGraph within $CRAWL_PATH"
+#__bin_nutch scoreupdater "${commonOptions[@]}" -crawldb "$CRAWL_PATH"/crawldb -webgraphdb "$CRAWL_PATH"
+
+#echo "Running NodeDumper on WebGraph within $CRAWL_PATH and dumping output to $CRAWL_PATH/dump/scores"
+#__bin_nutch nodedumper "${commonOptions[@]}" -scores -topn 1000 -webgraphdb "$CRAWL_PATH" -output "$CRAWL_PATH"/dump/scores
+
exit 0
diff --git a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
index 3e9bc15..42093b7 100644
--- a/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
+++ b/src/java/org/apache/nutch/indexer/IndexerMapReduce.java
@@ -19,6 +19,8 @@
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.Collection;
+import java.util.Locale;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.commons.codec.binary.Base64;
@@ -274,11 +276,11 @@
// Handle robots meta? https://issues.apache.org/jira/browse/NUTCH-1434
if (deleteRobotsNoIndex) {
// Get the robots meta data
- String robotsMeta = parseData.getMeta("robots");
+ String robotsMeta = parseData.getMeta(Nutch.ROBOTS_METATAG);
// Has it a noindex for this url?
- if (robotsMeta != null
- && robotsMeta.toLowerCase().indexOf("noindex") != -1) {
+ if (robotsMeta != null && robotsMeta.toLowerCase(Locale.ROOT)
+ .indexOf("noindex") != -1) {
// Delete it!
context.write(key, DELETE_ACTION);
context.getCounter("IndexerStatus", "deleted (robots=noindex)").increment(1);
diff --git a/src/java/org/apache/nutch/metadata/Nutch.java b/src/java/org/apache/nutch/metadata/Nutch.java
index d28808d..0cfb263 100644
--- a/src/java/org/apache/nutch/metadata/Nutch.java
+++ b/src/java/org/apache/nutch/metadata/Nutch.java
@@ -52,6 +52,12 @@
public static final String FETCH_STATUS_KEY = "_fst_";
+ /**
+ * Key under which the <a href="https://www.robotstxt.org/meta.html">robots
+ * metatag</a> is stored in {@link org.apache.nutch.parse.ParseData}'s metadata.
+ */
+ public static final String ROBOTS_METATAG = "robots";
+
/**
* Sites may request that search engines don't provide access to cached
* documents.
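
The shared constant ties the two sides together: the parser plugins write the
metatag into ParseData, and the indexer reads it back under the same key. A hedged
helper sketch, assuming only the existing ParseData.getMeta(String) API used above
(the helper class and method names are illustrative, not part of the patch):

    import java.util.Locale;

    import org.apache.nutch.metadata.Nutch;
    import org.apache.nutch.parse.ParseData;

    public class RobotsDirectives {
      /** Mirrors the IndexerMapReduce check: true if the page requested "noindex". */
      public static boolean isNoIndex(ParseData parseData) {
        String robotsMeta = parseData.getMeta(Nutch.ROBOTS_METATAG);
        return robotsMeta != null
            && robotsMeta.toLowerCase(Locale.ROOT).contains("noindex");
      }
    }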
diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
index 4e7ef14..d655a96 100644
--- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
+++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HTMLMetaProcessor.java
@@ -18,6 +18,7 @@
import java.net.URL;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
@@ -79,7 +80,7 @@
if (contentNode != null) {
String name = nameNode.getNodeValue().toLowerCase();
metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
- if ("robots".equals(name)) {
+ if (Nutch.ROBOTS_METATAG.equals(name)) {
String directives = contentNode.getNodeValue().toLowerCase();
int index = directives.indexOf("none");
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
index 58f93ac..8584df7 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/HTMLMetaProcessor.java
@@ -18,7 +18,9 @@
import java.net.MalformedURLException;
import java.net.URL;
+import java.util.Locale;
+import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.HTMLMetaTags;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
@@ -66,7 +68,7 @@
// Retrieves name, http-equiv and content attributes
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
- String attrName = attr.getNodeName().toLowerCase();
+ String attrName = attr.getNodeName().toLowerCase(Locale.ROOT);
if (attrName.equals("name")) {
nameNode = attr;
} else if (attrName.equals("http-equiv")) {
@@ -78,10 +80,11 @@
if (nameNode != null) {
if (contentNode != null) {
- String name = nameNode.getNodeValue().toLowerCase();
+ String name = nameNode.getNodeValue().toLowerCase(Locale.ROOT);
metaTags.getGeneralTags().add(name, contentNode.getNodeValue());
- if ("robots".equals(name)) {
- String directives = contentNode.getNodeValue().toLowerCase();
+ if (Nutch.ROBOTS_METATAG.equals(name)) {
+ String directives = contentNode.getNodeValue()
+ .toLowerCase(Locale.ROOT);
int index = directives.indexOf("none");
if (index >= 0) {
@@ -112,12 +115,14 @@
} // end if (name == robots)
// meta names added/transformed by Tika
else if (name.equals("pragma")) {
- String content = contentNode.getNodeValue().toLowerCase();
+ String content = contentNode.getNodeValue()
+ .toLowerCase(Locale.ROOT);
if (content.contains("no-cache")) {
metaTags.setNoCache();
}
} else if (name.equals("refresh")) {
- String content = contentNode.getNodeValue().toLowerCase();
+ String content = contentNode.getNodeValue()
+ .toLowerCase(Locale.ROOT);
setRefresh(metaTags, content, currURL);
} else if (name.equals("content-location")) {
String urlString = contentNode.getNodeValue();
@@ -138,11 +143,11 @@
if (equivNode != null) {
if (contentNode != null) {
- String name = equivNode.getNodeValue().toLowerCase();
+ String name = equivNode.getNodeValue().toLowerCase(Locale.ROOT);
String content = contentNode.getNodeValue();
metaTags.getHttpEquivTags().setProperty(name, content);
if ("pragma".equals(name)) {
- content = content.toLowerCase();
+ content = content.toLowerCase(Locale.ROOT);
int index = content.indexOf("no-cache");
if (index >= 0)
metaTags.setNoCache();
@@ -203,7 +208,7 @@
}
URL refreshUrl = null;
if (metaTags.getRefresh() && idx != -1) { // set the URL
- idx = content.toLowerCase().indexOf("url=");
+ idx = content.toLowerCase(Locale.ROOT).indexOf("url=");
if (idx == -1) { // assume a mis-formatted entry with just the
// url
idx = content.indexOf(';') + 1;
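
The same locale-neutral lowercasing also matters for the refresh metatag: the
"url=" marker has to be found case-insensitively in content such as
"3; URL=http://example.com/". A rough standalone sketch (a simplification, not the
plugin's actual parsing, which additionally falls back to splitting on ';'):

    import java.util.Locale;

    public class RefreshUrlDemo {
      public static void main(String[] args) {
        String content = "3; URL=http://example.com/";
        int idx = content.toLowerCase(Locale.ROOT).indexOf("url=");
        // take everything after "url=" as the refresh target
        String target = idx != -1 ? content.substring(idx + "url=".length()) : null;
        System.out.println(target); // http://example.com/
      }
    }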
diff --git a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
index f2461fe..d97e8b4 100644
--- a/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
+++ b/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java
@@ -218,8 +218,14 @@
if (tikaMDName.equalsIgnoreCase(Metadata.TITLE))
continue;
String[] values = tikamd.getValues(tikaMDName);
- for (String v : values)
+ for (String v : values) {
nutchMetadata.add(tikaMDName, v);
+ if (tikaMDName.equalsIgnoreCase(Nutch.ROBOTS_METATAG)
+ && nutchMetadata.get(Nutch.ROBOTS_METATAG) == null) {
+ // NUTCH-2720: also store the robots metatag under its lowercase name
+ nutchMetadata.add(Nutch.ROBOTS_METATAG, v);
+ }
+ }
}
// no outlinks? try OutlinkExtractor e.g. works for mime types where no
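
Tika may report the metatag under a differently-cased name such as "Robots", and
Nutch's Metadata keys are case-sensitive, so the loop above additionally files the
value under the lowercase key that IndexerMapReduce queries. A hedged sketch of the
resulting behaviour (assuming the case-sensitive org.apache.nutch.metadata.Metadata
API; the literal values are made up):

    import org.apache.nutch.metadata.Metadata;
    import org.apache.nutch.metadata.Nutch;

    public class RobotsMetadataDemo {
      public static void main(String[] args) {
        Metadata nutchMetadata = new Metadata();
        String tikaMDName = "Robots";                 // name as Tika might report it
        String v = "noindex,nofollow";
        nutchMetadata.add(tikaMDName, v);             // stored under "Robots"
        if (nutchMetadata.get(Nutch.ROBOTS_METATAG) == null) {
          nutchMetadata.add(Nutch.ROBOTS_METATAG, v); // also stored under "robots"
        }
        System.out.println(nutchMetadata.get(Nutch.ROBOTS_METATAG)); // noindex,nofollow
      }
    }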