NUTCH-1228 Change mapred.task.timeout to mapreduce.task.timeout in fetcher
- replace deprecated Hadoop property names (old MapReduce API) with their current equivalents in the crawl script, WebTableReader, fetcher, and indexing jobs
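
Old (deprecated) property names and the new names used after this patch:

  mapred.reduce.tasks                        -> mapreduce.job.reduces
  mapred.map.tasks                           -> mapreduce.job.maps
  mapred.map.tasks.speculative.execution     -> mapreduce.map.speculative
  mapred.reduce.tasks.speculative.execution  -> mapreduce.reduce.speculative
  mapred.compress.map.output                 -> mapreduce.map.output.compress
  mapred.skip.attempts.to.start.skipping     -> mapreduce.task.skip.start.attempts
  mapred.skip.map.max.skip.records           -> mapreduce.map.skip.maxrecords
  mapred.temp.dir                            -> mapreduce.cluster.temp.dir
  mapred.task.timeout                        -> mapreduce.task.timeout
  mapred.output.key.comparator.class         -> mapreduce.job.output.key.comparator.class

(mapred.child.java.opts in src/bin/crawl is left unchanged by this patch.)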
diff --git a/src/bin/crawl b/src/bin/crawl
index 1a31d7d..27db6de 100644
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -61,7 +61,7 @@
numSlaves=1
# and the total number of available tasks
-# sets Hadoop parameter "mapred.reduce.tasks"
+# sets Hadoop parameter "mapreduce.job.reduces"
numTasks=`expr $numSlaves \* 2`
# number of urls to fetch in one iteration
@@ -88,7 +88,7 @@
# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
-commonOptions="-D mapred.reduce.tasks=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"
+commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
@@ -161,7 +161,7 @@
echo "Parsing : "
# enable record skipping for the parse step so that a dodgy document
# does not fail the full task
- skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
+ skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
__bin_nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId "$CRAWL_ID"
# updatedb with this batch
diff --git a/src/java/org/apache/nutch/crawl/WebTableReader.java b/src/java/org/apache/nutch/crawl/WebTableReader.java
index 5985dd6..941ae9a 100644
--- a/src/java/org/apache/nutch/crawl/WebTableReader.java
+++ b/src/java/org/apache/nutch/crawl/WebTableReader.java
@@ -539,7 +539,7 @@
// for now handles only -stat
@Override
public Map<String, Object> run(Map<String, Object> args) throws Exception {
- Path tmpFolder = new Path(getConf().get("mapred.temp.dir", ".")
+ Path tmpFolder = new Path(getConf().get("mapreduce.cluster.temp.dir", ".")
+ "stat_tmp" + System.currentTimeMillis());
numJobs = 1;
diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java b/src/java/org/apache/nutch/fetcher/FetcherJob.java
index bd06121..82e7a12 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherJob.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java
@@ -214,7 +214,7 @@
StorageUtils.initReducerJob(currentJob, FetcherReducer.class);
if (numTasks == null || numTasks < 1) {
currentJob.setNumReduceTasks(currentJob.getConfiguration().getInt(
- "mapred.map.tasks", currentJob.getNumReduceTasks()));
+ "mapreduce.job.maps", currentJob.getNumReduceTasks()));
} else {
currentJob.setNumReduceTasks(numTasks);
}
@@ -247,7 +247,7 @@
* @param shouldResume
* @param numTasks
* number of fetching tasks (reducers). If set to < 1 then use the
- * default, which is mapred.map.tasks.
+ * default, which is mapreduce.job.maps.
* @return 0 on success
* @throws Exception
*/
@@ -267,7 +267,7 @@
* @param shouldResume
* @param numTasks
* number of fetching tasks (reducers). If set to < 1 then use the
- * default, which is mapred.map.tasks.
+ * default, which is mapreduce.job.maps.
* @param stmDetect
* If set true, sitemap detection is run.
* @param sitemap
@@ -326,7 +326,7 @@
+ " -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)\n"
+ " -threads N - number of fetching threads per task\n"
+ " -resume - resume interrupted job\n"
- + " -numTasks N - if N > 0 then use this many reduce tasks for fetching \n \t \t (default: mapred.map.tasks)"
+ + " -numTasks N - if N > 0 then use this many reduce tasks for fetching \n \t \t (default: mapreduce.job.maps)"
+ " -sitemap - only sitemap files are fetched, defaults to false"
+ " -stmDetect - sitemap files are detected from robot.txt file";
diff --git a/src/java/org/apache/nutch/fetcher/FetcherReducer.java b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
index 364bf7e..4f71954 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherReducer.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
@@ -842,7 +842,7 @@
ft.start();
}
// select a timeout that avoids a task timeout
- final long timeout = conf.getInt("mapred.task.timeout", 10 * 60 * 1000) / 2;
+ final long timeout = conf.getInt("mapreduce.task.timeout", 10 * 60 * 1000) / 2;
// Used for threshold check, holds pages and bytes processed in the last sec
float pagesLastSec;
diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java
index eaa8420..4869efa 100644
--- a/src/java/org/apache/nutch/indexer/CleaningJob.java
+++ b/src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -138,7 +138,7 @@
getConf().setBoolean(ARG_COMMIT, (Boolean) args.get(ARG_COMMIT));
currentJob = NutchJob.getInstance(getConf(), "CleaningJob");
currentJob.getConfiguration().setClass(
- "mapred.output.key.comparator.class", StringComparator.class,
+ "mapreduce.job.output.key.comparator.class", StringComparator.class,
RawComparator.class);
Collection<WebPage.Field> fields = getFields(currentJob);
diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java
index f98d40d..ec04384 100644
--- a/src/java/org/apache/nutch/indexer/IndexingJob.java
+++ b/src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -145,7 +145,7 @@
Job job = NutchJob.getInstance(conf, "Indexer");
// TODO: Figure out why this needs to be here
- job.getConfiguration().setClass("mapred.output.key.comparator.class",
+ job.getConfiguration().setClass("mapreduce.job.output.key.comparator.class",
StringComparator.class, RawComparator.class);
Collection<WebPage.Field> fields = getFields(job);