NUTCH-1228 Change mapred.task.timeout to mapreduce.task.timeout in fetcher
- replace deprecated Hadoop property names (old MapReduce API) with their current equivalents in the crawl script, WebTableReader, fetcher, and indexing jobs
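
Old (deprecated) property names and the new names used after this patch:

  mapred.reduce.tasks                        -> mapreduce.job.reduces
  mapred.map.tasks                           -> mapreduce.job.maps
  mapred.map.tasks.speculative.execution     -> mapreduce.map.speculative
  mapred.reduce.tasks.speculative.execution  -> mapreduce.reduce.speculative
  mapred.compress.map.output                 -> mapreduce.map.output.compress
  mapred.skip.attempts.to.start.skipping     -> mapreduce.task.skip.start.attempts
  mapred.skip.map.max.skip.records           -> mapreduce.map.skip.maxrecords
  mapred.temp.dir                            -> mapreduce.cluster.temp.dir
  mapred.task.timeout                        -> mapreduce.task.timeout
  mapred.output.key.comparator.class         -> mapreduce.job.output.key.comparator.class

(mapred.child.java.opts in src/bin/crawl is left unchanged by this patch.)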
diff --git a/src/bin/crawl b/src/bin/crawl
index 1a31d7d..27db6de 100644
--- a/src/bin/crawl
+++ b/src/bin/crawl
@@ -61,7 +61,7 @@
numSlaves=1
# and the total number of available tasks
-# sets Hadoop parameter "mapred.reduce.tasks"
+# sets Hadoop parameter "mapreduce.job.reduces"
numTasks=`expr $numSlaves \* 2`
# number of urls to fetch in one iteration
@@ -88,7 +88,7 @@
# note that some of the options listed here could be set in the
# corresponding hadoop site xml param file
-commonOptions="-D mapred.reduce.tasks=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapred.reduce.tasks.speculative.execution=false -D mapred.map.tasks.speculative.execution=false -D mapred.compress.map.output=true"
+commonOptions="-D mapreduce.job.reduces=$numTasks -D mapred.child.java.opts=-Xmx1000m -D mapreduce.reduce.speculative=false -D mapreduce.map.speculative=false -D mapreduce.map.output.compress=true"
# check that hadoop can be found on the path
if [ $mode = "distributed" ]; then
@@ -161,7 +161,7 @@
echo "Parsing : "
# enable record skipping for the parse step so that a dodgy document
# does not fail the full task
- skipRecordsOptions="-D mapred.skip.attempts.to.start.skipping=2 -D mapred.skip.map.max.skip.records=1"
+ skipRecordsOptions="-D mapreduce.task.skip.start.attempts=2 -D mapreduce.map.skip.maxrecords=1"
__bin_nutch parse $commonOptions $skipRecordsOptions $batchId -crawlId "$CRAWL_ID"
# updatedb with this batch
diff --git a/src/java/org/apache/nutch/crawl/WebTableReader.java b/src/java/org/apache/nutch/crawl/WebTableReader.java
index 5985dd6..941ae9a 100644
--- a/src/java/org/apache/nutch/crawl/WebTableReader.java
+++ b/src/java/org/apache/nutch/crawl/WebTableReader.java
@@ -539,7 +539,7 @@
// for now handles only -stat
@Override
public Map<String, Object> run(Map<String, Object> args) throws Exception {
- Path tmpFolder = new Path(getConf().get("mapred.temp.dir", ".")
+ Path tmpFolder = new Path(getConf().get("mapreduce.cluster.temp.dir", ".")
+ "stat_tmp" + System.currentTimeMillis());
numJobs = 1;
diff --git a/src/java/org/apache/nutch/fetcher/FetcherJob.java b/src/java/org/apache/nutch/fetcher/FetcherJob.java
index bd06121..82e7a12 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherJob.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherJob.java
@@ -214,7 +214,7 @@
StorageUtils.initReducerJob(currentJob, FetcherReducer.class);
if (numTasks == null || numTasks < 1) {
currentJob.setNumReduceTasks(currentJob.getConfiguration().getInt(
- "mapred.map.tasks", currentJob.getNumReduceTasks()));
+ "mapreduce.job.maps", currentJob.getNumReduceTasks()));
} else {
currentJob.setNumReduceTasks(numTasks);
}
@@ -247,7 +247,7 @@
* @param shouldResume
* @param numTasks
* number of fetching tasks (reducers). If set to < 1 then use the
- * default, which is mapred.map.tasks.
+ * default, which is mapreduce.job.maps.
* @return 0 on success
* @throws Exception
*/
@@ -267,7 +267,7 @@
* @param shouldResume
* @param numTasks
* number of fetching tasks (reducers). If set to < 1 then use the
- * default, which is mapred.map.tasks.
+ * default, which is mapreduce.job.maps.
* @param stmDetect
* If set true, sitemap detection is run.
* @param sitemap
@@ -326,7 +326,7 @@
+ " -crawlId <id> - the id to prefix the schemas to operate on, \n \t \t (default: storage.crawl.id)\n"
+ " -threads N - number of fetching threads per task\n"
+ " -resume - resume interrupted job\n"
- + " -numTasks N - if N > 0 then use this many reduce tasks for fetching \n \t \t (default: mapred.map.tasks)"
+ + " -numTasks N - if N > 0 then use this many reduce tasks for fetching \n \t \t (default: mapreduce.job.maps)"
+ " -sitemap - only sitemap files are fetched, defaults to false"
+ " -stmDetect - sitemap files are detected from robot.txt file";
diff --git a/src/java/org/apache/nutch/fetcher/FetcherReducer.java b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
index 364bf7e..4f71954 100644
--- a/src/java/org/apache/nutch/fetcher/FetcherReducer.java
+++ b/src/java/org/apache/nutch/fetcher/FetcherReducer.java
@@ -842,7 +842,7 @@
ft.start();
}
// select a timeout that avoids a task timeout
- final long timeout = conf.getInt("mapred.task.timeout", 10 * 60 * 1000) / 2;
+ final long timeout = conf.getInt("mapreduce.task.timeout", 10 * 60 * 1000) / 2;
// Used for threshold check, holds pages and bytes processed in the last sec
float pagesLastSec;
diff --git a/src/java/org/apache/nutch/indexer/CleaningJob.java b/src/java/org/apache/nutch/indexer/CleaningJob.java
index eaa8420..4869efa 100644
--- a/src/java/org/apache/nutch/indexer/CleaningJob.java
+++ b/src/java/org/apache/nutch/indexer/CleaningJob.java
@@ -138,7 +138,7 @@
getConf().setBoolean(ARG_COMMIT, (Boolean) args.get(ARG_COMMIT));
currentJob = NutchJob.getInstance(getConf(), "CleaningJob");
currentJob.getConfiguration().setClass(
- "mapred.output.key.comparator.class", StringComparator.class,
+ "mapreduce.job.output.key.comparator.class", StringComparator.class,
RawComparator.class);
Collection<WebPage.Field> fields = getFields(currentJob);
diff --git a/src/java/org/apache/nutch/indexer/IndexingJob.java b/src/java/org/apache/nutch/indexer/IndexingJob.java
index f98d40d..ec04384 100644
--- a/src/java/org/apache/nutch/indexer/IndexingJob.java
+++ b/src/java/org/apache/nutch/indexer/IndexingJob.java
@@ -145,7 +145,7 @@
Job job = NutchJob.getInstance(conf, "Indexer");
// TODO: Figure out why this needs to be here
- job.getConfiguration().setClass("mapred.output.key.comparator.class",
+ job.getConfiguration().setClass("mapreduce.job.output.key.comparator.class",
StringComparator.class, RawComparator.class);
Collection<WebPage.Field> fields = getFields(job);