conf/hadoop-default.xml - hadoop - Git at Google

 <?xml version="1.0"?>
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

 <!-- Do not modify this file directly.  Instead, copy entries that you -->
 <!-- wish to modify from this file into hadoop-site.xml and change them -->
 <!-- there.  If hadoop-site.xml does not already exist, create it.      -->

 <configuration>

 <!-- logging properties -->

 <property>
   <name>hadoop.logfile.size</name>
   <value>10000000</value>
   <description>The max size of each log file</description>
 </property>

 <property>
   <name>hadoop.logfile.count</name>
   <value>10</value>
   <description>The max number of log files</description>
 </property>

 <property>
   <name>dfs.namenode.logging.level</name>
   <value>info</value>
   <description>The logging level for dfs namenode. Other values are "dir"
   (trace namespace mutations), "block" (trace block under/over replications
   and block creations/deletions), or "all".</description>
 </property>

 <!-- i/o properties -->

 <property>
   <name>io.sort.factor</name>
   <value>10</value>
   <description>The number of streams to merge at once while sorting
   files.  This determines the number of open file handles.</description>
 </property>

 <property>
   <name>io.sort.mb</name>
   <value>100</value>
   <description>The total amount of buffer memory to use while sorting
   files, in megabytes.  By default, gives each merge stream 1MB, which
   should minimize seeks.</description>
 </property>

 <property>
   <name>io.file.buffer.size</name>
   <value>4096</value>
   <description>The size of buffer for use in sequence files.
   The size of this buffer should probably be a multiple of hardware
   page size (4096 on Intel x86), and it determines how much data is
   buffered during read and write operations.</description>
 </property>

 <property>
   <name>io.bytes.per.checksum</name>
   <value>512</value>
   <description>The number of bytes per checksum.  Must not be larger than
   io.file.buffer.size.</description>
 </property>

 <property>
   <name>io.skip.checksum.errors</name>
   <value>false</value>
   <description>If true, when a checksum error is encountered while
   reading a sequence file, entries are skipped, instead of throwing an
   exception.</description>
 </property>

 <property>
   <name>io.map.index.skip</name>
   <value>0</value>
   <description>Number of index entries to skip between each entry.
   Zero by default. Setting this to values larger than zero can
   facilitate opening large map files using less memory.</description>
 </property>

 <!-- file system properties -->

 <property>
   <name>fs.default.name</name>
   <value>local</value>
   <description>The name of the default file system.  Either the
   literal string "local" or a host:port for DFS.</description>
 </property>

 <property>
   <name>dfs.datanode.port</name>
   <value>50010</value>
   <description>The port number that the dfs datanode server uses as a starting
 	       point to look for a free port to listen on.
 </description>
 </property>

 <property>
   <name>dfs.name.dir</name>
   <value>/tmp/hadoop/dfs/name</value>
   <description>Determines where on the local filesystem the DFS name node
       should store the name table.</description>
 </property>

 <property>
   <name>dfs.data.dir</name>
   <value>/tmp/hadoop/dfs/data</value>
   <description>Determines where on the local filesystem a DFS data node
   should store its blocks.  If this is a comma-delimited
   list of directories, then data will be stored in all named
   directories, typically on different devices.</description>
 </property>

 <property>
   <name>dfs.replication</name>
   <value>3</value>
   <description>Default block replication.
   The actual number of replications can be specified when the file is created.
   The default is used if replication is not specified at create time.
   </description>
 </property>

 <property>
   <name>dfs.replication.max</name>
   <value>512</value>
   <description>Maximal block replication.
   </description>
 </property>

 <property>
   <name>dfs.replication.min</name>
   <value>1</value>
   <description>Minimal block replication.
   </description>
 </property>

 <property>
   <name>dfs.block.size</name>
   <value>67108864</value>
   <description>The default block size for new files.</description>
 </property>

 <property>
   <name>dfs.df.interval</name>
   <value>3000</value>
   <description>Disk usage statistics refresh interval in msec.</description>
 </property>

 <property>
   <name>dfs.client.block.write.retries</name>
   <value>3</value>
   <description>The number of retries for writing blocks to the data nodes,
   before we signal failure to the application.
   </description>
 </property>

 <!-- map/reduce properties -->

 <property>
   <name>mapred.job.tracker</name>
   <value>local</value>
   <description>The host and port that the MapReduce job tracker runs
   at.  If "local", then jobs are run in-process as a single map
   and reduce task.
   </description>
 </property>

 <property>
   <name>mapred.job.tracker.info.port</name>
   <value>50030</value>
   <description>The port that the MapReduce job tracker info webserver runs at.
   </description>
 </property>

 <property>
   <name>mapred.task.tracker.output.port</name>
   <value>50040</value>
   <description>The port number that the MapReduce task tracker output server uses as a starting
                point to look for a free port to listen on.
   </description>
 </property>

 <property>
   <name>mapred.task.tracker.report.port</name>
   <value>50050</value>
   <description>The port number that the MapReduce task tracker report server uses as a starting
                point to look for a free port to listen on.
   </description>
 </property>

 <property>
   <name>mapred.local.dir</name>
   <value>/tmp/hadoop/mapred/local</value>
   <description>The local directory where MapReduce stores intermediate
   data files.  May be a comma-separated list of
   directories on different devices in order to spread disk i/o.
   </description>
 </property>

 <property>
   <name>mapred.system.dir</name>
   <value>/tmp/hadoop/mapred/system</value>
   <description>The shared directory where MapReduce stores control files.
   </description>
 </property>

 <property>
   <name>mapred.temp.dir</name>
   <value>/tmp/hadoop/mapred/temp</value>
   <description>A shared directory for temporary files.
   </description>
 </property>

 <property>
   <name>mapred.map.tasks</name>
   <value>2</value>
   <description>The default number of map tasks per job.  Typically set
   to a prime several times greater than the number of available hosts.
   Ignored when mapred.job.tracker is "local".
   </description>
 </property>

 <property>
   <name>mapred.reduce.tasks</name>
   <value>1</value>
   <description>The default number of reduce tasks per job.  Typically set
   to a prime close to the number of available hosts.  Ignored when
   mapred.job.tracker is "local".
   </description>
 </property>

 <property>
   <name>mapred.reduce.parallel.copies</name>
   <value>5</value>
   <description>The default number of parallel transfers run by reduce
   during the copy(shuffle) phase.
   </description>
 </property>

 <property>
   <name>mapred.task.timeout</name>
   <value>600000</value>
   <description>The number of milliseconds before a task will be
   terminated if it neither reads an input, writes an output, nor
   updates its status string.
   </description>
 </property>

 <property>
   <name>mapred.tasktracker.tasks.maximum</name>
   <value>2</value>
   <description>The maximum number of tasks that will be run
   simultaneously by a task tracker.
   </description>
 </property>

 <property>
   <name>mapred.child.java.opts</name>
   <value>-Xmx200m</value>
   <description>Java opts for the task tracker child processes.  Subsumes
   'mapred.child.heap.size' (If a mapred.child.heap.size value is found
   in a configuration, its maximum heap size will be used and a warning
   emitted that heap.size has been deprecated). Also, the following symbols,
   if present, will be interpolated: @taskid@ is replaced by current TaskID;
   and @port@ will be replaced by mapred.task.tracker.report.port + 1 (A second
   child will fail with a port-in-use if mapred.tasktracker.tasks.maximum is
   greater than one). Any other occurrences of '@' will go unchanged. For
   example, to enable verbose gc logging to a file named for the taskid in
   /tmp and to set the heap maximum to be a gigabyte, pass a 'value' of:

         -Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc
   </description>
 </property>

 <property>
   <name>mapred.combine.buffer.size</name>
   <value>100000</value>
   <description>The number of entries the combining collector caches before
   combining them and writing to disk.</description>
 </property>

 <property>
   <name>mapred.speculative.execution</name>
   <value>true</value>
   <description>If true, then multiple instances of some map tasks may
   be executed in parallel.</description>
 </property>

 <property>
   <name>mapred.min.split.size</name>
   <value>0</value>
   <description>The minimum size chunk that map input should be split
   into.  Note that some file formats may have minimum split sizes that
   take priority over this setting.</description>
 </property>

 <property>
   <name>mapred.submit.replication</name>
   <value>10</value>
   <description>The replication level for submitted job files.  This
   should be around the square root of the number of nodes.
   </description>
 </property>

 <property>
   <name>tasktracker.http.threads</name>
   <value>40</value>
   <description>The number of worker threads for the http server. This is
                used for map output fetching.
   </description>
 </property>

 <property>
   <name>tasktracker.http.port</name>
   <value>50060</value>
   <description>The default port for task trackers to use as their http server.
   </description>
 </property>

 <!-- ipc properties -->

 <property>
   <name>ipc.client.timeout</name>
   <value>60000</value>
   <description>Defines the timeout for IPC calls in milliseconds.</description>
 </property>

 </configuration>
	<?xml version="1.0"?>
	<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

	<!-- Do not modify this file directly. Instead, copy entries that you -->
	<!-- wish to modify from this file into hadoop-site.xml and change them -->
	<!-- there. If hadoop-site.xml does not already exist, create it. -->

	<configuration>

	<!-- logging properties -->

	<property>
	<name>hadoop.logfile.size</name>
	<value>10000000</value>
	<description>The max size of each log file</description>
	</property>

	<property>
	<name>hadoop.logfile.count</name>
	<value>10</value>
	<description>The max number of log files</description>
	</property>

	<property>
	<name>dfs.namenode.logging.level</name>
	<value>info</value>
	<description>The logging level for dfs namenode. Other values are "dir"
	(trace namespace mutations), "block" (trace block under/over replications
	and block creations/deletions), or "all".</description>
	</property>

	<!-- i/o properties -->

	<property>
	<name>io.sort.factor</name>
	<value>10</value>
	<description>The number of streams to merge at once while sorting
	files. This determines the number of open file handles.</description>
	</property>

	<property>
	<name>io.sort.mb</name>
	<value>100</value>
	<description>The total amount of buffer memory to use while sorting
	files, in megabytes. By default, gives each merge stream 1MB, which
	should minimize seeks.</description>
	</property>

	<property>
	<name>io.file.buffer.size</name>
	<value>4096</value>
	<description>The size of buffer for use in sequence files.
	The size of this buffer should probably be a multiple of hardware
	page size (4096 on Intel x86), and it determines how much data is
	buffered during read and write operations.</description>
	</property>

	<property>
	<name>io.bytes.per.checksum</name>
	<value>512</value>
	<description>The number of bytes per checksum. Must not be larger than
	io.file.buffer.size.</description>
	</property>

	<property>
	<name>io.skip.checksum.errors</name>
	<value>false</value>
	<description>If true, when a checksum error is encountered while
	reading a sequence file, entries are skipped, instead of throwing an
	exception.</description>
	</property>

	<property>
	<name>io.map.index.skip</name>
	<value>0</value>
	<description>Number of index entries to skip between each entry.
	Zero by default. Setting this to values larger than zero can
	facilitate opening large map files using less memory.</description>
	</property>

	<!-- file system properties -->

	<property>
	<name>fs.default.name</name>
	<value>local</value>
	<description>The name of the default file system. Either the
	literal string "local" or a host:port for DFS.</description>
	</property>

	<property>
	<name>dfs.datanode.port</name>
	<value>50010</value>
	<description>The port number that the dfs datanode server uses as a starting
	point to look for a free port to listen on.
	</description>
	</property>

	<property>
	<name>dfs.name.dir</name>
	<value>/tmp/hadoop/dfs/name</value>
	<description>Determines where on the local filesystem the DFS name node
	should store the name table.</description>
	</property>

	<property>
	<name>dfs.data.dir</name>
	<value>/tmp/hadoop/dfs/data</value>
	<description>Determines where on the local filesystem a DFS data node
	should store its blocks. If this is a comma-delimited
	list of directories, then data will be stored in all named
	directories, typically on different devices.</description>
	</property>

	<property>
	<name>dfs.replication</name>
	<value>3</value>
	<description>Default block replication.
	The actual number of replications can be specified when the file is created.
	The default is used if replication is not specified at create time.
	</description>
	</property>

	<property>
	<name>dfs.replication.max</name>
	<value>512</value>
	<description>Maximal block replication.
	</description>
	</property>

	<property>
	<name>dfs.replication.min</name>
	<value>1</value>
	<description>Minimal block replication.
	</description>
	</property>

	<property>
	<name>dfs.block.size</name>
	<value>67108864</value>
	<description>The default block size for new files.</description>
	</property>

	<property>
	<name>dfs.df.interval</name>
	<value>3000</value>
	<description>Disk usage statistics refresh interval in msec.</description>
	</property>

	<property>
	<name>dfs.client.block.write.retries</name>
	<value>3</value>
	<description>The number of retries for writing blocks to the data nodes,
	before we signal failure to the application.
	</description>
	</property>

	<!-- map/reduce properties -->

	<property>
	<name>mapred.job.tracker</name>
	<value>local</value>
	<description>The host and port that the MapReduce job tracker runs
	at. If "local", then jobs are run in-process as a single map
	and reduce task.
	</description>
	</property>

	<property>
	<name>mapred.job.tracker.info.port</name>
	<value>50030</value>
	<description>The port that the MapReduce job tracker info webserver runs at.
	</description>
	</property>

	<property>
	<name>mapred.task.tracker.output.port</name>
	<value>50040</value>
	<description>The port number that the MapReduce task tracker output server uses as a starting
	point to look for a free port to listen on.
	</description>
	</property>

	<property>
	<name>mapred.task.tracker.report.port</name>
	<value>50050</value>
	<description>The port number that the MapReduce task tracker report server uses as a starting
	point to look for a free port to listen on.
	</description>
	</property>

	<property>
	<name>mapred.local.dir</name>
	<value>/tmp/hadoop/mapred/local</value>
	<description>The local directory where MapReduce stores intermediate
	data files. May be a comma-separated list of
	directories on different devices in order to spread disk i/o.
	</description>
	</property>

	<property>
	<name>mapred.system.dir</name>
	<value>/tmp/hadoop/mapred/system</value>
	<description>The shared directory where MapReduce stores control files.
	</description>
	</property>

	<property>
	<name>mapred.temp.dir</name>
	<value>/tmp/hadoop/mapred/temp</value>
	<description>A shared directory for temporary files.
	</description>
	</property>

	<property>
	<name>mapred.map.tasks</name>
	<value>2</value>
	<description>The default number of map tasks per job. Typically set
	to a prime several times greater than the number of available hosts.
	Ignored when mapred.job.tracker is "local".
	</description>
	</property>

	<property>
	<name>mapred.reduce.tasks</name>
	<value>1</value>
	<description>The default number of reduce tasks per job. Typically set
	to a prime close to the number of available hosts. Ignored when
	mapred.job.tracker is "local".
	</description>
	</property>

	<property>
	<name>mapred.reduce.parallel.copies</name>
	<value>5</value>
	<description>The default number of parallel transfers run by reduce
	during the copy(shuffle) phase.
	</description>
	</property>

	<property>
	<name>mapred.task.timeout</name>
	<value>600000</value>
	<description>The number of milliseconds before a task will be
	terminated if it neither reads an input, writes an output, nor
	updates its status string.
	</description>
	</property>

	<property>
	<name>mapred.tasktracker.tasks.maximum</name>
	<value>2</value>
	<description>The maximum number of tasks that will be run
	simultaneously by a task tracker.
	</description>
	</property>

	<property>
	<name>mapred.child.java.opts</name>
	<value>-Xmx200m</value>
	<description>Java opts for the task tracker child processes. Subsumes
	'mapred.child.heap.size' (If a mapred.child.heap.size value is found
	in a configuration, its maximum heap size will be used and a warning
	emitted that heap.size has been deprecated). Also, the following symbols,
	if present, will be interpolated: @taskid@ is replaced by current TaskID;
	and @port@ will be replaced by mapred.task.tracker.report.port + 1 (A second
	child will fail with a port-in-use if mapred.tasktracker.tasks.maximum is
	greater than one). Any other occurrences of '@' will go unchanged. For
	example, to enable verbose gc logging to a file named for the taskid in
	/tmp and to set the heap maximum to be a gigabyte, pass a 'value' of:

	-Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc
	</description>
	</property>

	<property>
	<name>mapred.combine.buffer.size</name>
	<value>100000</value>
	<description>The number of entries the combining collector caches before
	combining them and writing to disk.</description>
	</property>

	<property>
	<name>mapred.speculative.execution</name>
	<value>true</value>
	<description>If true, then multiple instances of some map tasks may
	be executed in parallel.</description>
	</property>

	<property>
	<name>mapred.min.split.size</name>
	<value>0</value>
	<description>The minimum size chunk that map input should be split
	into. Note that some file formats may have minimum split sizes that
	take priority over this setting.</description>
	</property>

	<property>
	<name>mapred.submit.replication</name>
	<value>10</value>
	<description>The replication level for submitted job files. This
	should be around the square root of the number of nodes.
	</description>
	</property>

	<property>
	<name>tasktracker.http.threads</name>
	<value>40</value>
	<description>The number of worker threads for the http server. This is
	used for map output fetching.
	</description>
	</property>

	<property>
	<name>tasktracker.http.port</name>
	<value>50060</value>
	<description>The default port for task trackers to use as their http server.
	</description>
	</property>

	<!-- ipc properties -->

	<property>
	<name>ipc.client.timeout</name>
	<value>60000</value>
	<description>Defines the timeout for IPC calls in milliseconds.</description>
	</property>

	</configuration>