# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# Pig configuration file. All values can be overwritten by command line arguments.
# Use the "-h properties" command to see descriptions of the properties
# log4jconf log4j configuration file
# log4jconf=./conf/log4j.properties
# a file that contains a Pig script
#file=
# jar files to load, colon separated
#jar=
#verbose: print all log messages to screen (by default only INFO and above are printed to screen)
#verbose=true
#exectype local|mapreduce, mapreduce is the default
#exectype=local
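#For example, exectype can also be overridden from the command line (myscript.pig is a placeholder name):
#  pig -x local myscript.pig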
#the default timezone: if it is not set, the default timezone for this host is used.
#the correct timezone format is the UTC offset: e.g., +08:00.
#pig.datetime.default.tz=
#pig.logfile=
#Do not spill temp files smaller than this size (bytes)
#pig.spill.size.threshold=5000000
#EXPERIMENTAL: Activate garbage collection when spilling a file bigger than this size (bytes)
#This should help reduce the number of files being spilled.
#pig.spill.gc.activation.size=40000000
#the following two parameters help estimate the number of reducers
#pig.exec.reducers.bytes.per.reducer=1000000000
#pig.exec.reducers.max=999
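#As a rough worked example (assuming the default estimator divides the total input
#size by bytes.per.reducer and caps the result at reducers.max): a job reading 25 GB
#of input with the defaults above would get min(ceil(25000000000 / 1000000000), 999) = 25 reducers.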
#Logging properties
#verbose=false
#brief=false
#debug=INFO
#aggregate.warning=true
#Performance tuning properties
#pig.cachedbag.memusage=0.2
#pig.skewedjoin.reduce.memusage=0.3
#pig.exec.nocombiner=false
#opt.multiquery=true
#Following parameters are for configuring intermediate storage format
#Supported storage types are seqfile and tfile
#Supported codec types: tfile supports gz(gzip) and lzo, seqfile supports gz(gzip), lzo, snappy, bzip2
#pig.tmpfilecompression=false
#pig.tmpfilecompression.storage=seqfile
#pig.tmpfilecompression.codec=gz
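#For example (an illustrative combination, not a recommendation), to compress
#intermediate results as lzo-compressed tfiles:
#pig.tmpfilecompression=true
#pig.tmpfilecompression.storage=tfile
#pig.tmpfilecompression.codec=lzo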
#pig.noSplitCombination=true
#pig.exec.mapPartAgg=false
#pig.exec.mapPartAgg.minReduction=10
#exectype=mapreduce
#pig.additional.jars=<comma separated list of jars>
#udf.import.list=<comma separated list of imports>
#stop.on.failure=false
#Use this option only when your Pig job will otherwise die because it
#uses more counters than the Hadoop-configured limit
#pig.disable.counter=true
# By default, pig will allow 1GB of data to be replicated using
# the distributed cache when doing fragment-replicated join.
# pig.join.replicated.max.bytes=1000000000
# Use this option to turn on UDF timers. This will cause two
# counters to be tracked for every UDF and LoadFunc in your script:
# approx_microsecs measures approximate time spent inside a UDF
# approx_invocations reports the approximate number of times the UDF was invoked
# pig.udf.profile=false
#When enabled, 'describe' prints a multi-line formatted schema
#(similar to indented JSON) rather than a single line.
#pig.pretty.print.schema=true
#pig.sql.type=hcat
hcat.bin=/usr/local/hcat/bin/hcat
############################ SchemaTuple ############################
# Setting this value will turn on the SchemaTuple feature (PIG-2632)
# This will attempt to use code generation to create more efficient Tuple
# implementations within the Pig code. This can lead to CPU, serialization, and
# memory benefits (currently, the potential memory benefits are the largest).
# This parameter will enable the optimization in all available cases
#pig.schematuple=true
# Individual cases can be turned off by uncommenting the corresponding settings
# below. They are all off by default, but are all turned on when pig.schematuple
# is set to true.
# This will disable SchemaTuples for UDFs. Currently, the input
# to UDFs will be SchemaTuples.
#pig.schematuple.udf=false
# This is currently not implemented. In the future, LoadFuncs with known
# schemas should output SchemaTuples.
#pig.schematuple.load=false
# This will use SchemaTuples in replicated joins. The potential memory saving
# here is significant. It will use SchemaTuples when it builds the HashMap of
# the join key to related values.
#pig.schematuple.fr_join=false
# In the current implementation of merge join, all of the Tuples in the left relation
# that share a given key will be stored in a List in memory. This will use SchemaTuples
# instead in that List.
#pig.schematuple.merge_join=false
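# For example (an illustrative configuration consistent with the description above),
# to enable SchemaTuples everywhere except in merge joins:
# pig.schematuple=true
# pig.schematuple.merge_join=false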
#####################################################################
##### Set up optional Pig Progress Notification Listener ############
# Note that only one PPNL can be set up. If you need several, write a PPNL that will chain them.
# pig.notification.listener = <fully qualified class name of a PPNL implementation>
# Optionally, you can supply a single String argument to pass to your PPNL.
# pig.notification.listener.arg = <somevalue>
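# For example (com.example.MyChainingPPNL and its argument are hypothetical):
# pig.notification.listener = com.example.MyChainingPPNL
# pig.notification.listener.arg = /var/log/pig/ppnl.log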
#####################################################################
########## Override the default Reducer Estimator logic #############
# By default, the logic to estimate the number of reducers to use for a given job lives in:
# org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.InputSizeReducerEstimator
# This logic can be replaced by implementing the following interface:
# org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigReducerEstimator
# This class will be invoked to estimate the number of reducers to use.
# pig.exec.reducer.estimator = <fully qualified class name of a PigReducerEstimator implementation>
# Optionally, you can supply a single String argument to pass to your PigReducerEstimator.
# pig.exec.reducer.estimator.arg = <somevalue>
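# For example (com.example.FixedReducerEstimator is a hypothetical implementation
# that always returns the reducer count given in its argument):
# pig.exec.reducer.estimator = com.example.FixedReducerEstimator
# pig.exec.reducer.estimator.arg = 48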
#####################################################################
###### Override the default Pig Stats Output Size Reader logic ######
# By default, the size of the reducers' output is computed as the total size of
# the output files. But since not every storage is file-based, this logic is not
# always applicable. If that is the case, the logic can be replaced by
# implementing the following interface:
# org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigStatsOutputSizeReader
# This class will be invoked to compute the size of the reducers' output.
# pig.stats.output.size.reader = <fully qualified class name of a PigStatsOutputSizeReader implementation>
# If you need to register more than one reader, register them as a comma
# separated list. Every reader implements a boolean supports(POStore sto) method.
# When more than one reader is registered, they are consulted in order, and the
# first one whose supports() method returns true will be used.
#
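# For example (both class names are hypothetical PigStatsOutputSizeReader implementations):
# pig.stats.output.size.reader = com.example.HBaseOutputSizeReader,com.example.FileOutputSizeReader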
#####################################################################
#pig.load.default.statements=
#####################################################################
########### Override hadoop configs programmatically ################
# By default, Pig expects hadoop configs (hadoop-site.xml and core-site.xml)
# to be present on the classpath. There are cases when these configs need
# to be passed programmatically, such as while using the PigServer API.
# In such cases, you can override hadoop configs by setting the property
# "pig.use.overriden.hadoop.configs".
#
# When this property is set to true, Pig does not look for hadoop configs
# on the classpath and instead picks them up from the Properties/Configuration
# object passed to it.
# pig.use.overriden.hadoop.configs=false
#
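# A minimal sketch of the PigServer usage described above (the fs.defaultFS value
# and host are illustrative; the constructor assumed is PigServer(ExecType, Properties)):
#   import java.util.Properties;
#   import org.apache.pig.ExecType;
#   import org.apache.pig.PigServer;
#
#   Properties props = new Properties();
#   props.setProperty("pig.use.overriden.hadoop.configs", "true");
#   props.setProperty("fs.defaultFS", "hdfs://namenode:8020");  // illustrative value
#   PigServer pigServer = new PigServer(ExecType.MAPREDUCE, props);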
######################################################################
# Controls whether the script is checked for multiple stores writing
# to the same location. When set to true, execution of the script
# is stopped right away if such a conflict is found.
pig.location.check.strict=false
######################################################################
# This key is used to define the default load func. Pig will fall back
# to PigStorage by default in case this is undefined.
# pig.default.load.func=<fully qualified class name of a LoadFunc implementation>
# For example, pig.default.load.func=org.apache.pig.custom.MyCustomStorage
# This key is used to define the default store func. Pig will fall back
# to PigStorage by default in case this is undefined.
# pig.default.store.func=<fully qualified class name of a StoreFunc implementation>
# For example, pig.default.store.func=org.apache.pig.custom.MyCustomStorage
# This option is used to define whether to support recovery when the
# application master is restarted.
# pig.output.committer.recovery.support=true
# Set this option to true if you need to use the old partition filter optimizer.
# Note: Old filter optimizer PColFilterOptimizer will be deprecated in the future.
# pig.exec.useOldPartitionFilterOptimizer=true