| # Licensed to the Apache Software Foundation (ASF) under one | 
 | # or more contributor license agreements.  See the NOTICE file | 
 | # distributed with this work for additional information | 
 | # regarding copyright ownership.  The ASF licenses this file | 
 | # to you under the Apache License, Version 2.0 (the | 
 | # "License"); you may not use this file except in compliance | 
 | # with the License.  You may obtain a copy of the License at | 
 | # | 
 | #  http://www.apache.org/licenses/LICENSE-2.0 | 
 | # | 
 | # Unless required by applicable law or agreed to in writing, | 
 | # software distributed under the License is distributed on an | 
 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
 | # KIND, either express or implied.  See the License for the | 
 | # specific language governing permissions and limitations | 
 | # under the License. | 
 |  | 
 | # Pig configuration file. All values can be overwritten by command line | 
 | # arguments; for a description of the properties, run | 
 | # | 
 | #     pig -h properties | 
 | # | 
 |  | 
 | ############################################################################ | 
 | # | 
 | # == Logging properties | 
 | # | 
 |  | 
 | # Location of pig log file. If blank, a file with a timestamped slug | 
 | # ('pig_1399336559369.log') will be generated in the current working directory. | 
 | # | 
 | # pig.logfile= | 
 | # pig.logfile=/tmp/pig-err.log | 
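 | # |
 | # For example (hypothetical path and script name), the log location can also be |
 | # set for a single run from the command line, as with any property in this file: |
 | # |
 | #     pig -Dpig.logfile=/var/log/pig/nightly_rollup.log nightly_rollup.pig |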
 |  | 
 | # Log4j configuration file. Set at runtime with the -4 parameter. The source | 
 | # distribution has a ./conf/log4j.properties.template file you can rename and | 
 | # customize. | 
 | # | 
 | # log4jconf=./conf/log4j.properties | 
 |  | 
 | # Verbose Output. | 
 | # * false (default): print only INFO and above to screen | 
 | # * true: print all log messages to screen |
 | # | 
 | # verbose=false | 
 |  | 
 | # Omit timestamps on log messages. (default: false) | 
 | # | 
 | # brief=false | 
 |  | 
 | # Logging level. debug=OFF|ERROR|WARN|INFO|DEBUG (default: INFO) | 
 | # | 
 | # debug=INFO | 
 |  | 
 | # Roll up warnings across tasks, so that when millions of mappers suddenly cry | 
 | # out in error they are partially silenced. (default, recommended: true) | 
 | # | 
 | # aggregate.warning=true | 
 |  | 
 | # Should DESCRIBE pretty-print its schema? | 
 | # * false (default): print on a single line, suitable for pasting back into your script |
 | # * true (recommended): prints on multiple lines with indentation, much more readable | 
 | # | 
 | # pig.pretty.print.schema=false | 
 |  | 
 | # === Profiling UDFs  === | 
 |  | 
 | # Turn on UDF timers? This will cause two counters to be tracked for every |
 | # UDF and LoadFunc in your script: approx_microsecs measures the approximate |
 | # time spent inside a UDF; approx_invocations reports the approximate number |
 | # of times the UDF was invoked. |
 | # | 
 | # * false (default): do not record timing information of UDFs. | 
 | # * true: report UDF performance. Uses more counters, but gives more insight | 
 | #   into script operation | 
 | # | 
 | # pig.udf.profile=false | 
 |  | 
 | # Specify frequency of profiling (default: every 100th). | 
 | # pig.udf.profile.frequency=100 | 
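 | # |
 | # For example (hypothetical script name), to profile every 50th invocation on a |
 | # single run without editing this file: |
 | # |
 | #     pig -Dpig.udf.profile=true -Dpig.udf.profile.frequency=50 udf_heavy_job.pig |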
 |  | 
 | ############################################################################ | 
 | # | 
 | # == Site-specific Properties | 
 | # | 
 |  | 
 | # Execution Mode. Local mode is much faster, but only suitable for small amounts | 
 | # of data. Local mode interprets paths on the local file system; MapReduce mode |
 | # interprets them on HDFS. Read more under 'Execution Modes' within the Getting Started |
 | # documentation. | 
 | # | 
 | # * mapreduce (default): use the Hadoop cluster defined in your Hadoop config files | 
 | # * local: use local mode | 
 | # * tez: use Tez on Hadoop cluster | 
 | # * tez_local: use Tez local mode | 
 | # | 
 | # exectype=mapreduce | 
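 | # |
 | # The execution type is more commonly chosen per run with the -x flag |
 | # (hypothetical script names): |
 | # |
 | #     pig -x local sanity_check.pig |
 | #     pig -x tez nightly_rollup.pig |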
 |  | 
 | # Bootstrap file with default statements to execute in every Pig job, similar to | 
 | # .bashrc.  If blank, uses the file '.pigbootup' from your home directory; if a |
 | # value is supplied, that file is NOT loaded.  This does not do tilde expansion | 
 | # -- you must supply the full path to the file. | 
 | # | 
 | # pig.load.default.statements= | 
 | # pig.load.default.statements=/home/bob/.pigrc | 
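 | # |
 | # A minimal sketch of what such a bootstrap file might contain (jar path is |
 | # hypothetical) -- ordinary Pig Latin statements that run before every script: |
 | # |
 | #     REGISTER /usr/local/share/pig/piggybank.jar; |
 | #     set default_parallel 20; |
 | #     set pig.pretty.print.schema true; |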
 |  | 
 | # Kill all waiting/running MR jobs upon an MR job failure? (default: false) If |
 | # false, jobs that can proceed independently will do so unless a parent stage | 
 | # fails. If true, the failure of any stage in the script kills all jobs. | 
 | # | 
 | # stop.on.failure=false | 
 |  | 
 | # File containing the pig script to run. Rarely set in the properties file. | 
 | # Commandline: -f | 
 | # | 
 | # file= | 
 |  | 
 | # Jarfile to load, colon separated. Rarely used. | 
 | # | 
 | # jar= | 
 |  | 
 | # Register additional .jar files to use with your Pig script. | 
 | # Most typically used as a command line option (see http://pig.apache.org/docs/r0.12.0/basic.html#register): | 
 | # | 
 | #     pig -Dpig.additional.jars=hdfs://nn.mydomain.com:9020/myjars/my.jar | 
 | # | 
 | # pig.additional.jars=<colon separated list of jars with optional wildcards> | 
 | # pig.additional.jars=/usr/local/share/pig/pig/contrib/piggybank/java/piggybank.jar:/usr/local/share/pig/datafu/datafu-pig/build/libs/datafu-pig-1.2.1.jar | 
 |  | 
 | # Specify potential packages to which a UDF or a group of UDFs belong, | 
 | # eliminating the need to qualify the UDF on every call. See | 
 | # http://pig.apache.org/docs/r0.12.0/udf.html#use-short-names | 
 | # | 
 | # Commandline use: | 
 | # | 
 | #     pig \ | 
 | #       -Dpig.additional.jars=$PIG_HOME/contrib/piggybank/java/piggybank.jar:$PIG_HOME/../datafu/datafu-pig/build/libs/datafu-pig-1.2.1.jar \ | 
 | #       -Dudf.import.list=org.apache.pig.piggybank.evaluation:datafu.pig.util \ | 
 | #       happy_job.pig | 
 | # | 
 | # udf.import.list=<colon separated list of imports> | 
 | # udf.import.list=org.apache.pig.piggybank.evaluation:datafu.pig.bags:datafu.pig.hash:datafu.pig.stats:datafu.pig.util | 
 |  | 
 | # | 
 | # Reuse jars across jobs run by the same user? (default: false) If enabled, jars | 
 | # are placed in ${pig.user.cache.location}/${user.name}/.pigcache. Since most | 
 | # jars change infrequently, this gives a minor speedup. | 
 | # | 
 | # pig.user.cache.enabled=false | 
 |  | 
 | # Base path for storing jars cached by the pig.user.cache.enabled feature. (default: /tmp) | 
 | # | 
 | # pig.user.cache.location=/tmp | 
 |  | 
 | # Replication factor for cached jars. If not specified mapred.submit.replication | 
 | # is used, whose default is 10. | 
 | # | 
 | # pig.user.cache.replication=10 | 
 |  | 
 | # Default UTC offset. (default: the host's current UTC offset) Supply a UTC | 
 | # offset in Java's timezone format: e.g., +08:00. | 
 | # | 
 | # pig.datetime.default.tz= | 
 |  | 
 | # Path to download the artifacts when registering ivy coordinates. This defaults | 
 | # to the directory grape uses for downloading libraries. | 
 | # (default: ~/.groovy/grapes) | 
 | # | 
 | # pig.artifacts.download.location= | 
 |  | 
 | ############################################################################ | 
 | # | 
 | # Memory impacting properties | 
 | # | 
 |  | 
 | # Amount of memory (as fraction of heap) allocated to bags before a spill is | 
 | # forced. Default is 0.2, meaning 20% of available memory. Note that this memory | 
 | # is shared across all large bags used by the application. See | 
 | # http://pig.apache.org/docs/r0.12.0/perf.html#memory-management | 
 | # | 
 | # pig.cachedbag.memusage=0.2 | 
 |  | 
 | # Don't spill bags smaller than this size (bytes). Default: 5000000, or about |
 | # 5MB. Usually, more spilling means longer runtime, so you may want to tune this |
 | # according to the heap size of each task and so forth. |
 | # | 
 | # pig.spill.size.threshold=5000000 | 
 |  | 
 | # EXPERIMENTAL: If a file bigger than this size (bytes) is spilled -- thus | 
 | # freeing a bunch of RAM -- tell the JVM to perform garbage collection.  This |
 | # should help reduce the number of files being spilled, but causes more-frequent | 
 | # garbage collection. Default: 40000000 (about 40 MB) | 
 | # | 
 | # pig.spill.gc.activation.size=40000000 | 
 |  | 
 | # A spill is triggered when the Old Generation heap usage exceeds the usage or |
 | # collection threshold. For bigger heap sizes, using a fixed size for the collection |
 | # and usage thresholds utilizes memory better than a percentage of the heap, so the |
 | # thresholds are calculated as |
 | #     usage threshold      = Max(HeapSize * pig.spill.memory.usage.threshold.fraction, HeapSize - pig.spill.unused.memory.threshold.size) |
 | #     collection threshold = Max(HeapSize * pig.spill.collection.threshold.fraction, HeapSize - pig.spill.unused.memory.threshold.size) |
 |  | 
 | # pig.spill.memory.usage.threshold.fraction=0.7 | 
 | # pig.spill.collection.threshold.fraction=0.7  | 
 | # pig.spill.unused.memory.threshold.size=367001600 | 
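 | # |
 | # A worked example, assuming a 4 GB task heap and the defaults above: the usage |
 | # threshold is Max(4 GB * 0.7, 4 GB - 350 MB) = about 3.66 GB, so for large heaps |
 | # the fixed unused-memory term dominates, as intended. |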
 |  | 
 | # Maximum amount of data to replicate using the distributed cache when doing | 
 | # fragment-replicated join. (default: 1000000000, about 1GB) Consider increasing | 
 | # this in a production environment, but carefully. | 
 | # | 
 | # pig.join.replicated.max.bytes=1000000000 | 
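 | # |
 | # Fragment-replicated joins are requested per join in Pig Latin (relation names |
 | # are hypothetical); the replicated relation must fit within this limit: |
 | # |
 | #     joined = JOIN big_events BY user_id, small_users BY user_id USING 'replicated'; |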
 |  | 
 | # Fraction of heap available for the reducer to perform a skewed join. A low | 
 | # fraction forces Pig to use more reducers, but increases the copying cost. See | 
 | # http://pig.apache.org/docs/r0.12.0/perf.html#skewed-joins | 
 | # | 
 | # pig.skewedjoin.reduce.memusage=0.3 | 
 |  | 
 | # | 
 | # === SchemaTuple === | 
 | # | 
 | # The SchemaTuple feature (PIG-2632) uses a tuple's schema (when known) to | 
 | # generate a custom Java class to hold records. Otherwise, tuples are loaded as | 
 | # a plain list that is unaware of its contents' schema -- and so each element | 
 | # has to be wrapped as a Java object on its own. This can provide more efficient | 
 | # CPU utilization, serialization, and most of all memory usage. | 
 | # | 
 | # This feature is considered experimental and is off by default. You can | 
 | # selectively enable it for specific operations using pig.schematuple.udf, | 
 | # pig.schematuple.load, pig.schematuple.fr_join and pig.schematuple.merge_join | 
 | # | 
 |  | 
 | # Enable the SchemaTuple optimization in all available cases? (default: false; recommended: true) | 
 | # | 
 | # pig.schematuple=false | 
 |  | 
 | # EXPERIMENTAL: Use SchemaTuples with UDFs (default: value of pig.schematuple). | 
 | # pig.schematuple.udf=false | 
 |  | 
 | # EXPERIMENTAL, CURRENTLY NOT IMPLEMENTED, but in the future, LoadFuncs with |
 | # known schemas should output SchemaTuples. (default: value of pig.schematuple) | 
 | # pig.schematuple.load=false | 
 |  | 
 | # EXPERIMENTAL: Use SchemaTuples in replicated joins. The potential memory | 
 | # saving here is significant. (default: value of pig.schematuple) | 
 | # pig.schematuple.fr_join=false | 
 |  | 
 | # EXPERIMENTAL: Use SchemaTuples in merge joins. (default: value of pig.schematuple). | 
 | # pig.schematuple.merge_join=false | 
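 | # |
 | # For example (hypothetical script name), to try the optimization everywhere on a |
 | # single run, or only for replicated joins via the selective flag above: |
 | # |
 | #     pig -Dpig.schematuple=true wide_schema_job.pig |
 | #     pig -Dpig.schematuple.fr_join=true wide_schema_job.pig |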
 |  | 
 | ############################################################################ | 
 | # | 
 | # Serialization options | 
 | # | 
 |  | 
 | # Omit empty part files from the output? (default: false) | 
 | # | 
 | # * false (default): each reducer generates an output file, even if its output is empty |
 | # * true (recommended): do not generate zero-byte part files | 
 | # | 
 | # The default behavior of MapReduce is to generate an empty file for no data, so | 
 | # Pig follows that. But many small files can cause annoying extra map tasks and | 
 | # put load on the HDFS, so consider setting this to 'true'. |
 | # | 
 | # pig.output.lazy=false | 
 |  | 
 | # | 
 | # === Tempfile Handling | 
 | # | 
 |  | 
 | # EXPERIMENTAL: Storage format for temporary files generated by intermediate |
 | # stages of Pig jobs. This can provide significant speed increases for certain |
 | # codecs, as reducing the amount of data transferred to and from disk can more |
 | # than make up for the cost of compression/decompression. We recommend that you |
 | # set up LZO compression in Hadoop and specify tfile storage. |
 | # | 
 | # Compress temporary files? | 
 | # * false (default): do not compress | 
 | # * true (recommended): compress temporary files. | 
 | # | 
 | # pig.tmpfilecompression=false | 
 | # pig.tmpfilecompression=true | 
 |  | 
 | # Tempfile storage container type. | 
 | # | 
 | # * tfile (default, recommended): more efficient, but only supports gz(gzip) and lzo compression. |
 | #   https://issues.apache.org/jira/secure/attachment/12396286/TFile%20Specification%2020081217.pdf | 
 | # * seqfile: only supports gz(gzip), lzo, snappy, and bzip2 compression | 
 | # | 
 | # pig.tmpfilecompression.storage=tfile | 
 |  | 
 | # Codec types for intermediate job files. tfile supports gz(gzip) and lzo; |
 | # seqfile supports gz(gzip), lzo, snappy, and bzip2. |
 | # | 
 | # * lzo (recommended with caveats): moderate compression, low cpu burden; | 
 | #   typically leads to a noticeable speedup. Best default choice, but you must | 
 | #   set up LZO independently due to license incompatibility | 
 | # * snappy: moderate compression, low cpu burden; typically leads to a noticeable speedup. |
 | # * gz (default): higher compression, high CPU burden. Typically leads to a noticeable slowdown. | 
 | # * bzip2: most compression, major CPU burden. Typically leads to a noticeable slowdown. | 
 | # | 
 | # pig.tmpfilecompression.codec=gzip | 
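 | # |
 | # A typical combination (a sketch, assuming LZO has been set up in Hadoop |
 | # separately): |
 | # |
 | # pig.tmpfilecompression=true |
 | # pig.tmpfilecompression.storage=tfile |
 | # pig.tmpfilecompression.codec=lzo |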
 |  | 
 | # | 
 | # === Split Combining | 
 | # | 
 |  | 
 | # | 
 | # Should pig try to combine small files for fewer map tasks? This improves the | 
 | # efficiency of jobs with many small input files, reduces the overhead on the | 
 | # jobtracker, and reduces the number of output files a map-only job | 
 | # produces. However, it only works with certain loaders and increases non-local | 
 | # map tasks. See http://pig.apache.org/docs/r0.12.0/perf.html#combine-files | 
 | # | 
 | # * false (default, recommended): _do_ combine files | 
 | # * true: do not combine files | 
 | # | 
 | # pig.noSplitCombination=false | 
 |  | 
 | # | 
 | # Size, in bytes, of data to be processed by a single map. Smaller files are | 
 | # combined until this size is reached. If unset, defaults to the file system's |
 | # default block size. | 
 | # | 
 | # pig.maxCombinedSplitSize= | 
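 | # |
 | # For example, to combine small files into roughly 256 MB splits |
 | # (256 * 1024 * 1024 bytes): |
 | # |
 | # pig.maxCombinedSplitSize=268435456 |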
 |  | 
 | # ########################################################################### | 
 | # | 
 | # Execution options | 
 | # | 
 |  | 
 | # Should pig omit combiners? (default, recommended: false -- meaning pig _will_ | 
 | # use combiners) | 
 | # | 
 | # When combiners work well, they eliminate a significant amount of | 
 | # data. However, if they do not eliminate much data -- say, a DISTINCT operation | 
 | # that only eliminates 5% of the records -- they add a noticeable overhead to | 
 | # the job. So the recommended default is false (use combiners), selectively | 
 | # disabling them per-job: | 
 | # | 
 | #     pig -Dpig.exec.nocombiner=true distinct_but_not_too_much.pig | 
 | # | 
 | # pig.exec.nocombiner=false | 
 |  | 
 | # Enable or disable use of combiners in the reducer shuffle-merge phase only |
 | # (pig.exec.nocombiner turns off combiners for both map and reduce phases). |
 | # Valid values are auto, true, and false. The default is auto, in which Pig turns |
 | # off the combiner on a per-combine-plan basis when bags are present in that plan. |
 | # A value of true or false applies to all combine plans in the script. |
 | # Currently this only applies to Tez, as MapReduce does not run combiners in the reducer (MAPREDUCE-5221). |
 |  | 
 | # pig.exec.nocombiner.reducer=auto | 
 |  | 
 | # EXPERIMENTAL: Aggregate records in map task before sending to the combiner? | 
 | # (default: false, 10; recommended: true, 10). In cases where there is a massive | 
 | # reduction of data in the aggregation step, pig can do a first pass of | 
 | # aggregation before the data even leaves the mapper, saving much serialization | 
 | # overhead. It's off by default but can give a major improvement to | 
 | # group-and-aggregate operations. Pig skips partial aggregation unless reduction | 
 | # is better than a factor of minReduction (default: 10). See | 
 | # http://pig.apache.org/docs/r0.12.0/perf.html#hash-based-aggregation | 
 | # | 
 | # pig.exec.mapPartAgg=false | 
 | # pig.exec.mapPartAgg.minReduction=10 | 
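 | # |
 | # For example (hypothetical script name), to try in-map aggregation on one |
 | # group-and-aggregate job with a lower required reduction factor: |
 | # |
 | #     pig -Dpig.exec.mapPartAgg=true -Dpig.exec.mapPartAgg.minReduction=5 daily_counts.pig |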
 |  | 
 | # | 
 | # === Control how many reducers are used. | 
 | # | 
 |  | 
 | # Estimate number of reducers naively using a fixed amount of data per | 
 | # reducer. Optimally, you have both fewer reducers than available reduce slots, | 
 | # and reducers that are neither getting too little data (less than a half-GB or | 
 | # so) nor too much data (more than 2-3 times the reducer child process max heap | 
 | # size). The default of 1000000000 (about 1GB) is probably low for a production | 
 | # cluster -- however it's much worse to set this too high (reducers spill many | 
 | # times over in group-sort) than too low (delay waiting for reduce slots). | 
 | # | 
 | # pig.exec.reducers.bytes.per.reducer=1000000000 | 
 |  | 
 | # | 
 | # Don't ever use more than this many reducers. (default: 999) | 
 | # | 
 | # pig.exec.reducers.max=999 | 
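 | # |
 | # A worked example: with the default 1000000000 bytes per reducer, a job reading |
 | # about 50 GB of input is estimated at roughly 50 reducers, capped by |
 | # pig.exec.reducers.max. An explicit PARALLEL clause or 'set default_parallel' |
 | # in the script overrides the estimate. |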
 |  | 
 | # | 
 | # === Local mode for small jobs | 
 | # | 
 |  | 
 | # EXPERIMENTAL: Use local mode for small jobs? If true, jobs with input data | 
 | # size smaller than pig.auto.local.input.maxbytes bytes and one or no reducers | 
 | # are run in local mode, which is much faster. Note that file paths are still | 
 | # interpreted as pig.exectype implies. | 
 | # | 
 | # * true (recommended): allow local mode for small jobs, which is much faster. | 
 | # * false (default): always use pig.exectype. | 
 | # | 
 | # pig.auto.local.enabled=false | 
 |  | 
 | # | 
 | # Definition of a small job for the pig.auto.local.enabled feature. Only jobs |
 | # with less than this many bytes of input are candidates to run locally (default: |
 | # 100000000 bytes, about 100MB) |
 | # | 
 | # pig.auto.local.input.maxbytes=100000000 | 
 |  | 
 |  | 
 | # | 
 | # Should Pig use Hadoop's BZip2Codec for bzip2 input? (for PigStorage and TextLoader) |
 | # Only available for Hadoop 2.x and later; ignored for others. (default: true) |
 | # | 
 | # pig.bzip.use.hadoop.inputformat=true | 
 |  | 
 |  | 
 | ############################################################################ | 
 | # | 
 | # Security Features | 
 | # | 
 |  | 
 | # Comma-delimited list of commands/operators that are disallowed. This security | 
 | # feature can be used by administrators to block use of certain commands by | 
 | # users. | 
 | # | 
 | # * <blank> (default): all commands and operators are allowed. | 
 | # * fs,set (for example): block all filesystem commands and config changes from pig scripts. | 
 | # | 
 | # pig.blacklist= | 
 | # pig.blacklist=fs,set | 
 |  | 
 | # Comma-delimited list of the only commands/operators that are allowed. This | 
 | # security feature can be used by administrators to block use of certain | 
 | # commands by users. | 
 | # | 
 | # * <blank> (default): all commands and operators not on the pig.blacklist are allowed. | 
 | # * load,store,filter,group: only LOAD, STORE, FILTER, and GROUP are allowed |
 | #   from pig scripts. All other commands and operators will fail. |
 | # | 
 | # pig.whitelist= | 
 | # pig.whitelist=load,store,filter,group | 
 |  | 
 | ##################################################################### | 
 | # | 
 | # Advanced Site-specific Customizations | 
 | # | 
 |  | 
 | # Remove intermediate output files? | 
 | # | 
 | # * true (default, recommended): remove the files | 
 | # * false: do NOT remove the files. You must clean them up yourself. | 
 | # | 
 | # Keeping them is useful for advanced debugging, but can be dangerous -- you | 
 | # must clean them up yourself.  Inspect the intermediate outputs with | 
 | # | 
 | #     LOAD '/path/to/tmp/file' USING org.apache.pig.impl.io.TFileStorage(); | 
 | # | 
 | # (Or ...SequenceFileInterStorage if pig.tmpfilecompression.storage is seqfile) | 
 | # | 
 | # pig.delete.temp.files=true | 
 |  | 
 | # EXPERIMENTAL: A Pig Progress Notification Listener (PPNL) lets you wire pig's | 
 | # progress into your visibility stack. To use a PPNL, supply the fully qualified | 
 | # class name of a PPNL implementation. Note that only one PPNL can be set up, so | 
 | # if you need several, write a PPNL that will chain them. | 
 | # | 
 | # See https://github.com/twitter/ambrose for a pretty awesome one of these | 
 | # | 
 | # pig.notification.listener=<fully qualified class name of a PPNL implementation> | 
 |  | 
 | # String argument to pass to your PPNL constructor (optional). Only a single | 
 | # string value is allowed. (default none) | 
 | # | 
 | # pig.notification.listener.arg=<somevalue> | 
 |  | 
 | # EXPERIMENTAL: Class invoked to estimate the number of reducers to use. | 
 | # (default: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.InputSizeReducerEstimator) | 
 | # | 
 | # If you don't know how or why to write a PigReducerEstimator, you're unlikely | 
 | # to use this. By default, the naive mapReduceLayer.InputSizeReducerEstimator is | 
 | # used, but you can specify anything implementing the interface | 
 | # org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigReducerEstimator | 
 | # | 
 | # pig.exec.reducer.estimator=<fully qualified class name of a PigReducerEstimator implementation> | 
 |  | 
 | # Optional String argument to pass to your PigReducerEstimator. (default: none; | 
 | # a single String argument is allowed). | 
 | # | 
 | # pig.exec.reducer.estimator.arg=<somevalue> | 
 |  | 
 | # Class invoked to report the size of reducers output. By default, the reducers' | 
 | # output is computed as the total size of output files. But not every storage is | 
 | # file-based, and so this logic can be replaced by implementing the interface | 
 | # org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigStatsOutputSizeReader | 
 | # If you need to register more than one reader, you can register them as a |
 | # comma-separated list. Every reader implements a boolean supports(POStore sto) |
 | # method. When there is more than one reader, they are consulted in order, and |
 | # the first one whose supports() method returns true is used. |
 | # | 
 | # pig.stats.output.size.reader=<fully qualified class name of a PigStatsOutputSizeReader implementation> | 
 | # pig.stats.output.size.reader.unsupported=<comma separated list of StoreFuncs that are not supported by this reader> | 
 |  | 
 | # By default, Pig retrieves TaskReports for every launched task to compute | 
 | # various job statistics. But this can cause OOM if the number of tasks is | 
 | # large. In such cases, you can disable it by setting this property to true. |
 | # pig.stats.notaskreport=false | 
 |  | 
 | # | 
 | # Override hadoop configs programmatically |
 | # |
 | # By default, Pig expects hadoop configs (hadoop-site.xml and core-site.xml) |
 | # to be present on the classpath. There are cases when these configs need to be |
 | # passed programmatically, such as while using the PigServer API. In such cases, |
 | # you can override hadoop configs by setting the property |
 | # "pig.use.overriden.hadoop.configs". |
 | # |
 | # When this property is set to true, Pig does not look for hadoop configs on the |
 | # classpath and instead picks them up from the Properties/Configuration object |
 | # passed to it. |
 | # | 
 | # pig.use.overriden.hadoop.configs=false | 
 |  | 
 | # Implied LoadFunc for the LOAD operation when no USING clause is | 
 | # present. Supply the fully qualified class name of a LoadFunc | 
 | # implementation. Note: setting this means you will have to modify most code | 
 | # brought in from elsewhere on the web, as people generally omit the USING | 
 | # clause for TSV files. | 
 | # | 
 | # * org.apache.pig.builtin.PigStorage (default): the traditional tab-separated-values LoadFunc | 
 | # * my.custom.udfcollection.MyCustomLoadFunc (for example): use MyCustomLoadFunc instead | 
 | # | 
 | # pig.default.load.func=<fully qualified class name of a LoadFunc implementation> | 
 |  | 
 | # The implied StoreFunc for STORE operations with no USING clause. Supply the | 
 | # fully qualified class name of a StoreFunc implementation. | 
 | # | 
 | # * org.apache.pig.builtin.PigStorage (default): the traditional tab-separated-values StoreFunc. | 
 | # * my.custom.udfcollection.MyCustomStoreFunc (for example): use MyCustomStoreFunc instead | 
 | # | 
 | # pig.default.store.func=<fully qualified class name of a StoreFunc implementation> | 
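 | # |
 | # With the defaults, these two statements are equivalent (path and relation name |
 | # are hypothetical): |
 | # |
 | #     users = LOAD '/data/users.tsv'; |
 | #     users = LOAD '/data/users.tsv' USING org.apache.pig.builtin.PigStorage(); |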
 |  | 
 | # Recover jobs when the application master is restarted? (default: false). This | 
 | # is a Hadoop 2 specific property; enable it to take advantage of AM recovery. | 
 | # | 
 | # pig.output.committer.recovery.support=true | 
 |  | 
 | # Should scripts check to prevent multiple stores writing to the same location? | 
 | # (default: false) When set to true, execution of the script stops right away. |
 | # | 
 | pig.location.check.strict=false | 
 |  | 
 | # In addition to the fs-style commands (rm, ls, etc.) Pig can now execute |
 | # SQL-style DDL commands, e.g. "sql create table pig_test(name string, age int)". |
 | # The only implemented backend is hcat, and luckily that's also the default. | 
 | # | 
 | # pig.sql.type=hcat | 
 |  | 
 | # Path to the hcat executable, for use with pig.sql.type=hcat (default: null) | 
 | # | 
 | hcat.bin=/usr/local/hcat/bin/hcat | 
 |  | 
 | # Enable the ATS hook to log the Pig-specific ATS entry; disable only when the ATS server is not deployed. |
 | pig.ats.enabled=true | 
 |  | 
 | ########################################################################### | 
 | # | 
 | # Overrides for extreme environments | 
 | # | 
 | # (Most people won't have to adjust these parameters) | 
 | # | 
 |  | 
 |  | 
 | # Limit the pig script length placed in the jobconf xml. (default: 10240) |
 | # Extremely long queries can waste space in the JobConf; since its contents are | 
 | # only advisory, the default is fine unless you are retaining it for forensics. | 
 | # | 
 | # pig.script.max.size=10240 | 
 |  | 
 | # Disable use of counters by Pig. Note that the word 'counter' is singular here. | 
 | # | 
 | # * false (default, recommended): do NOT disable counters. | 
 | # * true: disable counters. Set this to true only when your Pig job would |
 | #   otherwise die because it uses more counters than the Hadoop-configured limit. |
 | # | 
 | # pig.disable.counter=true | 
 |  | 
 | # Sample size (per-mapper, in number of rows) the ORDER..BY operation's | 
 | # RandomSampleLoader uses to estimate how your data should be | 
 | # partitioned. (default, recommended: 100 rows per task) Increase this if you | 
 | # have exceptionally large input splits and are unhappy with the reducer skew. | 
 | # | 
 | # pig.random.sampler.sample.size=100 | 
 |  | 
 | # Process an entire script at once, reducing the amount of work and number of | 
 | # tasks? (default, recommended: true) See http://pig.apache.org/docs/r0.12.0/perf.html#multi-query-execution | 
 | # | 
 | # MultiQuery optimization is very useful, and so the recommended default is | 
 | # true. You may find that a script fails to compile under MultiQuery. If so, |
 | # disable it at runtime: | 
 | # | 
 | #     pig -no_multiquery script_that_makes_pig_sad.pig | 
 | # | 
 | # opt.multiquery=true | 
 |  | 
 | # For small queries, fetch data directly from the HDFS. (default, recommended: | 
 | # true). If you want to force Pig to launch a MR job, for example when you're | 
 | # testing a live cluster, disable with the -N option. See PIG-3642. | 
 | # | 
 | # opt.fetch=true | 
 |  | 
 | ######################################################################### | 
 | # | 
 | # Error Handling Properties | 
 | # | 
 | # By default, a Pig job fails immediately on encountering an error while writing |
 | # tuples for a store. If you want Pig to allow a certain number of errors before |
 | # failing, you can set this property. If the property is set to true and the |
 | # StoreFunc implements ErrorHandling, it will allow configurable errors based on |
 | # the OutputErrorHandler implementation. |
 | # pig.error-handling.enabled = false |
 | # |
 | # Controls the minimum number of error records allowed for a store |
 | # pig.error-handling.min.error.records = 0 |
 | # |
 | # Set the threshold for the percentage of errors |
 | # pig.error-handling.error.threshold = 0.0f |
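 | # |
 | # A sketch of a more lenient setup (assuming the StoreFunc in use implements |
 | # ErrorHandling; the values are illustrative only): |
 | # |
 | # pig.error-handling.enabled = true |
 | # pig.error-handling.min.error.records = 100 |
 | # pig.error-handling.error.threshold = 0.001f |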
 |  | 
 | ########################################################################### | 
 | # | 
 | # Streaming properties | 
 | # | 
 |  | 
 | # Define which properties will be set in the streaming environment. Set this |
 | # property to a comma-delimited list of property names, and those properties |
 | # will be set in the environment. |
 | # | 
 | # pig.streaming.environment=<comma-delimited list of properties> |
 |  | 
 | # Specify a comma-delimited list of local files to ship to distributed cache for | 
 | # streaming job. | 
 | # | 
 | # pig.streaming.ship.files=<comma-delimited list of local files> | 
 |  | 
 | # Specify a comma-delimited list of remote files to cache on distributed cache | 
 | # for streaming job. | 
 | # | 
 | # pig.streaming.cache.files=<comma-delimited list of remote files> | 
 |  | 
 | # Specify the python command to be used for python streaming udfs. By default, |
 | # python is used, but you can override it with a non-default version such as |
 | # python2.7. | 
 | # | 
 | # pig.streaming.udf.python.command=python | 
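 | # |
 | # A sketch of a streaming setup (file paths are hypothetical): ship a local |
 | # script to the distributed cache and run python UDFs with a specific python: |
 | # |
 | # pig.streaming.ship.files=/home/bob/scripts/cleanup.py |
 | # pig.streaming.udf.python.command=python2.7 |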
 |  | 
 | ########################################################################### | 
 | # | 
 | # Tez specific properties | 
 | # | 
 |  | 
 | # Enable auto/grace parallelism in Tez. The default is true, and these should be |
 | # left enabled unless you encounter a bug in automatic parallelism. |
 | # If pig.tez.auto.parallelism is set to false, 1 is used as the default parallelism. |
 |  | 
 | #pig.tez.auto.parallelism=true | 
 | #pig.tez.grace.parallelism=true | 
 |  | 
 | # Union optimization (pig.tez.opt.union=true) in tez uses vertex groups to store | 
 | # output from different vertices into one final output location. | 
 | # If a StoreFunc's OutputCommitter does not work with multiple vertices |
 | # writing to the same location, then you can disable union optimization just |
 | # for that StoreFunc. Refer to PIG-4649. You can also specify a whitelist of StoreFuncs |
 | # that are known to work with multiple vertices writing to the same location, instead of a blacklist. |
 |  | 
 | #pig.tez.opt.union.unsupported.storefuncs=org.apache.hcatalog.pig.HCatStorer,org.apache.hive.hcatalog.pig.HCatStorer | 
 | #pig.tez.opt.union.supported.storefuncs= | 
 |  | 
 |  | 
 | # For LoadFuncs specified here, Pig reads from the data source only once during a |
 | # sort, instead of loading once for sampling and again for partitioning. |
 | # Use this to avoid hitting external non-filesystem data sources like HBase and Accumulo twice. |
 |       | 
 | pig.sort.readonce.loadfuncs=org.apache.pig.backend.hadoop.hbase.HBaseStorage,org.apache.pig.backend.hadoop.accumulo.AccumuloStorage | 
 |  | 
 | # If set, Pig will override tez.am.launch.cmd-opts and tez.am.resource.memory.mb with |
 | # optimal values even if they are set to a different value. The default value is true. |
 | #pig.tez.configure.am.memory=false |