| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
# Pig configuration file. All values can be overridden by command line
# arguments; for a description of the properties, run
| # |
| # pig -h properties |
| # |
| |
| ############################################################################ |
| # |
| # == Logging properties |
| # |
| |
| # Location of pig log file. If blank, a file with a timestamped slug |
| # ('pig_1399336559369.log') will be generated in the current working directory. |
| # |
| # pig.logfile= |
| # pig.logfile=/tmp/pig-err.log |
| |
| # Log4j configuration file. Set at runtime with the -4 parameter. The source |
| # distribution has a ./conf/log4j.properties.template file you can rename and |
| # customize. |
| # |
| # log4jconf=./conf/log4j.properties |
| |
| # Verbose Output. |
| # * false (default): print only INFO and above to screen |
# * true: print all log messages to screen
| # |
| # verbose=false |
| |
| # Omit timestamps on log messages. (default: false) |
| # |
| # brief=false |
| |
| # Logging level. debug=OFF|ERROR|WARN|INFO|DEBUG (default: INFO) |
| # |
| # debug=INFO |
| |
| # Roll up warnings across tasks, so that when millions of mappers suddenly cry |
| # out in error they are partially silenced. (default, recommended: true) |
| # |
| # aggregate.warning=true |
| |
| # Should DESCRIBE pretty-print its schema? |
# * false (default): print on a single line, suitable for pasting back into your script
# * true (recommended): print on multiple lines with indentation; much more readable
| # |
| # pig.pretty.print.schema=false |
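
# For example (illustrative; the script name is a placeholder), enable
# pretty-printing for a single run from the command line:
#
# pig -Dpig.pretty.print.schema=true myscript.pig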
| |
| # === Profiling UDFs === |
| |
# Turn on UDF timers? This will cause two counters to be tracked for every UDF
# and LoadFunc in your script: approx_microsecs measures the approximate time
# spent inside a UDF; approx_invocations reports the approximate number of
# times the UDF was invoked.
| # |
| # * false (default): do not record timing information of UDFs. |
| # * true: report UDF performance. Uses more counters, but gives more insight |
| # into script operation |
| # |
| # pig.udf.profile=false |
| |
# Specify frequency of profiling (default: every 100th invocation).
| # pig.udf.profile.frequency=100 |
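
# For example (illustrative; the script name is a placeholder), profile UDFs
# for a single run, sampling every 300th invocation:
#
# pig -Dpig.udf.profile=true -Dpig.udf.profile.frequency=300 myscript.pig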
| |
| ############################################################################ |
| # |
| # == Site-specific Properties |
| # |
| |
# Execution Mode. Local mode is much faster, but only suitable for small amounts
# of data. Local mode interprets paths on the local file system; Mapreduce mode
# interprets them on HDFS. Read more under 'Execution Modes' in the Getting
# Started documentation.
| # |
| # * mapreduce (default): use the Hadoop cluster defined in your Hadoop config files |
| # * local: use local mode |
| # * tez: use Tez on Hadoop cluster |
| # * tez_local: use Tez local mode |
| # |
| # exectype=mapreduce |
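
# You can also choose the exectype per run with the -x flag instead of setting
# it here, e.g. (the script name is a placeholder):
#
# pig -x local myscript.pig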
| |
| # Bootstrap file with default statements to execute in every Pig job, similar to |
# .bashrc. If blank, uses the file '.pigbootup' from your home directory; if a
# value is supplied, that file is NOT loaded. This does not do tilde expansion
# -- you must supply the full path to the file.
| # |
| # pig.load.default.statements= |
| # pig.load.default.statements=/home/bob/.pigrc |
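
# An illustrative bootstrap file (contents and paths hypothetical) might
# register a commonly used jar and set a default parallelism for every job:
#
#   REGISTER /usr/local/share/pig/contrib/piggybank/java/piggybank.jar;
#   SET default_parallel 20;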
| |
# Kill all waiting/running MR jobs upon an MR job failure? (default: false) If
| # false, jobs that can proceed independently will do so unless a parent stage |
| # fails. If true, the failure of any stage in the script kills all jobs. |
| # |
| # stop.on.failure=false |
| |
| # File containing the pig script to run. Rarely set in the properties file. |
| # Commandline: -f |
| # |
| # file= |
| |
| # Jarfile to load, colon separated. Rarely used. |
| # |
| # jar= |
| |
| # Register additional .jar files to use with your Pig script. |
| # Most typically used as a command line option (see http://pig.apache.org/docs/r0.12.0/basic.html#register): |
| # |
| # pig -Dpig.additional.jars=hdfs://nn.mydomain.com:9020/myjars/my.jar |
| # |
| # pig.additional.jars=<colon separated list of jars with optional wildcards> |
| # pig.additional.jars=/usr/local/share/pig/pig/contrib/piggybank/java/piggybank.jar:/usr/local/share/pig/datafu/datafu-pig/build/libs/datafu-pig-1.2.1.jar |
| |
| # Specify potential packages to which a UDF or a group of UDFs belong, |
| # eliminating the need to qualify the UDF on every call. See |
| # http://pig.apache.org/docs/r0.12.0/udf.html#use-short-names |
| # |
| # Commandline use: |
| # |
| # pig \ |
| # -Dpig.additional.jars=$PIG_HOME/contrib/piggybank/java/piggybank.jar:$PIG_HOME/../datafu/datafu-pig/build/libs/datafu-pig-1.2.1.jar \ |
| # -Dudf.import.list=org.apache.pig.piggybank.evaluation:datafu.pig.util \ |
| # happy_job.pig |
| # |
| # udf.import.list=<colon separated list of imports> |
| # udf.import.list=org.apache.pig.piggybank.evaluation:datafu.pig.bags:datafu.pig.hash:datafu.pig.stats:datafu.pig.util |
| |
| # |
| # Reuse jars across jobs run by the same user? (default: false) If enabled, jars |
| # are placed in ${pig.user.cache.location}/${user.name}/.pigcache. Since most |
| # jars change infrequently, this gives a minor speedup. |
| # |
| # pig.user.cache.enabled=false |
| |
| # Base path for storing jars cached by the pig.user.cache.enabled feature. (default: /tmp) |
| # |
| # pig.user.cache.location=/tmp |
| |
| # Replication factor for cached jars. If not specified mapred.submit.replication |
| # is used, whose default is 10. |
| # |
| # pig.user.cache.replication=10 |
| |
| # Default UTC offset. (default: the host's current UTC offset) Supply a UTC |
| # offset in Java's timezone format: e.g., +08:00. |
| # |
| # pig.datetime.default.tz= |
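
# For example, to default datetimes without an explicit zone to UTC+8:
#
# pig.datetime.default.tz=+08:00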
| |
# Path where artifacts are downloaded when registering ivy coordinates. This
# defaults to the directory grape uses for downloading libraries.
# (default: ~/.groovy/grapes)
| # |
| # pig.artifacts.download.location= |
| |
| ############################################################################ |
| # |
| # Memory impacting properties |
| # |
| |
| # Amount of memory (as fraction of heap) allocated to bags before a spill is |
| # forced. Default is 0.2, meaning 20% of available memory. Note that this memory |
| # is shared across all large bags used by the application. See |
| # http://pig.apache.org/docs/r0.12.0/perf.html#memory-management |
| # |
| # pig.cachedbag.memusage=0.2 |
| |
# Don't spill bags smaller than this size (bytes). Default: 5000000, or about
# 5MB. Usually, more spilling means a longer runtime, so you may want to tune
# this according to the heap size of each task and so forth.
| # |
| # pig.spill.size.threshold=5000000 |
| |
# EXPERIMENTAL: If a file bigger than this size (bytes) is spilled -- thus
# freeing a bunch of RAM -- tell the JVM to perform garbage collection. This
# should help reduce the number of files being spilled, but causes more-frequent
# garbage collection. Default: 40000000 (about 40 MB)
| # |
| # pig.spill.gc.activation.size=40000000 |
| |
# A spill is triggered when the fraction of Old Generation heap in use exceeds
# the usage or collection threshold. For bigger heap sizes, a fixed size for
# the collection and usage thresholds utilizes memory better than a percentage
# of the heap, so the thresholds are calculated as
#   usage threshold      = Max(HeapSize * pig.spill.memory.usage.threshold.fraction, HeapSize - pig.spill.unused.memory.threshold.size)
#   collection threshold = Max(HeapSize * pig.spill.collection.threshold.fraction, HeapSize - pig.spill.unused.memory.threshold.size)
| |
| # pig.spill.memory.usage.threshold.fraction=0.7 |
| # pig.spill.collection.threshold.fraction=0.7 |
| # pig.spill.unused.memory.threshold.size=367001600 |
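
# Worked example (with the default values above): for a 1GB heap,
#   usage threshold = Max(1073741824 * 0.7, 1073741824 - 367001600)
#                   = Max(751619276, 706740224)   = 751619276  (~70% of heap)
# but for a 4GB heap,
#   usage threshold = Max(4294967296 * 0.7, 4294967296 - 367001600)
#                   = Max(3006477107, 3927965696) = 3927965696 (~91% of heap)
# so on large heaps the fixed ~350MB headroom dominates, leaving more of the
# heap usable before a spill is triggered.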
| |
| # Maximum amount of data to replicate using the distributed cache when doing |
| # fragment-replicated join. (default: 1000000000, about 1GB) Consider increasing |
| # this in a production environment, but carefully. |
| # |
| # pig.join.replicated.max.bytes=1000000000 |
| |
| # Fraction of heap available for the reducer to perform a skewed join. A low |
| # fraction forces Pig to use more reducers, but increases the copying cost. See |
| # http://pig.apache.org/docs/r0.12.0/perf.html#skewed-joins |
| # |
| # pig.skewedjoin.reduce.memusage=0.3 |
| |
| # |
| # === SchemaTuple === |
| # |
# The SchemaTuple feature (PIG-2632) uses a tuple's schema (when known) to
# generate a custom Java class to hold records. Otherwise, tuples are loaded as
# a plain list that is unaware of its contents' schema -- and so each element
# has to be wrapped as a Java object on its own. SchemaTuple can provide more
# efficient CPU utilization and serialization, and above all lower memory usage.
| # |
| # This feature is considered experimental and is off by default. You can |
| # selectively enable it for specific operations using pig.schematuple.udf, |
| # pig.schematuple.load, pig.schematuple.fr_join and pig.schematuple.merge_join |
| # |
| |
| # Enable the SchemaTuple optimization in all available cases? (default: false; recommended: true) |
| # |
| # pig.schematuple=false |
| |
| # EXPERIMENTAL: Use SchemaTuples with UDFs (default: value of pig.schematuple). |
| # pig.schematuple.udf=false |
| |
# EXPERIMENTAL, CURRENTLY NOT IMPLEMENTED, but in the future, LoadFuncs with
# known schemas should output SchemaTuples. (default: value of pig.schematuple)
| # pig.schematuple.load=false |
| |
| # EXPERIMENTAL: Use SchemaTuples in replicated joins. The potential memory |
| # saving here is significant. (default: value of pig.schematuple) |
| # pig.schematuple.fr_join=false |
| |
| # EXPERIMENTAL: Use SchemaTuples in merge joins. (default: value of pig.schematuple). |
| # pig.schematuple.merge_join=false |
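
# For example (illustrative; the script name is a placeholder), enable
# SchemaTuples only for replicated joins in one run:
#
# pig -Dpig.schematuple.fr_join=true my_join_heavy.pig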
| |
| ############################################################################ |
| # |
| # Serialization options |
| # |
| |
| # Omit empty part files from the output? (default: false) |
| # |
# * false (default): each reducer generates an output file, even if it is empty
| # * true (recommended): do not generate zero-byte part files |
| # |
# The default behavior of MapReduce is to generate an empty file for no data,
# and Pig follows suit. But many small files can cause annoying extra map tasks
# and put load on the HDFS, so consider setting this to 'true'.
| # |
| # pig.output.lazy=false |
| |
| # |
| # === Tempfile Handling |
| # |
| |
# EXPERIMENTAL: Storage format for temporary files generated by intermediate
# stages of Pig jobs. This can provide significant speed increases for certain
# codecs, as reducing the amount of data transferred to and from disk can more
# than make up for the cost of compression/decompression. We recommend that you
# set up LZO compression in Hadoop and specify tfile storage.
| # |
| # Compress temporary files? |
| # * false (default): do not compress |
| # * true (recommended): compress temporary files. |
| # |
| # pig.tmpfilecompression=false |
| # pig.tmpfilecompression=true |
| |
| # Tempfile storage container type. |
| # |
# * tfile (default, recommended): more efficient, but only supports gz(gzip) and lzo compression.
#   https://issues.apache.org/jira/secure/attachment/12396286/TFile%20Specification%2020081217.pdf
# * seqfile: supports gz(gzip), lzo, snappy, and bzip2 compression
| # |
| # pig.tmpfilecompression.storage=tfile |
| |
# Codec types for intermediate job files. tfile supports gz(gzip) and lzo;
# seqfile supports gz(gzip), lzo, snappy, and bzip2.
| # |
| # * lzo (recommended with caveats): moderate compression, low cpu burden; |
| # typically leads to a noticeable speedup. Best default choice, but you must |
| # set up LZO independently due to license incompatibility |
# * snappy: moderate compression, low cpu burden; typically leads to a noticeable speedup.
| # * gz (default): higher compression, high CPU burden. Typically leads to a noticeable slowdown. |
| # * bzip2: most compression, major CPU burden. Typically leads to a noticeable slowdown. |
| # |
| # pig.tmpfilecompression.codec=gzip |
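
# For example (illustrative), a snappy setup -- note that snappy requires
# seqfile storage, per the above:
#
# pig.tmpfilecompression=true
# pig.tmpfilecompression.storage=seqfile
# pig.tmpfilecompression.codec=snappy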
| |
| # |
| # === Split Combining |
| # |
| |
| # |
# Should pig try to combine small files into fewer map tasks? This improves the
# efficiency of jobs with many small input files, reduces the overhead on the
# jobtracker, and reduces the number of output files a map-only job
# produces. However, it only works with certain loaders and increases non-local
# map tasks. See http://pig.apache.org/docs/r0.12.0/perf.html#combine-files
#
# Note that the property name is negated:
# * false (default, recommended): _do_ combine files
# * true: do not combine files
| # |
| # pig.noSplitCombination=false |
| |
| # |
# Size, in bytes, of data to be processed by a single map. Smaller files are
# combined until this size is reached. If unset, defaults to the file system's
# default block size.
| # |
| # pig.maxCombinedSplitSize= |
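
# For example (illustrative), to combine small files into splits of up to
# about 256MB:
#
# pig.maxCombinedSplitSize=268435456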
| |
| # ########################################################################### |
| # |
| # Execution options |
| # |
| |
| # Should pig omit combiners? (default, recommended: false -- meaning pig _will_ |
| # use combiners) |
| # |
| # When combiners work well, they eliminate a significant amount of |
| # data. However, if they do not eliminate much data -- say, a DISTINCT operation |
| # that only eliminates 5% of the records -- they add a noticeable overhead to |
| # the job. So the recommended default is false (use combiners), selectively |
| # disabling them per-job: |
| # |
| # pig -Dpig.exec.nocombiner=true distinct_but_not_too_much.pig |
| # |
| # pig.exec.nocombiner=false |
| |
# Enable or disable use of combiners only in the reducer shuffle-merge phase
# (pig.exec.nocombiner turns off the combiner for both map and reduce phases).
# Valid values are auto, true, and false. The default is auto, in which case Pig
# turns off the combiner on a per-combine-plan basis when bags are present in
# that plan; a value of true or false applies to all combine plans in the
# script. Currently this only applies to Tez, as Mapreduce does not run
# combiners in the reducer (MAPREDUCE-5221).
| |
| # pig.exec.nocombiner.reducer=auto |
| |
# EXPERIMENTAL: Aggregate records in the map task before sending to the combiner?
| # (default: false, 10; recommended: true, 10). In cases where there is a massive |
| # reduction of data in the aggregation step, pig can do a first pass of |
| # aggregation before the data even leaves the mapper, saving much serialization |
| # overhead. It's off by default but can give a major improvement to |
| # group-and-aggregate operations. Pig skips partial aggregation unless reduction |
| # is better than a factor of minReduction (default: 10). See |
| # http://pig.apache.org/docs/r0.12.0/perf.html#hash-based-aggregation |
| # |
| # pig.exec.mapPartAgg=false |
| # pig.exec.mapPartAgg.minReduction=10 |
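
# For example (illustrative; the script name is a placeholder), turn on
# in-mapper aggregation for one run and accept a smaller reduction factor:
#
# pig -Dpig.exec.mapPartAgg=true -Dpig.exec.mapPartAgg.minReduction=3 group_heavy.pig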
| |
| # |
| # === Control how many reducers are used. |
| # |
| |
| # Estimate number of reducers naively using a fixed amount of data per |
| # reducer. Optimally, you have both fewer reducers than available reduce slots, |
| # and reducers that are neither getting too little data (less than a half-GB or |
| # so) nor too much data (more than 2-3 times the reducer child process max heap |
| # size). The default of 1000000000 (about 1GB) is probably low for a production |
| # cluster -- however it's much worse to set this too high (reducers spill many |
| # times over in group-sort) than too low (delay waiting for reduce slots). |
| # |
| # pig.exec.reducers.bytes.per.reducer=1000000000 |
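
# Back-of-envelope example: with the default of 1000000000 bytes (about 1GB)
# per reducer, a job reading 50GB of input would be estimated at
# 50000000000 / 1000000000 = 50 reducers, subject to the pig.exec.reducers.max
# cap below.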
| |
| # |
| # Don't ever use more than this many reducers. (default: 999) |
| # |
| # pig.exec.reducers.max=999 |
| |
| # |
| # === Local mode for small jobs |
| # |
| |
| # EXPERIMENTAL: Use local mode for small jobs? If true, jobs with input data |
| # size smaller than pig.auto.local.input.maxbytes bytes and one or no reducers |
| # are run in local mode, which is much faster. Note that file paths are still |
| # interpreted as pig.exectype implies. |
| # |
| # * true (recommended): allow local mode for small jobs, which is much faster. |
| # * false (default): always use pig.exectype. |
| # |
| # pig.auto.local.enabled=false |
| |
| # |
# Definition of a small job for the pig.auto.local.enabled feature. Only jobs
# with less than this many bytes of input are candidates to run locally.
# (default: 100000000 bytes, about 100MB)
| # |
| # pig.auto.local.input.maxbytes=100000000 |
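
# For example (illustrative; the script name is a placeholder), allow local
# mode for jobs reading under about 500MB:
#
# pig -Dpig.auto.local.enabled=true -Dpig.auto.local.input.maxbytes=500000000 small_input.pig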
| |
| |
| # |
# Should Pig use hadoop's BZip2Codec for bzip2 input? (for PigStorage and TextLoader)
# Only available for hadoop 2.X and later; ignored for earlier versions. (default: true)
| # |
| # pig.bzip.use.hadoop.inputformat=true |
| |
| |
| ############################################################################ |
| # |
| # Security Features |
| # |
| |
| # Comma-delimited list of commands/operators that are disallowed. This security |
| # feature can be used by administrators to block use of certain commands by |
| # users. |
| # |
| # * <blank> (default): all commands and operators are allowed. |
| # * fs,set (for example): block all filesystem commands and config changes from pig scripts. |
| # |
| # pig.blacklist= |
| # pig.blacklist=fs,set |
| |
# Comma-delimited list of the only commands/operators that are allowed. This
# security feature can be used by administrators to restrict users to a
# specific set of commands.
| # |
| # * <blank> (default): all commands and operators not on the pig.blacklist are allowed. |
# * load,store,filter,group: only allow LOAD, STORE, FILTER, and GROUP
#   in pig scripts. All other commands and operators will fail.
| # |
| # pig.whitelist= |
| # pig.whitelist=load,store,filter,group |
| |
| ##################################################################### |
| # |
| # Advanced Site-specific Customizations |
| # |
| |
| # Remove intermediate output files? |
| # |
| # * true (default, recommended): remove the files |
| # * false: do NOT remove the files. You must clean them up yourself. |
| # |
# Keeping them is useful for advanced debugging, but can be dangerous -- they
# are not cleaned up automatically. Inspect the intermediate outputs with
| # |
| # LOAD '/path/to/tmp/file' USING org.apache.pig.impl.io.TFileStorage(); |
| # |
| # (Or ...SequenceFileInterStorage if pig.tmpfilecompression.storage is seqfile) |
| # |
| # pig.delete.temp.files=true |
| |
| # EXPERIMENTAL: A Pig Progress Notification Listener (PPNL) lets you wire pig's |
| # progress into your visibility stack. To use a PPNL, supply the fully qualified |
| # class name of a PPNL implementation. Note that only one PPNL can be set up, so |
| # if you need several, write a PPNL that will chain them. |
| # |
| # See https://github.com/twitter/ambrose for a pretty awesome one of these |
| # |
| # pig.notification.listener=<fully qualified class name of a PPNL implementation> |
| |
| # String argument to pass to your PPNL constructor (optional). Only a single |
| # string value is allowed. (default none) |
| # |
| # pig.notification.listener.arg=<somevalue> |
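
# For example (the class name, argument, and URL are hypothetical -- consult
# your PPNL's documentation for the real ones):
#
# pig.notification.listener=com.example.monitoring.ChainedPPNL
# pig.notification.listener.arg=http://dashboard.example.com/collect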
| |
| # EXPERIMENTAL: Class invoked to estimate the number of reducers to use. |
| # (default: org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.InputSizeReducerEstimator) |
| # |
| # If you don't know how or why to write a PigReducerEstimator, you're unlikely |
| # to use this. By default, the naive mapReduceLayer.InputSizeReducerEstimator is |
| # used, but you can specify anything implementing the interface |
| # org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigReducerEstimator |
| # |
| # pig.exec.reducer.estimator=<fully qualified class name of a PigReducerEstimator implementation> |
| |
| # Optional String argument to pass to your PigReducerEstimator. (default: none; |
| # a single String argument is allowed). |
| # |
| # pig.exec.reducer.estimator.arg=<somevalue> |
| |
# Class invoked to report the size of reducer output. By default, the reducers'
# output is computed as the total size of output files. But not every storage is
# file-based, and so this logic can be replaced by implementing the interface
# org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigStatsOutputSizeReader
# If you need to register more than one reader, you can register them as a comma
# separated list. Every reader implements a boolean supports(POStore sto) method;
# when more than one reader is registered, they are consulted in order, and the
# first one whose supports() method returns true will be used.
| # |
| # pig.stats.output.size.reader=<fully qualified class name of a PigStatsOutputSizeReader implementation> |
| # pig.stats.output.size.reader.unsupported=<comma separated list of StoreFuncs that are not supported by this reader> |
| |
| # By default, Pig retrieves TaskReports for every launched task to compute |
| # various job statistics. But this can cause OOM if the number of tasks is |
# large. In such cases, you can disable it by setting this property to true.
| # pig.stats.notaskreport=false |
| |
| # |
# Override hadoop configs programmatically
#
# By default, Pig expects hadoop configs (hadoop-site.xml and core-site.xml)
# to be present on the classpath. There are cases when these configs need to
# be passed programmatically, such as while using the PigServer API. In such
# cases, you can override hadoop configs by setting the property
# "pig.use.overriden.hadoop.configs".
#
# When this property is set to true, Pig does not look for hadoop configs in
# the classpath and instead picks them up from the Properties/Configuration
# object passed to it.
| # |
| # pig.use.overriden.hadoop.configs=false |
| |
| # Implied LoadFunc for the LOAD operation when no USING clause is |
| # present. Supply the fully qualified class name of a LoadFunc |
| # implementation. Note: setting this means you will have to modify most code |
| # brought in from elsewhere on the web, as people generally omit the USING |
| # clause for TSV files. |
| # |
| # * org.apache.pig.builtin.PigStorage (default): the traditional tab-separated-values LoadFunc |
| # * my.custom.udfcollection.MyCustomLoadFunc (for example): use MyCustomLoadFunc instead |
| # |
| # pig.default.load.func=<fully qualified class name of a LoadFunc implementation> |
| |
| # The implied StoreFunc for STORE operations with no USING clause. Supply the |
| # fully qualified class name of a StoreFunc implementation. |
| # |
| # * org.apache.pig.builtin.PigStorage (default): the traditional tab-separated-values StoreFunc. |
| # * my.custom.udfcollection.MyCustomStoreFunc (for example): use MyCustomStoreFunc instead |
| # |
| # pig.default.store.func=<fully qualified class name of a StoreFunc implementation> |
| |
| # Recover jobs when the application master is restarted? (default: false). This |
| # is a Hadoop 2 specific property; enable it to take advantage of AM recovery. |
| # |
| # pig.output.committer.recovery.support=true |
| |
# Should scripts check to prevent multiple stores writing to the same location?
# (default: false) When set to true, such a script stops execution right away.
#
# pig.location.check.strict=false
| |
# In addition to the fs-style commands (rm, ls, etc) Pig can now execute
# SQL-style DDL commands, e.g. "sql create table pig_test(name string, age int)".
| # The only implemented backend is hcat, and luckily that's also the default. |
| # |
| # pig.sql.type=hcat |
| |
| # Path to the hcat executable, for use with pig.sql.type=hcat (default: null) |
| # |
# hcat.bin=/usr/local/hcat/bin/hcat
| |
# Enable the ATS hook to log the Pig-specific ATS entry; disable only when an
# ATS server is not deployed. (default: true)
pig.ats.enabled=true
| |
| ########################################################################### |
| # |
| # Overrides for extreme environments |
| # |
| # (Most people won't have to adjust these parameters) |
| # |
| |
| |
# Limit the pig script length placed in the jobconf xml. (default: 10240)
| # Extremely long queries can waste space in the JobConf; since its contents are |
| # only advisory, the default is fine unless you are retaining it for forensics. |
| # |
| # pig.script.max.size=10240 |
| |
| # Disable use of counters by Pig. Note that the word 'counter' is singular here. |
| # |
| # * false (default, recommended): do NOT disable counters. |
# * true: disable counters. Set this to true only when your Pig job would
#   otherwise die from using more counters than the hadoop-configured limit
| # |
| # pig.disable.counter=true |
| |
| # Sample size (per-mapper, in number of rows) the ORDER..BY operation's |
| # RandomSampleLoader uses to estimate how your data should be |
| # partitioned. (default, recommended: 100 rows per task) Increase this if you |
| # have exceptionally large input splits and are unhappy with the reducer skew. |
| # |
| # pig.random.sampler.sample.size=100 |
| |
| # Process an entire script at once, reducing the amount of work and number of |
| # tasks? (default, recommended: true) See http://pig.apache.org/docs/r0.12.0/perf.html#multi-query-execution |
| # |
| # MultiQuery optimization is very useful, and so the recommended default is |
# true. You may find that a script fails to compile under MultiQuery. If so,
| # disable it at runtime: |
| # |
| # pig -no_multiquery script_that_makes_pig_sad.pig |
| # |
| # opt.multiquery=true |
| |
| # For small queries, fetch data directly from the HDFS. (default, recommended: |
# true). If you want to force Pig to launch an MR job, for example when you're
| # testing a live cluster, disable with the -N option. See PIG-3642. |
| # |
| # opt.fetch=true |
| |
| ######################################################################### |
| # |
| # Error Handling Properties |
| # |
# By default, a Pig job fails immediately on encountering an error while
# writing Tuples in a Store. If you want Pig to tolerate a certain number of
# errors before failing, you can set this property. If the property is set to
# true and the StoreFunc implements ErrorHandling, Pig will allow configurable
# errors based on the OutputErrorHandler implementation.
# pig.error-handling.enabled = false
| # |
# Controls the minimum number of error records allowed for a store
# pig.error-handling.min.error.records = 0
#
# Set the threshold for the percentage of error records allowed
# pig.error-handling.error.threshold = 0.0f
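
# For example (illustrative values; how the two limits interact is defined by
# the OutputErrorHandler implementation):
#
# pig.error-handling.enabled = true
# pig.error-handling.min.error.records = 100
# pig.error-handling.error.threshold = 0.001f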
| |
| ########################################################################### |
| # |
| # Streaming properties |
| # |
| |
# Define which properties will be set in the streaming environment. Set this
# property to a comma-delimited list of property names, and those properties
# will be set in the environment.
#
# pig.streaming.environment=<comma-delimited list of properties>
| |
# Specify a comma-delimited list of local files to ship to the distributed
# cache for streaming jobs.
| # |
| # pig.streaming.ship.files=<comma-delimited list of local files> |
| |
# Specify a comma-delimited list of remote files to cache in the distributed
# cache for streaming jobs.
| # |
| # pig.streaming.cache.files=<comma-delimited list of remote files> |
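
# For example (paths are hypothetical), ship a local script and cache a remote
# dictionary for a streaming job:
#
# pig.streaming.ship.files=/usr/local/bin/tokenize.py
# pig.streaming.cache.files=hdfs://nn.mydomain.com:9020/data/dictionary.txt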
| |
# Specify the python command to be used for python streaming udfs. By default,
# python is used, but you can override it with a non-default version such as
# python2.7.
| # |
| # pig.streaming.udf.python.command=python |
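
# For example (the interpreter path is illustrative):
#
# pig.streaming.udf.python.command=/usr/bin/python2.7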
| |
| ########################################################################### |
| # |
| # Tez specific properties |
| # |
| |
# Enable auto/grace parallelism in tez. These default to true and should be
# left enabled unless you encounter a bug in automatic parallelism. If
# pig.tez.auto.parallelism is set to false, 1 is used as the default parallelism.
| |
| #pig.tez.auto.parallelism=true |
| #pig.tez.grace.parallelism=true |
| |
# Union optimization (pig.tez.opt.union=true) in tez uses vertex groups to store
# output from different vertices into one final output location. If a
# StoreFunc's OutputCommitter does not work with multiple vertices writing to
# the same location, you can disable union optimization just for that StoreFunc;
# refer to PIG-4649. You can also specify a whitelist of StoreFuncs that are
# known to work with multiple vertices writing to the same location, instead of
# a blacklist.
| |
| #pig.tez.opt.union.unsupported.storefuncs=org.apache.hcatalog.pig.HCatStorer,org.apache.hive.hcatalog.pig.HCatStorer |
| #pig.tez.opt.union.supported.storefuncs= |
| |
| |
# For the LoadFuncs specified here, Pig reads from the data source only once
# during a sort, instead of loading once for sampling and again for
# partitioning. Used to avoid hitting external non-filesystem data sources like
# HBase and Accumulo twice.
| |
| pig.sort.readonce.loadfuncs=org.apache.pig.backend.hadoop.hbase.HBaseStorage,org.apache.pig.backend.hadoop.accumulo.AccumuloStorage |
| |
# If set, Pig will override tez.am.launch.cmd-opts and tez.am.resource.memory.mb
# to optimal values even if they are set to different values. Default is true.
| #pig.tez.configure.am.memory=false |