hivesterix/hivesterix-dist/src/test/resources/optimizerts/hive/conf/hive-default.xml - asterixdb - Git at Google

 <?xml version="1.0"?>
 <!--
  ! Copyright 2009-2013 by The Regents of the University of California
  ! Licensed under the Apache License, Version 2.0 (the "License");
  ! you may not use this file except in compliance with the License.
  ! you may obtain a copy of the License from
  !
  !     http://www.apache.org/licenses/LICENSE-2.0
  !
  ! Unless required by applicable law or agreed to in writing, software
  ! distributed under the License is distributed on an "AS IS" BASIS,
  ! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ! See the License for the specific language governing permissions and
  ! limitations under the License.
  !-->
 <?xml-stylesheet type="text/xsl" href="configuration.xsl"?>

 <configuration>

 	<!-- Hive Configuration can either be stored in this file or in the hadoop
 		configuration files -->
 	<!-- that are implied by Hadoop setup variables. -->
 	<!-- Aside from Hadoop setup variables - this file is provided as a convenience
 		so that Hive -->
 	<!-- users do not have to edit hadoop configuration files (that may be managed
 		as a centralized -->
 	<!-- resource). -->

 	<!-- Hive Execution Parameters -->
 	<property>
 		<name>mapred.reduce.tasks</name>
 		<value>-1</value>
 		<description>The default number of reduce tasks per job. Typically set
 			to a prime close to the number of available hosts. Ignored when
 			mapred.job.tracker is "local". Hadoop sets this to 1 by default,
 			whereas Hive uses -1 as its default value.
 			By setting this property to
 			-1, Hive will automatically figure out what
 			should be the number of
 			reducers.
   </description>
 	</property>

 	<property>
 		<name>hive.hyracks.host</name>
 		<value>127.0.0.1</value>
 	</property>

 	<property>
 		<name>hive.hyracks.port</name>
 		<value>13099</value>
 	</property>

 	<property>
 		<name>hive.hyracks.app</name>
 		<value>hivesterix</value>
 	</property>


 	<property>
 		<name>hive.hyracks.parrallelism</name>
 		<value>2</value>
 	</property>

 	<property>
 		<name>hive.algebricks.groupby.external</name>
 		<value>true</value>
 	</property>

 	<property>
 		<name>hive.algebricks.groupby.external.memory</name>
 		<value>3072</value>
 	</property>

 	<property>
 		<name>hive.algebricks.sort.memory</name>
 		<value>3072</value>
 	</property>

 	<property>
 		<name>hive.algebricks.framesize</name>
 		<value>768</value>
 	</property>

 	<property>
 		<name>hive.exec.reducers.bytes.per.reducer</name>
 		<value>1000000000</value>
 		<description>Size per reducer. The default is 1G, i.e. if the input size
 			is 10G, it will use 10 reducers.</description>
 	</property>

 	<property>
 		<name>hive.exec.reducers.max</name>
 		<value>999</value>
 		<description>Maximum number of reducers that will be used. If the
 			value of the configuration parameter mapred.reduce.tasks is
 			negative, Hive will use this value as the maximum number of
 			reducers when automatically determining the number of reducers.</description>
 	</property>

 	<property>
 		<name>hive.exec.scratchdir</name>
 		<value>/tmp/hive-${user.name}</value>
 		<description>Scratch space for Hive jobs</description>
 	</property>

 	<property>
 		<name>hive.test.mode</name>
 		<value>false</value>
 		<description>whether hive is running in test mode. If yes, it turns on
 			sampling and prefixes the output tablename</description>
 	</property>

 	<property>
 		<name>hive.test.mode.prefix</name>
 		<value>test_</value>
 		<description>if hive is running in test mode, prefixes the output
 			table by this string</description>
 	</property>

 	<!-- If the input table is not bucketed, the denominator of the tablesample
 		is determined by the parameter below -->
 	<!-- For example, the following query: -->
 	<!-- INSERT OVERWRITE TABLE dest -->
 	<!-- SELECT col1 from src -->
 	<!-- would be converted to -->
 	<!-- INSERT OVERWRITE TABLE test_dest -->
 	<!-- SELECT col1 from src TABLESAMPLE (BUCKET 1 out of 32 on rand(1)) -->
 	<property>
 		<name>hive.test.mode.samplefreq</name>
 		<value>32</value>
 		<description>if hive is running in test mode and table is not
 			bucketed, sampling frequency</description>
 	</property>

 	<property>
 		<name>hive.test.mode.nosamplelist</name>
 		<value></value>
 		<description>if hive is running in test mode, don't sample the above
 			comma-separated list of tables</description>
 	</property>

 	<property>
 		<name>hive.metastore.local</name>
 		<value>true</value>
 		<description>controls whether to connect to a remote metastore server or
 			open a new metastore server in the Hive Client JVM</description>
 	</property>

 	<property>
 		<name>javax.jdo.option.ConnectionURL</name>
 		<value>jdbc:derby:;databaseName=metastore_db;create=true</value>
 		<description>JDBC connect string for a JDBC metastore</description>
 	</property>

 	<property>
 		<name>javax.jdo.option.ConnectionDriverName</name>
 		<value>org.apache.derby.jdbc.EmbeddedDriver</value>
 		<description>Driver class name for a JDBC metastore</description>
 	</property>

 	<property>
 		<name>javax.jdo.PersistenceManagerFactoryClass</name>
 		<value>org.datanucleus.jdo.JDOPersistenceManagerFactory</value>
 		<description>class implementing the jdo persistence</description>
 	</property>

 	<property>
 		<name>datanucleus.connectionPoolingType</name>
 		<value>DBCP</value>
 		<description>Uses a DBCP connection pool for JDBC metastore
 		</description>
 	</property>

 	<property>
 		<name>javax.jdo.option.DetachAllOnCommit</name>
 		<value>true</value>
 		<description>detaches all objects from session so that they can be
 			used after transaction is committed</description>
 	</property>

 	<property>
 		<name>javax.jdo.option.NonTransactionalRead</name>
 		<value>true</value>
 		<description>reads outside of transactions</description>
 	</property>

 	<property>
 		<name>javax.jdo.option.ConnectionUserName</name>
 		<value>APP</value>
 		<description>username to use against metastore database</description>
 	</property>

 	<property>
 		<name>javax.jdo.option.ConnectionPassword</name>
 		<value>mine</value>
 		<description>password to use against metastore database</description>
 	</property>

 	<property>
 		<name>datanucleus.validateTables</name>
 		<value>false</value>
 		<description>validates existing schema against code. turn this on if
 			you want to verify existing schema </description>
 	</property>

 	<property>
 		<name>datanucleus.validateColumns</name>
 		<value>false</value>
 		<description>validates existing schema against code. turn this on if
 			you want to verify existing schema </description>
 	</property>

 	<property>
 		<name>datanucleus.validateConstraints</name>
 		<value>false</value>
 		<description>validates existing schema against code. turn this on if
 			you want to verify existing schema </description>
 	</property>

 	<property>
 		<name>datanucleus.storeManagerType</name>
 		<value>rdbms</value>
 		<description>metadata store type</description>
 	</property>

 	<property>
 		<name>datanucleus.autoCreateSchema</name>
 		<value>true</value>
 		<description>creates necessary schema on a startup if one doesn't
 			exist. set this to false, after creating it once</description>
 	</property>

 	<property>
 		<name>datanucleus.autoStartMechanismMode</name>
 		<value>checked</value>
 		<description>throw exception if metadata tables are incorrect
 		</description>
 	</property>

 	<property>
 		<name>datanucleus.transactionIsolation</name>
 		<value>read-committed</value>
 		<description>Default transaction isolation level for identity
 			generation. </description>
 	</property>

 	<property>
 		<name>datanucleus.cache.level2</name>
 		<value>false</value>
 		<description>Use a level 2 cache. Turn this off if metadata is changed
 			independently of hive metastore server</description>
 	</property>

 	<property>
 		<name>datanucleus.cache.level2.type</name>
 		<value>SOFT</value>
 		<description>SOFT=soft reference based cache, WEAK=weak reference
 			based cache.</description>
 	</property>

 	<property>
 		<name>datanucleus.identifierFactory</name>
 		<value>datanucleus</value>
 		<description>Name of the identifier factory to use when generating
 			table/column names etc. 'datanucleus' is used for backward
 			compatibility</description>
 	</property>

 	<property>
 		<name>hive.metastore.warehouse.dir</name>
 		<value>/tmp/hivesterix</value>
 		<description>location of default database for the warehouse
 		</description>
 	</property>

 	<property>
 		<name>hive.metastore.connect.retries</name>
 		<value>5</value>
 		<description>Number of retries while opening a connection to metastore
 		</description>
 	</property>

 	<property>
 		<name>hive.metastore.rawstore.impl</name>
 		<value>org.apache.hadoop.hive.metastore.ObjectStore</value>
 		<description>Name of the class that implements the
 			org.apache.hadoop.hive.metastore.rawstore interface. This class is
 			used for storage and retrieval of raw metadata objects such as table,
 			database</description>
 	</property>

 	<property>
 		<name>hive.default.fileformat</name>
 		<value>TextFile</value>
 		<description>Default file format for CREATE TABLE statement. Options
 			are TextFile and SequenceFile. Users can explicitly say CREATE TABLE
 			... STORED AS &lt;TEXTFILE|SEQUENCEFILE&gt; to override</description>
 	</property>

 	<property>
 		<name>hive.fileformat.check</name>
 		<value>true</value>
 		<description>Whether to check file format or not when loading data
 			files</description>
 	</property>

 	<property>
 		<name>hive.map.aggr</name>
 		<value>true</value>
 		<description>Whether to use map-side aggregation in Hive Group By
 			queries</description>
 	</property>

 	<property>
 		<name>hive.groupby.skewindata</name>
 		<value>false</value>
 		<description>Whether there is skew in data to optimize group by
 			queries</description>
 	</property>

 	<property>
 		<name>hive.groupby.mapaggr.checkinterval</name>
 		<value>100000</value>
 		<description>Number of rows after which a size check of the grouping
 			keys/aggregation classes is performed</description>
 	</property>

 	<property>
 		<name>hive.mapred.local.mem</name>
 		<value>0</value>
 		<description>For local mode, memory of the mappers/reducers
 		</description>
 	</property>

 	<property>
 		<name>hive.map.aggr.hash.percentmemory</name>
 		<value>0.5</value>
 		<description>Portion of total memory to be used by map-side group
 			aggregation hash table</description>
 	</property>

 	<property>
 		<name>hive.map.aggr.hash.min.reduction</name>
 		<value>0.5</value>
 		<description>Hash aggregation will be turned off if the ratio between
 			hash
 			table size and input rows is bigger than this number. Set to 1 to
 			make
 			sure
 			hash aggregation is never turned off.</description>
 	</property>

 	<property>
 		<name>hive.optimize.cp</name>
 		<value>true</value>
 		<description>Whether to enable column pruner</description>
 	</property>

 	<property>
 		<name>hive.optimize.ppd</name>
 		<value>true</value>
 		<description>Whether to enable predicate pushdown</description>
 	</property>

 	<property>
 		<name>hive.optimize.pruner</name>
 		<value>true</value>
 		<description>Whether to enable the new partition pruner which depends
 			on predicate pushdown. If this is disabled,
 			the old partition pruner
 			which is based on AST will be enabled.
 		</description>
 	</property>

 	<property>
 		<name>hive.optimize.groupby</name>
 		<value>true</value>
 		<description>Whether to enable the bucketed group by from bucketed
 			partitions/tables.</description>
 	</property>

 	<property>
 		<name>hive.join.emit.interval</name>
 		<value>1000</value>
 		<description>How many rows in the right-most join operand Hive should
 			buffer before emitting the join result. </description>
 	</property>

 	<property>
 		<name>hive.join.cache.size</name>
 		<value>25000</value>
 		<description>How many rows in the joining tables (except the streaming
 			table) should be cached in memory. </description>
 	</property>

 	<property>
 		<name>hive.mapjoin.bucket.cache.size</name>
 		<value>100</value>
 		<description>How many values for each key in the map-joined table
 			should be cached in memory. </description>
 	</property>

 	<property>
 		<name>hive.mapjoin.maxsize</name>
 		<value>100000</value>
 		<description>Maximum # of rows of the small table that can be handled
 			by map-side join. If the size is reached and hive.task.progress is
 			set, a fatal error counter is set and the job will be killed.
 		</description>
 	</property>

 	<property>
 		<name>hive.mapjoin.cache.numrows</name>
 		<value>25000</value>
 		<description>How many rows should be cached by jdbm for map join.
 		</description>
 	</property>

 	<property>
 		<name>hive.optimize.skewjoin</name>
 		<value>false</value>
 		<description>Whether to enable skew join optimization. </description>
 	</property>

 	<property>
 		<name>hive.skewjoin.key</name>
 		<value>100000</value>
 		<description>Determine if we get a skew key in join. If we see more
 			than the specified number of rows with the same key in join operator,
 			we treat the key as a skew join key. </description>
 	</property>

 	<property>
 		<name>hive.skewjoin.mapjoin.map.tasks</name>
 		<value>10000</value>
 		<description> Determine the number of map tasks used in the follow up
 			map join job for a skew join. It should be used together with
 			hive.skewjoin.mapjoin.min.split to perform a fine grained control.
 		</description>
 	</property>

 	<property>
 		<name>hive.skewjoin.mapjoin.min.split</name>
 		<value>33554432</value>
 		<description> Determine the maximum number of map tasks used in the
 			follow up map join job for a skew join by specifying the minimum
 			split size. It should be used together with
 			hive.skewjoin.mapjoin.map.tasks to perform a fine grained control.</description>
 	</property>

 	<property>
 		<name>hive.mapred.mode</name>
 		<value>nonstrict</value>
 		<description>The mode in which the hive operations are being
 			performed. In strict mode, some risky queries are not allowed to run
 		</description>
 	</property>

 	<property>
 		<name>hive.exec.script.maxerrsize</name>
 		<value>100000</value>
 		<description>Maximum number of bytes a script is allowed to emit to
 			standard error (per map-reduce task). This prevents runaway scripts
 			from filling log partitions to capacity </description>
 	</property>

 	<property>
 		<name>hive.exec.script.allow.partial.consumption</name>
 		<value>false</value>
 		<description> When enabled, this option allows a user script to exit
 			successfully without consuming all the data from the standard input.
 		</description>
 	</property>

 	<property>
 		<name>hive.script.operator.id.env.var</name>
 		<value>HIVE_SCRIPT_OPERATOR_ID</value>
 		<description> Name of the environment variable that holds the unique
 			script operator ID in the user's transform function (the custom
 			mapper/reducer that the user has specified in the query)
 		</description>
 	</property>

 	<property>
 		<name>hive.exec.compress.output</name>
 		<value>false</value>
 		<description> This controls whether the final outputs of a query (to a
 			local/hdfs file or a hive table) are compressed. The compression codec
 			and other options are determined from hadoop config variables
 			mapred.output.compress* </description>
 	</property>

 	<property>
 		<name>hive.exec.compress.intermediate</name>
 		<value>false</value>
 		<description> This controls whether intermediate files produced by
 			hive between multiple map-reduce jobs are compressed. The compression
 			codec and other options are determined from hadoop config variables
 			mapred.output.compress* </description>
 	</property>

 	<property>
 		<name>hive.exec.parallel</name>
 		<value>false</value>
 		<description>Whether to execute jobs in parallel</description>
 	</property>

 	<property>
 		<name>hive.exec.parallel.thread.number</name>
 		<value>8</value>
 		<description>How many jobs at most can be executed in parallel
 		</description>
 	</property>

 	<property>
 		<name>hive.hwi.war.file</name>
 		<value>lib\hive-hwi-0.7.0.war</value>
 		<description>This sets the path to the HWI war file, relative to
 			${HIVE_HOME}. </description>
 	</property>

 	<property>
 		<name>hive.hwi.listen.host</name>
 		<value>0.0.0.0</value>
 		<description>This is the host address the Hive Web Interface will
 			listen on</description>
 	</property>

 	<property>
 		<name>hive.hwi.listen.port</name>
 		<value>9999</value>
 		<description>This is the port the Hive Web Interface will listen on
 		</description>
 	</property>

 	<property>
 		<name>hive.exec.pre.hooks</name>
 		<value></value>
 		<description>Pre Execute Hook for Tests</description>
 	</property>

 	<property>
 		<name>hive.merge.mapfiles</name>
 		<value>true</value>
 		<description>Merge small files at the end of a map-only job
 		</description>
 	</property>

 	<property>
 		<name>hive.merge.mapredfiles</name>
 		<value>false</value>
 		<description>Merge small files at the end of a map-reduce job
 		</description>
 	</property>

 	<property>
 		<name>hive.heartbeat.interval</name>
 		<value>1000</value>
 		<description>Send a heartbeat after this interval - used by mapjoin
 			and filter operators</description>
 	</property>

 	<property>
 		<name>hive.merge.size.per.task</name>
 		<value>256000000</value>
 		<description>Size of merged files at the end of the job</description>
 	</property>

 	<property>
 		<name>hive.merge.size.smallfiles.avgsize</name>
 		<value>16000000</value>
 		<description>When the average output file size of a job is less than
 			this number, Hive will start an additional map-reduce job to merge
 			the output files into bigger files. This is only done for map-only
 			jobs if hive.merge.mapfiles is true, and for map-reduce jobs if
 			hive.merge.mapredfiles is true.</description>
 	</property>

 	<property>
 		<name>hive.script.auto.progress</name>
 		<value>false</value>
 		<description>Whether Hive Transform/Map/Reduce Clause should
 			automatically send progress information to TaskTracker to avoid the
 			task getting killed because of inactivity. Hive sends progress
 			information when the script is outputting to stderr. This option
 			removes the need of periodically producing stderr messages, but users
 			should be cautious because this may prevent infinite loops in the
 			scripts from being killed by TaskTracker.  </description>
 	</property>

 	<property>
 		<name>hive.script.serde</name>
 		<value>org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe</value>
 		<description>The default serde for transmitting input data to and
 			reading output data from the user scripts. </description>
 	</property>

 	<property>
 		<name>hive.script.recordreader</name>
 		<value>org.apache.hadoop.hive.ql.exec.TextRecordReader</value>
 		<description>The default record reader for reading data from the user
 			scripts. </description>
 	</property>

 	<property>
 		<name>hive.script.recordwriter</name>
 		<value>org.apache.hadoop.hive.ql.exec.TextRecordWriter</value>
 		<description>The default record writer for writing data to the user
 			scripts. </description>
 	</property>

 	<property>
 		<name>hive.input.format</name>
 		<value>org.apache.hadoop.hive.ql.io.HiveInputFormat</value>
 		<description>The default input format, if it is not specified, the
 			system assigns it. It is set to HiveInputFormat for hadoop versions
 			17, 18 and 19, whereas it is set to CombinedHiveInputFormat for
 			hadoop 20. The user can always overwrite it - if there is a bug in
 			CombinedHiveInputFormat, it can always be manually set to
 			HiveInputFormat. </description>
 	</property>

 	<property>
 		<name>hive.udtf.auto.progress</name>
 		<value>false</value>
 		<description>Whether Hive should automatically send progress
 			information to TaskTracker when using UDTF's to prevent the task
 			getting killed because of inactivity. Users should be cautious
 			because this may prevent TaskTracker from killing tasks with infinite
 			loops.  </description>
 	</property>

 	<property>
 		<name>hive.mapred.reduce.tasks.speculative.execution</name>
 		<value>true</value>
 		<description>Whether speculative execution for reducers should be
 			turned on. </description>
 	</property>

 	<property>
 		<name>hive.exec.counters.pull.interval</name>
 		<value>1000</value>
 		<description>The interval with which to poll the JobTracker for the
 			counters of the running job. The smaller it is the more load there
 			will be on the jobtracker, the higher it is the less granular the
 			captured data will be.</description>
 	</property>

 	<property>
 		<name>hive.enforce.bucketing</name>
 		<value>false</value>
 		<description>Whether bucketing is enforced. If true, while inserting
 			into the table, bucketing is enforced. </description>
 	</property>

 	<property>
 		<name>hive.enforce.sorting</name>
 		<value>false</value>
 		<description>Whether sorting is enforced. If true, while inserting
 			into the table, sorting is enforced. </description>
 	</property>

 	<property>
 		<name>hive.metastore.ds.connection.url.hook</name>
 		<value></value>
 		<description>Name of the hook to use for retrieving the JDO connection
 			URL. If empty, the value in javax.jdo.option.ConnectionURL is used
 		</description>
 	</property>

 	<property>
 		<name>hive.metastore.ds.retry.attempts</name>
 		<value>1</value>
 		<description>The number of times to retry a metastore call if there
 			is a connection error</description>
 	</property>

 	<property>
 		<name>hive.metastore.ds.retry.interval</name>
 		<value>1000</value>
 		<description>The number of milliseconds between metastore retry
 			attempts</description>
 	</property>

 	<property>
 		<name>hive.metastore.server.min.threads</name>
 		<value>200</value>
 		<description>Minimum number of worker threads in the Thrift server's
 			pool.</description>
 	</property>

 	<property>
 		<name>hive.metastore.server.max.threads</name>
 		<value>100000</value>
 		<description>Maximum number of worker threads in the Thrift server's
 			pool.</description>
 	</property>

 	<property>
 		<name>hive.metastore.server.tcp.keepalive</name>
 		<value>true</value>
 		<description>Whether to enable TCP keepalive for the metastore server.
 			Keepalive will prevent accumulation of half-open connections.
 		</description>
 	</property>

 	<property>
 		<name>hive.optimize.reducededuplication</name>
 		<value>true</value>
 		<description>Remove extra map-reduce jobs if the data is already
 			clustered by the same key which needs to be used again. This should
 			always be set to true. Since it is a new feature, it has been made
 			configurable.</description>
 	</property>

 	<property>
 		<name>hive.exec.dynamic.partition</name>
 		<value>false</value>
 		<description>Whether or not to allow dynamic partitions in DML/DDL.
 		</description>
 	</property>

 	<property>
 		<name>hive.exec.dynamic.partition.mode</name>
 		<value>strict</value>
 		<description>In strict mode, the user must specify at least one static
 			partition in case the user accidentally overwrites all partitions.
 		</description>
 	</property>

 	<property>
 		<name>hive.exec.max.dynamic.partitions</name>
 		<value>1000</value>
 		<description>Maximum number of dynamic partitions allowed to be
 			created in total.</description>
 	</property>

 	<property>
 		<name>hive.exec.max.dynamic.partitions.pernode</name>
 		<value>100</value>
 		<description>Maximum number of dynamic partitions allowed to be
 			created in each mapper/reducer node.</description>
 	</property>

 	<property>
 		<name>hive.default.partition.name</name>
 		<value>__HIVE_DEFAULT_PARTITION__</value>
 		<description>The default partition name in case the dynamic partition
 			column value is null/empty string or any other value that cannot be
 			escaped. This value must not contain any special character used in
 			HDFS URI (e.g., ':', '%', '/' etc). The user has to be aware that the
 			dynamic partition value should not contain this value to avoid
 			confusion.</description>
 	</property>

 	<property>
 		<name>fs.har.impl</name>
 		<value>org.apache.hadoop.hive.shims.HiveHarFileSystem</value>
 		<description>The implementation for accessing Hadoop Archives. Note
 			that this won't be applicable to Hadoop versions earlier than 0.20
 		</description>
 	</property>

 	<property>
 		<name>hive.archive.enabled</name>
 		<value>false</value>
 		<description>Whether archiving operations are permitted</description>
 	</property>

 	<property>
 		<name>hive.archive.har.parentdir.settable</name>
 		<value>false</value>
 		<description>In new Hadoop versions, the parent directory must be set
 			while
 			creating a HAR. Because this functionality is hard to detect
 			with just
 			version
 			numbers, this conf var needs to be set manually.
 		</description>
 	</property>

 	<!-- HBase Storage Handler Parameters -->

 	<property>
 		<name>hive.hbase.wal.enabled</name>
 		<value>true</value>
 		<description>Whether writes to HBase should be forced to the
 			write-ahead log. Disabling this improves HBase write performance at
 			the risk of lost writes in case of a crash.</description>
 	</property>

 	<property>
 		<name>hive.exec.drop.ignorenonexistent</name>
 		<value>true</value>
 		<description>If true, DROP TABLE does not report an error when the
 			table does not exist, i.e. drop table always works.</description>
 	</property>

 </configuration>