blob: 4479d61a42a76dc338cb635ef77ce3939b39d554 [file] [log] [blame]
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os

# Major Hive version of the minicluster, parsed from e.g. "3.1.3000" -> 3.
# Parse the whole component before the first '.' rather than just the first
# character so a future two-digit major version is still handled correctly.
hive_major_version = int(os.environ['IMPALA_HIVE_VERSION'].split('.')[0])
# True when the minicluster is kerberized (IMPALA_KERBERIZE=1).
kerberize = os.environ.get('IMPALA_KERBERIZE') == '1'
# Optional hive-site variant, e.g. 'changed_external_dir' or
# 'without_hms_config'; None when unset.
variant = os.environ.get('HIVE_VARIANT')

# The generated hive-site.xml contents. Later sections extend (or, for some
# variants, replace) these base settings.
CONFIG = {
  'dfs.replication': '3'
}
# General Hive configuration, applied for every variant.
# IMPALA-781: Impala doesn't support the default LazyBinaryColumnarSerde for RCFile.
CONFIG['hive.default.rcfile.serde'] = \
    'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe'
# IMPALA-7154: Some of the HMS operations on S3 can take long time and any timeouts
# on the HMS Client side can result in flaky test failures.
# This is the time in seconds between successive retry attempts by the metastore
# client in case of failures. By default, the metastore client retries once based
# on the config value of hive.metastore.failure.retries.
CONFIG['hive.metastore.client.connect.retry.delay'] = '1'
# Metastore client timeout of 10 minutes.
CONFIG['hive.metastore.client.socket.timeout'] = '600'
CONFIG['hive.metastore.uris'] = 'thrift://${INTERNAL_LISTEN_HOST}:9083'
# Location of Hive per-query log files of the form: hive_job_log_<hive_query_id>.txt
CONFIG['hive.querylog.location'] = '${IMPALA_CLUSTER_LOGS_DIR}/hive'
CONFIG['hive.sentry.conf.url'] = \
    'file:///${IMPALA_HOME}/fe/src/test/resources/sentry-site.xml'
# Change back to NOSASL when HIVE-4232 is fixed.
# With NONE, Hive uses the plain SASL transport.
CONFIG['hive.server2.authentication'] = '${HIVE_S2_AUTH}'
# Disable user impersonation for HiveServer2 to avoid launch failure
# if username contains dots (IMPALA-6789).
CONFIG['hive.server2.enable.doAs'] = 'false'
# TODO(todd): should we be enabling stats autogather?
CONFIG['hive.stats.autogather'] = 'false'
CONFIG['hive.support.concurrency'] = 'true'

# This variant relocates only the external warehouse directory.
if variant == 'changed_external_dir':
  CONFIG['hive.metastore.warehouse.external.dir'] = \
      '${WAREHOUSE_LOCATION_PREFIX}/test-warehouse-external'

# HBase-related configs.
# Impala processes need to connect to zookeeper on INTERNAL_LISTEN_HOST for HBase.
CONFIG['hive.cluster.delegation.token.store.zookeeper.connectString'] = \
    '${INTERNAL_LISTEN_HOST}:2181'
if kerberize:
  # Kerberos identity used by HiveServer2 itself.
  CONFIG['hive.server2.authentication.kerberos.keytab'] = '${KRB5_KTNAME}'
  CONFIG['hive.server2.authentication.kerberos.principal'] = \
      '${MINIKDC_PRINC_HIVE}'
  # These are necessary to connect to Kerberized HBase from within Hive jobs.
  CONFIG['hbase.coprocessor.region.classes'] = \
      'org.apache.hadoop.hbase.security.token.TokenProvider'
  CONFIG['hbase.master.kerberos.principal'] = '${MINIKDC_PRINC_HBSE}'
  CONFIG['hbase.regionserver.kerberos.principal'] = '${MINIKDC_PRINC_HBSE}'
  CONFIG['hbase.security.authentication'] = 'kerberos'
  CONFIG['hbase.zookeeper.quorum'] = '${INTERNAL_LISTEN_HOST}'

# TODO: we currently don't kerberize the metastore. If we want to, we need to
# set:
#   hive.metastore.sasl.enabled
#   hive.metastore.kerberos.keytab.file
#   hive.metastore.kerberos.principal
# Enable Tez and ACID for Hive 3.
if hive_major_version >= 3:
  CONFIG['hive.tez.container.size'] = '512'
  CONFIG['hive.txn.manager'] = \
      'org.apache.hadoop.hive.ql.lockmgr.DbTxnManager'
  # We run YARN with Tez on the classpath directly.
  CONFIG['tez.ignore.lib.uris'] = 'true'
  CONFIG['tez.use.cluster.hadoop-libs'] = 'true'
  # Some of the tests change the columns in an incompatible manner
  # (eg. string to timestamp); this is disallowed by default in Hive-3, which
  # causes these tests to fail. We disable this behavior in the minicluster to
  # keep running the same tests on both hms-2 and hms-3.
  CONFIG['hive.metastore.disallow.incompatible.col.type.changes'] = 'false'
  # Group input splits to run in a small number of mappers, and merge small
  # files at the end of jobs if necessary, to be more similar to the legacy
  # MR execution defaults. This helps ensure that we produce the same
  # dataload results with Hive2-MR vs Hive3-Tez.
  #
  # NOTE: This currently doesn't seem to take effect on our pseudo-distributed
  # test cluster, because the hostname is 'localhost' and some Tez code path
  # gets triggered which ignores the min-size parameter. See TEZ-3310.
  CONFIG['tez.grouping.min-size'] = 256 * 1024 * 1024
  # Instead, we use post-process merging to make sure that we merge files
  # where possible at the end of jobs.
  # TODO(todd) re-evaluate whether this is necessary once TEZ-3310 is fixed
  # (see above).
  CONFIG['hive.merge.tezfiles'] = 'true'
  # Enable compaction workers. The compaction initiator is off by default
  # but configuring a worker thread allows manual compaction.
  CONFIG['hive.compactor.worker.threads'] = 4
  # Hive 3 now requires separate directories for managed vs external tables.
  # Since only transactional tables are considered managed, most tests run
  # against tables that are now considered external. So, use /test-warehouse
  # for the external directory so that most tests don't need to change their
  # paths. Data snapshots are built around populating /test-warehouse, so use
  # /test-warehouse/managed to allow that logic to remain the same.
  CONFIG['hive.metastore.warehouse.dir'] = \
      '${WAREHOUSE_LOCATION_PREFIX}/test-warehouse/managed'
  CONFIG['hive.metastore.warehouse.external.dir'] = \
      '${WAREHOUSE_LOCATION_PREFIX}/test-warehouse'
else:
  CONFIG['hive.metastore.event.listeners'] = \
      'org.apache.sentry.binding.metastore.SentrySyncHMSNotificationsPostEventListener'
  # HMS-2 based environments have a different set of expected configurations
  # for the event processor.
  CONFIG['hive.metastore.alter.notifications.basic'] = 'false'
  CONFIG['hive.metastore.notification.parameters.exclude.patterns'] = ''
  CONFIG['hive.metastore.notifications.add.thrift.objects'] = 'true'
  # HMS-2 doesn't have a distinction between external and managed warehouse
  # directories, so only hive.metastore.warehouse.dir is necessary.
  CONFIG['hive.metastore.warehouse.dir'] = \
      '${WAREHOUSE_LOCATION_PREFIX}/test-warehouse'

# Notifications-related configuration.
# These are for enabling notification between Hive and Sentry as well as
# metastore event processing in Impala (see IMPALA-7954).
CONFIG['hive.metastore.transactional.event.listeners'] = \
    'org.apache.hive.hcatalog.listener.DbNotificationListener,' \
    'org.apache.kudu.hive.metastore.KuduMetastorePlugin'
CONFIG['hcatalog.message.factory.impl.json'] = \
    'org.apache.sentry.binding.metastore.messaging.json.SentryJSONMessageFactory'
CONFIG['hive.metastore.dml.events'] = 'true'
if variant == 'without_hms_config':
  # This variant exercises an (almost) empty hive-site.xml: drop everything
  # accumulated so far. Only the database/JDO settings below remain.
  CONFIG.clear()

# Database and JDO-related configs:
db_type = os.environ.get('HMS_DB_TYPE', 'postgres')
CONFIG.update({
  'datanucleus.autoCreateSchema': 'false',
  'datanucleus.fixedDatastore': 'false',
  'datanucleus.metadata.validate': 'false',
  'javax.jdo.option.ConnectionUserName': 'hiveuser',
  'javax.jdo.option.ConnectionPassword': 'password',
})
if db_type == 'postgres':
  CONFIG.update({
    'javax.jdo.option.ConnectionDriverName': 'org.postgresql.Driver',
    'javax.jdo.option.ConnectionURL': 'jdbc:postgresql://localhost:5432/${METASTORE_DB}',
  })
elif db_type == 'mysql':
  CONFIG.update({
    'javax.jdo.option.ConnectionDriverName': 'com.mysql.jdbc.Driver',
    'javax.jdo.option.ConnectionURL': 'jdbc:mysql://localhost:3306/${METASTORE_DB}?createDatabaseIfNotExist=true'
  })
else:
  # Fail fast on an unsupported HMS_DB_TYPE. Interpolate the value into the
  # message: Exception("...%s", db_type) would carry an unformatted tuple
  # instead of a readable message.
  raise Exception("Unknown db type: %s" % db_type)