#!/usr/bin/env impala-python3
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
'''This module provides random data generation and database population.

   When this module is run directly for purposes of database population, the default is
   to use a fixed seed for randomization. The result should be that the generated random
   data is the same regardless of when or where the execution is done.
'''
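
# A minimal invocation sketch for database population (the flags below are defined in
# the argument parser at the bottom of this file; the script path is a hypothetical
# placeholder, and the db name flag comes from cli_options.add_db_name_option):
#
#   impala-python3 path/to/this/script.py --table-count=10 --min-row-count=1000 \
#       --max-row-count=1000000 --randomization-seed=1 populate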
from __future__ import absolute_import, division, print_function
from builtins import filter, range
import os
from copy import deepcopy
from logging import getLogger
from random import choice, randint, seed
from time import time
from tests.comparison.data_generator_mapred_common import (
    estimate_rows_per_reducer,
    MB_PER_REDUCER,
    serialize,
    TextTableDataGenerator)
from tests.comparison.common import Column, Table
from tests.comparison.db_types import (
    Char,
    Decimal,
    EXACT_TYPES,
    get_char_class,
    get_decimal_class,
    get_varchar_class,
    String,
    Timestamp,
    TYPES,
    VarChar)
from tests.comparison import db_connection

LOG = getLogger(__name__)


def index_tables_in_db_if_possible(cursor):
  if not cursor.conn.supports_index_creation:
    return
  for table_name in cursor.list_table_names():
    LOG.info('Indexing %s on %s' % (table_name, cursor.db_type))
    cursor.index_table(table_name)


def migrate_db(src_cursor, dst_cursor, include_table_names=None):
  '''Read table metadata and data from the source database and create a replica in
     the destination database. For example, the Impala functional test database could
     be copied into Postgresql.
  '''
  for table_name in src_cursor.list_table_names():
    if include_table_names and table_name not in include_table_names:
      continue
    table = src_cursor.describe_table(table_name)
    dst_cursor.create_table(table)
    src_cursor.execute('SELECT * FROM ' + table_name)
    while True:
      rows = src_cursor.fetchmany(size=100)
      if not rows:
        break
      sql = dst_cursor.make_insert_sql_from_data(table, rows)
      dst_cursor.execute(sql)
  index_tables_in_db_if_possible(dst_cursor)
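

# A usage sketch for migrate_db, assuming the same cursor wiring as the __main__ block
# at the bottom of this file (the database name here is hypothetical; 'table_1' follows
# the hard-coded table naming used by DbPopulator):
#
#   with cluster.impala.cursor(db_name='some_db') as src:
#     with cli_options.create_connection(args, db_name='some_db') as conn:
#       with conn.cursor() as dst:
#         migrate_db(src, dst, include_table_names=['table_1'])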


class DbPopulator(object):
  '''This class will populate a database with randomly generated data. The population
     includes table creation and data generation. Table names are hard coded as
     table_<table number>.
  '''

  def __init__(self, db_engine=db_connection.IMPALA):
    self.cluster = None
    self.db_name = None
    self.db_engine = db_engine
    self.min_col_count = None
    self.max_col_count = None
    self.min_row_count = None
    self.max_row_count = None
    self.allowed_storage_formats = None
    self.randomization_seed = None

  def populate_db(self, table_count, postgresql_conn=None):
    '''Create tables with a random number of cols.

       The given db_name must have already been created.
    '''
    self.cluster.hdfs.ensure_home_dir()

    hdfs = self.cluster.hdfs.create_client()
    table_and_generators = list()
    for table_idx in range(table_count):
      table = self._create_random_table(
          'table_%s' % (table_idx + 1),
          self.min_col_count,
          self.max_col_count,
          self.allowed_storage_formats)
      self._prepare_table_storage(table, self.db_name)
      if table.storage_format == 'TEXTFILE':
        text_table = table
      else:
        text_table = deepcopy(table)
        text_table.name += '_text'
        text_table.storage_format = 'TEXTFILE'
        text_table.storage_location = None
        text_table.schema_location = None
        self._prepare_table_storage(text_table, self.db_name)
      table_data_generator = TextTableDataGenerator()
      table_data_generator.randomization_seed = self.randomization_seed
      table_data_generator.table = text_table
      table_data_generator.row_count = randint(self.min_row_count, self.max_row_count)
      table_and_generators.append((table, table_data_generator))
    self._run_data_generator_mr_job([g for _, g in table_and_generators], self.db_name)
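
    # The MR job always generates delimited text files. For a non-text table the rows
    # land in the temporary '<name>_text' staging table set up above, which is converted
    # below with INSERT INTO ... SELECT and then dropped.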
    with self.cluster.hive.cursor(db_name=self.db_name) as cursor:
      for table, table_data_generator in table_and_generators:
        cursor.create_table(table)
        text_table = table_data_generator.table
        if postgresql_conn:
          with postgresql_conn.cursor() as postgresql_cursor:
            postgresql_cursor.create_table(table)
            for data_file in hdfs.list(text_table.storage_location):
              with hdfs.read(text_table.storage_location + '/' + data_file) as reader:
                postgresql_cursor.copy_expert(
                    r"COPY %s FROM STDIN WITH DELIMITER E'\x01'" % table.name, reader)
        if table.storage_format != 'TEXTFILE':
          cursor.create_table(text_table)
          cursor.execute('INSERT INTO %s SELECT * FROM %s'
              % (table.name, text_table.name))
          cursor.drop_table(text_table.name)

    if self.db_engine is db_connection.IMPALA:
      with self.cluster.impala.cursor(db_name=self.db_name) as cursor:
        cursor.invalidate_metadata()
        cursor.compute_stats()
    elif self.db_engine is db_connection.HIVE:
      with self.cluster.hive.cursor(db_name=self.db_name) as cursor:
        cursor.invalidate_metadata()
        cursor.compute_stats()
    else:
      raise ValueError('db_engine must be of type %s or %s'
                       % (db_connection.IMPALA, db_connection.HIVE))

    if postgresql_conn:
      with postgresql_conn.cursor() as postgresql_cursor:
        index_tables_in_db_if_possible(postgresql_cursor)

  def _create_random_table(self,
                           table_name,
                           min_col_count,
                           max_col_count,
                           allowed_storage_formats):
    '''Create and return a Table with a random number of cols.'''
    col_count = randint(min_col_count, max_col_count)
    storage_format = choice(allowed_storage_formats)
    table = Table(table_name)
    table.storage_format = storage_format
    allowed_types = list(TYPES)
    # Avro doesn't support timestamps yet.
    if table.storage_format == 'AVRO':
      allowed_types.remove(Timestamp)
    # TODO: 'table.cols' returns a copy of all scalar cols, so 'table.cols.append()'
    #       doesn't actually modify the table's columns. 'table.cols' should be changed
    #       to allow access to the real columns.
    cols = table.cols
    for col_idx in range(col_count):
      col_type = choice(allowed_types)
      col_type = \
          choice(list(filter(lambda type_: issubclass(type_, col_type), EXACT_TYPES)))
      if issubclass(col_type, VarChar) and not issubclass(col_type, String):
        col_type = get_varchar_class(randint(1, VarChar.MAX))
      elif issubclass(col_type, Char) and not issubclass(col_type, String):
        col_type = get_char_class(randint(1, Char.MAX))
      elif issubclass(col_type, Decimal):
        max_digits = randint(1, Decimal.MAX_DIGITS)
        col_type = get_decimal_class(max_digits, randint(1, max_digits))
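      # For illustration: the resulting columns follow the '%s_col_%s' pattern below,
      # yielding names like 'int_col_1' or 'decimal_col_2'.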
      col = Column(
          table,
          '%s_col_%s' % (col_type.__name__.lower(), col_idx + 1),
          col_type)
      cols.append(col)
    table.cols = cols
    return table

  def _prepare_table_storage(self, table, db_name):
    with self.cluster.hive.cursor(db_name=db_name) as cursor:
      cursor.ensure_storage_location(table)
    hdfs = self.cluster.hdfs.create_client()
    if hdfs.exists(table.storage_location):
      hdfs.delete(table.storage_location, recursive=True)
    hdfs.makedirs(table.storage_location, permission='777')

  def _run_data_generator_mr_job(self, table_data_generators, db_name):
    timestamp = int(time())
    mapper_input_file = '/tmp/data_gen_%s_mr_input_%s' % (db_name, timestamp)
    hdfs = self.cluster.hdfs.create_client()
    if hdfs.exists(mapper_input_file):
      hdfs.delete(mapper_input_file)
    reducer_count = 0
    mapper_input_data = list()
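    # Size the reduce phase from the rows each reducer is expected to handle within
    # MB_PER_REDUCER. The floor division plus one rounds up; e.g., 10 ** 6 rows with an
    # estimate of 4 * 10 ** 5 rows per reducer contributes
    # 10 ** 6 // (4 * 10 ** 5) + 1 == 3 reducers.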
    for table_data_generator in table_data_generators:
      reducer_count += (table_data_generator.row_count
          // estimate_rows_per_reducer(table_data_generator, MB_PER_REDUCER)) + 1
      mapper_input_data.append(serialize(table_data_generator))
    hdfs.write(mapper_input_file, data=b'\n'.join(mapper_input_data))

    files = ['common.py', 'db_types.py', 'data_generator_mapred_common.py',
             'data_generator_mapper.py', 'data_generator_reducer.py',
             'random_val_generator.py']
    dir_path = os.path.dirname(__file__)
    files = [os.path.join(dir_path, f) for f in files]
    hdfs_output_dir = '/tmp/data_gen_%s_mr_output_%s' % (db_name, timestamp)
    if hdfs.exists(hdfs_output_dir):
      hdfs.delete(hdfs_output_dir, recursive=True)
    LOG.info('Starting MR job to generate data for %s', db_name)
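    # Hadoop streaming runs the mapper and reducer scripts shipped via -files; the
    # -libjars path assumes HADOOP_HOME points at the Hadoop install.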
    self.cluster.yarn.run_mr_job(self.cluster.yarn.find_mr_streaming_jar(), job_args=r'''
        -D mapred.reduce.tasks=%s \
        -D stream.num.map.output.key.fields=2 \
        -libjars '%s/share/hadoop/hdfs/lib/*' \
        -files %s \
        -input %s \
        -output %s \
        -mapper data_generator_mapper.py \
        -reducer data_generator_reducer.py'''.strip()
        % (reducer_count, os.environ["HADOOP_HOME"], ','.join(files), mapper_input_file,
           hdfs_output_dir))
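

# A configuration sketch for DbPopulator, mirroring the __main__ block below ('cluster'
# comes from cli_options.create_cluster; the database name and the storage format list
# are hypothetical, though both 'TEXTFILE' and 'AVRO' appear above):
#
#   populator = DbPopulator()
#   populator.cluster = cluster
#   populator.db_name = 'some_db'
#   populator.min_col_count, populator.max_col_count = 1, 100
#   populator.min_row_count, populator.max_row_count = 10 ** 3, 10 ** 6
#   populator.allowed_storage_formats = ['TEXTFILE', 'AVRO']
#   populator.randomization_seed = 1
#   populator.populate_db(table_count=10)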


if __name__ == '__main__':
  from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

  from tests.comparison import cli_options

  parser = ArgumentParser(
      usage='usage: \n'
            ' %(prog)s [options] [populate]\n\n'
            ' Create and populate database(s). The Impala database will always be \n'
            ' included. Postgres is optional. The other databases are not supported.\n\n'
            ' %(prog)s [options] migrate\n\n'
            ' Migrate an Impala database to another database type. The destination \n'
            ' database will be dropped and recreated.',
      formatter_class=ArgumentDefaultsHelpFormatter)
  cli_options.add_logging_options(parser)
  cli_options.add_cluster_options(parser)
  cli_options.add_db_name_option(parser)
  cli_options.add_connection_option_groups(parser)

  group = parser.add_argument_group('Database Population Options')
  group.add_argument('--randomization-seed', default=1, type=int,
      help='The randomization will be initialized with this seed. Using the same seed '
           'will produce the same results across runs.')
  cli_options.add_storage_format_options(group)
  group.add_argument('--create-data-files', default=False, action='store_true',
      help='Create files that can be used to repopulate the databases elsewhere.')
  group.add_argument('--table-count', default=10, type=int,
      help='The number of tables to generate.')
  group.add_argument('--min-column-count', default=1, type=int,
      help='The minimum number of columns to generate per table.')
  group.add_argument('--max-column-count', default=100, type=int,
      help='The maximum number of columns to generate per table.')
  group.add_argument('--min-row-count', default=(10 ** 3), type=int,
      help='The minimum number of rows to generate per table.')
  group.add_argument('--max-row-count', default=(10 ** 6), type=int,
      help='The maximum number of rows to generate per table.')

  group = parser.add_argument_group('Database Migration Options')
  group.add_argument('--migrate-table-names',
      help='Table names should be separated with commas. The default is to migrate all '
           'tables.')

  parser.add_argument('command', nargs='*',
      help='The command to run, either "populate" or "migrate".')
  args = parser.parse_args()

  if len(args.command) > 1:
    raise Exception('Only one command can be chosen. Requested commands were: %s'
        % args.command)
  command = args.command[0] if args.command else 'populate'
  if command not in ('populate', 'migrate'):
    raise Exception('Command must either be "populate" or "migrate" but was "%s"'
        % command)
  if command == 'migrate' and \
      not any((args.use_mysql, args.use_postgresql, args.use_oracle)):
    raise Exception('At least one destination database must be chosen with '
        '--use-<database type>')

  cli_options.configure_logging(args.log_level, debug_log_file=args.debug_log_file)

  seed(args.randomization_seed)
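  # The seed() call above fixes the module-level RNG that drives schema choices (column
  # counts, types, row counts); the per-table data generators are seeded separately
  # through TextTableDataGenerator.randomization_seed.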
  cluster = cli_options.create_cluster(args)
  populator = DbPopulator(db_connection.HIVE if args.use_hive else db_connection.IMPALA)
  if command == 'populate':
    populator.randomization_seed = args.randomization_seed
    populator.cluster = cluster
    populator.db_name = args.db_name
    populator.min_col_count = args.min_column_count
    populator.max_col_count = args.max_column_count
    populator.min_row_count = args.min_row_count
    populator.max_row_count = args.max_row_count
    populator.allowed_storage_formats = args.storage_file_formats.split(',')

    if args.use_hive:
      with cluster.hive.connect() as conn:
        with conn.cursor() as cursor:
          cursor.ensure_empty_db(args.db_name)
    else:
      with cluster.impala.connect() as conn:
        with conn.cursor() as cursor:
          cursor.invalidate_metadata()
          cursor.ensure_empty_db(args.db_name)

    if args.use_postgresql:
      with cli_options.create_connection(args) as postgresql_conn:
        with postgresql_conn.cursor() as cursor:
          cursor.ensure_empty_db(args.db_name)
      postgresql_conn = cli_options.create_connection(args, db_name=args.db_name)
    else:
      postgresql_conn = None
    populator.populate_db(args.table_count, postgresql_conn=postgresql_conn)
  else:
    if args.migrate_table_names:
      table_names = args.migrate_table_names.split(',')
    else:
      table_names = None
    with cli_options.create_connection(args) as conn:
      with conn.cursor() as cursor:
        cursor.ensure_empty_db(args.db_name)
    with cli_options.create_connection(args, db_name=args.db_name) as conn:
      with conn.cursor() as dst:
        with cluster.impala.cursor(db_name=args.db_name) as src:
          migrate_db(src, dst, include_table_names=table_names)