testdata/bin/generate-test-vectors.py - impala - Git at Google

 #!/usr/bin/env impala-python
 #
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 # This script is used to generate test "vectors" based on a dimension input file.
 # A vector in this context is simply a permutation of the values in the
 # dimension input file.  For example, in this case the script is generating test vectors
 # for the Impala / Hive benchmark suite so interesting dimensions are data set,
 # file format, and compression algorithm. More can be added later.
 # The output of running this script is a list of vectors. Currently two different vector
 # outputs are generated - an "exhaustive" vector which contains all permutations and a
 # "pairwise" vector that contains a subset of the vectors by choosing all combinations of
 # pairs (the pairwise strategy). More information about pairwise can be found at
 # http://www.pairwise.org.
 #
 # The end goal is to have a reduced set of test vectors that provides coverage but doesn't
 # take as long to run as the exhaustive set, along with a set of vectors that provides
 # full coverage. This is especially important for benchmarks which work on very large data
 # sets.
 #
 # The output files can then be read in by other scripts, tools, and tests.
 # One major use case is the generate_schema_statements.py script, which uses the vector
 # files to dynamically build schema for running benchmark and functional tests.
 #
 # The pairwise generation is done using the Python 'AllPairs' module. This module can be
 # downloaded from http://pypi.python.org/pypi/AllPairs/2.0.1
 #
 import collections
 import csv
 import math
 import os
 import sys
 from itertools import product
 from optparse import OptionParser
 import metacomm.combinatorics.all_pairs2
 all_pairs = metacomm.combinatorics.all_pairs2.all_pairs2

 # Command line handling: the only option is the workload whose dimension file
 # is expanded into test vectors.
 parser = OptionParser()
 parser.add_option(
     "-w", "--workload",
     dest="workload",
     help="The workload to generate test vectors for")
 options, args = parser.parse_args()

 # The workload is mandatory: without it, show the usage text and exit with status 1.
 if options.workload is None:
   print("A workload name must be specified.")
   parser.print_help()
   sys.exit(1)

 # Directory that holds one sub-directory per workload. A missing
 # IMPALA_WORKLOAD_DIR environment variable raises KeyError here.
 WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']

 # Dimension names in their canonical order. The position of a name in this list is
 # the position of that dimension's value inside every vector, and the constraint
 # checks depend on it, so new dimensions must only be appended at the end.
 KNOWN_DIMENSION_NAMES = [
     'file_format',
     'dataset',
     'compression_codec',
     'compression_type',
 ]

 # Position of each known dimension within a vector, looked up by name.
 (FILE_FORMAT_IDX,
  DATASET_IDX,
  COMPRESSION_IDX,
  COMPRESSION_TYPE_IDX) = map(
      KNOWN_DIMENSION_NAMES.index,
      ('file_format', 'dataset', 'compression_codec', 'compression_type'))

 class VectorGenerator(object):
   """Builds test matrices out of a list of per-dimension value lists.

   input_vectors holds one list of candidate values per dimension. The position of a
   list decides the position of its values inside every generated vector.
   """

   def __init__(self, input_vectors):
     self.input_vectors = input_vectors

   def generate_pairwise_matrix(self, filter_func = None):
     """Returns the result of all_pairs over the input vectors, filtered by filter_func.

     filter_func takes a vector and returns True to keep it. When it is None, nothing
     is filtered out.
     """
     if filter_func is None:
       filter_func = lambda vector: True
     # Pass the caller's filter through rather than hard-coding is_valid_combination,
     # so that the filter_func argument is actually honoured.
     return all_pairs(self.input_vectors, filter_func = filter_func)

   def generate_exhaustive_matrix(self, filter_func = None):
     """Returns every combination of the input vectors that filter_func accepts.

     Each combination is returned as a list holding one value per dimension. When
     filter_func is None, all combinations are returned.
     """
     if filter_func is None:
       filter_func = lambda vector: True
     return [list(vec) for vec in product(*self.input_vectors) if filter_func(vec)]

 # Add vector value constraints to this function.
 def is_valid_combination(vector):
   """Returns False when 'vector' combines dimension values that are not allowed together.

   Only vectors with exactly four values are checked. The pairwise generator may call
   this with different vector lengths, and any such vector is always accepted.
   """
   if len(vector) != 4:
     return True

   file_format = vector[FILE_FORMAT_IDX]
   codec = vector[COMPRESSION_IDX]
   compression_type = vector[COMPRESSION_TYPE_IDX]
   uncompressed = codec == 'none'

   # The 'def' codec is rejected for text.
   if file_format == 'text' and codec in ['def']:
     return False
   # Codec and compression type must either both be 'none' or both be set.
   if uncompressed != (compression_type == 'none'):
     return False
   # The 'record' compression type is only accepted for seq.
   if file_format != 'seq' and compression_type == 'record':
     return False
   # parquet, hbase and kudu are only accepted without a compression codec.
   if file_format in ('parquet', 'hbase', 'kudu') and not uncompressed:
     return False
   # avro is limited to the 'none', 'snap' and 'def' codecs.
   if file_format == 'avro' and codec not in ['none', 'snap', 'def']:
     return False
   return True

 # Vector files have the format: <dimension name>: value1, value2, ... this function
 # adds all specified dimensions to a map of dimension name-to-value
 def read_dimension_file(file_name):
   """Parses a dimension file into a map of dimension name to list of values.

   Every meaningful line has the form '<dimension name>: value1, value2, ...'. Lines
   whose first non-blank character is '#' and lines that hold only whitespace are
   skipped. On a malformed line, or on a dimension name that is not listed in
   KNOWN_DIMENSION_NAMES, an error is printed and the script exits with status 1.

   Returns a collections.defaultdict(list), so looking up a dimension that did not
   appear in the file yields an empty list.
   """
   dimension_map = collections.defaultdict(list)
   # Text mode keeps the lines as str, so the string comparisons below are valid.
   with open(file_name, 'r') as input_file:
     for line in input_file:
       stripped = line.strip()
       # Blank lines carry no dimension and comment lines are ignored.
       if not stripped or stripped.startswith('#'):
         continue
       values = line.split(':')
       if len(values) != 2:
         print('Invalid dimension file format. Expected format is <dimension name>: val1,'
               ' val2, ... Found: ' + line)
         sys.exit(1)
       # Tolerate whitespace around the name, as is already done for the values.
       name = values[0].strip()
       if name not in KNOWN_DIMENSION_NAMES:
         print('Unknown dimension name: ' + name)
         print('Valid dimension names: ' + ', '.join(KNOWN_DIMENSION_NAMES))
         sys.exit(1)
       dimension_map[name] = [val.strip() for val in values[1].split(',')]
   return dimension_map

 def write_vectors_to_csv(output_dir, output_file, matrix):
   """Writes the vectors in 'matrix' to the file 'output_file' inside 'output_dir'.

   The file starts with a '# Generated File.' line, followed by one line per vector in
   which every value is labelled with its dimension name:
   '<name>: <value>, <name>: <value>, ...'. The names come from KNOWN_DIMENSION_NAMES,
   matched to the values by position. The destination path is printed before writing.
   """
   lines = ["# Generated File."]
   for row in matrix:
     lines.append(', '.join('%s: %s' % (KNOWN_DIMENSION_NAMES[i], value)
                            for i, value in enumerate(row)))

   output_path = os.path.join(output_dir, output_file)
   print('Writing test vectors to: ' + output_path)
   # The handle gets its own name so the output_file parameter is not shadowed, and the
   # file is opened in text mode because only text is written.
   with open(output_path, 'w') as csv_file:
     csv_file.write('\n'.join(lines))
     csv_file.write('\n')

 # A workload keeps its dimension file at <workload>/<workload>_dimensions.csv below
 # WORKLOAD_DIR. The generated vector files are written into the same directory.
 output_dir = os.path.join(WORKLOAD_DIR, options.workload)
 dimension_file = os.path.join(output_dir, '%s_dimensions.csv' % options.workload)
 if not os.path.isfile(dimension_file):
   print('Dimension file not found: ' + dimension_file)
   sys.exit(1)

 print('Reading dimension file: ' + dimension_file)
 vector_map = read_dimension_file(dimension_file)

 # This ordering matters! The value lists must follow KNOWN_DIMENSION_NAMES so that
 # the constraints are applied to the right positions.
 vectors = [vector_map[dimension_name] for dimension_name in KNOWN_DIMENSION_NAMES]
 vg = VectorGenerator(vectors)

 # Pairwise output first, then the exhaustive output, both filtered by the constraints.
 pairwise_matrix = vg.generate_pairwise_matrix(is_valid_combination)
 write_vectors_to_csv(output_dir, '%s_pairwise.csv' % options.workload, pairwise_matrix)
 exhaustive_matrix = vg.generate_exhaustive_matrix(is_valid_combination)
 write_vectors_to_csv(output_dir, '%s_exhaustive.csv' % options.workload,
                      exhaustive_matrix)
	#!/usr/bin/env impala-python
	#
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	# This script is used to generate test "vectors" based on a dimension input file.
	# A vector in this context is simply a permutation of the values in the
	# dimension input file. For example, in this case the script is generating test vectors
	# for the Impala / Hive benchmark suite so interesting dimensions are data set,
	# file format, and compression algorithm. More can be added later.
	# The output of running this script is a list of vectors. Currently two different vector
	# outputs are generated - an "exhaustive" vector which contains all permutations and a
	# "pairwise" vector that contains a subset of the vectors by choosing all combinations of
	# pairs (the pairwise strategy). More information about pairwise can be found at
	# http://www.pairwise.org.
	#
	# The end goal is to have a reduced set of test vectors that provides coverage but doesn't
	# take as long to run as the exhaustive set, along with a set of vectors that provides
	# full coverage. This is especially important for benchmarks which work on very large data
	# sets.
	#
	# The output files can then be read in by other scripts, tools, and tests.
	# One major use case is the generate_schema_statements.py script, which uses the vector
	# files to dynamically build schema for running benchmark and functional tests.
	#
	# The pairwise generation is done using the Python 'AllPairs' module. This module can be
	# downloaded from http://pypi.python.org/pypi/AllPairs/2.0.1
	#
	import collections
	import csv
	import math
	import os
	import sys
	from itertools import product
	from optparse import OptionParser
	import metacomm.combinatorics.all_pairs2
	all_pairs = metacomm.combinatorics.all_pairs2.all_pairs2

	# Command line handling: the only option is the workload whose dimension file
	# is expanded into test vectors.
	parser = OptionParser()
	parser.add_option("-w", "--workload", dest="workload",
	                  help="The workload to generate test vectors for")
	(options, args) = parser.parse_args()

	# The workload is mandatory: without it, show the usage text and exit with status 1.
	# The body of this check is indented so that it only runs when no workload was given.
	if options.workload is None:
	  print("A workload name must be specified.")
	  parser.print_help()
	  sys.exit(1)

	# Directory that holds one sub-directory per workload. A missing
	# IMPALA_WORKLOAD_DIR environment variable raises KeyError here.
	WORKLOAD_DIR = os.environ['IMPALA_WORKLOAD_DIR']

	# Dimension names in their canonical order. The position of a name in this list is
	# the position of that dimension's value inside every vector, and the constraint
	# checks depend on it, so new dimensions must only be appended at the end.
	KNOWN_DIMENSION_NAMES = [
	    'file_format',
	    'dataset',
	    'compression_codec',
	    'compression_type',
	]

	# Position of each known dimension within a vector, looked up by name.
	(FILE_FORMAT_IDX,
	 DATASET_IDX,
	 COMPRESSION_IDX,
	 COMPRESSION_TYPE_IDX) = map(
	     KNOWN_DIMENSION_NAMES.index,
	     ('file_format', 'dataset', 'compression_codec', 'compression_type'))

	class VectorGenerator(object):
	  """Builds test matrices out of a list of per-dimension value lists.

	  input_vectors holds one list of candidate values per dimension. The position of a
	  list decides the position of its values inside every generated vector.
	  """

	  def __init__(self, input_vectors):
	    self.input_vectors = input_vectors

	  def generate_pairwise_matrix(self, filter_func = None):
	    """Returns the result of all_pairs over the input vectors, filtered by filter_func.

	    filter_func takes a vector and returns True to keep it. When it is None, nothing
	    is filtered out.
	    """
	    if filter_func is None:
	      filter_func = lambda vector: True
	    # Pass the caller's filter through rather than hard-coding is_valid_combination,
	    # so that the filter_func argument is actually honoured.
	    return all_pairs(self.input_vectors, filter_func = filter_func)

	  def generate_exhaustive_matrix(self, filter_func = None):
	    """Returns every combination of the input vectors that filter_func accepts.

	    Each combination is returned as a list holding one value per dimension. When
	    filter_func is None, all combinations are returned.
	    """
	    if filter_func is None:
	      filter_func = lambda vector: True
	    return [list(vec) for vec in product(*self.input_vectors) if filter_func(vec)]

	# Add vector value constraints to this function.
	def is_valid_combination(vector):
	  """Returns False when 'vector' combines dimension values that are not allowed together.

	  Only vectors with exactly four values are checked. The pairwise generator may call
	  this with different vector lengths, and any such vector is always accepted.
	  """
	  if len(vector) == 4:
	    return not (
	        (vector[FILE_FORMAT_IDX] == 'text' and vector[COMPRESSION_IDX] in ['def']) or
	        (vector[COMPRESSION_IDX] == 'none' and vector[COMPRESSION_TYPE_IDX] != 'none') or
	        (vector[COMPRESSION_IDX] != 'none' and vector[COMPRESSION_TYPE_IDX] == 'none') or
	        (vector[FILE_FORMAT_IDX] != 'seq' and vector[COMPRESSION_TYPE_IDX] == 'record') or
	        (vector[FILE_FORMAT_IDX] == 'parquet' and vector[COMPRESSION_IDX] != 'none') or
	        (vector[FILE_FORMAT_IDX] == 'hbase' and vector[COMPRESSION_IDX] != 'none') or
	        (vector[FILE_FORMAT_IDX] == 'kudu' and vector[COMPRESSION_IDX] != 'none') or
	        (vector[FILE_FORMAT_IDX] == 'avro' and
	         vector[COMPRESSION_IDX] not in ['none', 'snap', 'def']))

	  # The pairwise generator may call this with different vector lengths. In that case
	  # this should always return true.
	  return True

	# Vector files have the format: <dimension name>: value1, value2, ... this function
	# adds all specified dimensions to a map of dimension name-to-value
	def read_dimension_file(file_name):
	  """Parses a dimension file into a map of dimension name to list of values.

	  Every meaningful line has the form '<dimension name>: value1, value2, ...'. Lines
	  whose first non-blank character is '#' and lines that hold only whitespace are
	  skipped. On a malformed line, or on a dimension name that is not listed in
	  KNOWN_DIMENSION_NAMES, an error is printed and the script exits with status 1.

	  Returns a collections.defaultdict(list), so looking up a dimension that did not
	  appear in the file yields an empty list.
	  """
	  dimension_map = collections.defaultdict(list)
	  # Text mode keeps the lines as str, so the string comparisons below are valid.
	  with open(file_name, 'r') as input_file:
	    for line in input_file:
	      stripped = line.strip()
	      # Blank lines carry no dimension and comment lines are ignored.
	      if not stripped or stripped.startswith('#'):
	        continue
	      values = line.split(':')
	      if len(values) != 2:
	        print('Invalid dimension file format. Expected format is <dimension name>: val1,'
	              ' val2, ... Found: ' + line)
	        sys.exit(1)
	      # Tolerate whitespace around the name, as is already done for the values.
	      name = values[0].strip()
	      if name not in KNOWN_DIMENSION_NAMES:
	        print('Unknown dimension name: ' + name)
	        print('Valid dimension names: ' + ', '.join(KNOWN_DIMENSION_NAMES))
	        sys.exit(1)
	      dimension_map[name] = [val.strip() for val in values[1].split(',')]
	  return dimension_map

	def write_vectors_to_csv(output_dir, output_file, matrix):
	  """Writes the vectors in 'matrix' to the file 'output_file' inside 'output_dir'.

	  The file starts with a '# Generated File.' line, followed by one line per vector in
	  which every value is labelled with its dimension name:
	  '<name>: <value>, <name>: <value>, ...'. The names come from KNOWN_DIMENSION_NAMES,
	  matched to the values by position. The destination path is printed before writing.
	  """
	  lines = ["# Generated File."]
	  for row in matrix:
	    lines.append(', '.join('%s: %s' % (KNOWN_DIMENSION_NAMES[i], value)
	                           for i, value in enumerate(row)))

	  output_path = os.path.join(output_dir, output_file)
	  print('Writing test vectors to: ' + output_path)
	  # The handle gets its own name so the output_file parameter is not shadowed, and the
	  # file is opened in text mode because only text is written.
	  with open(output_path, 'w') as csv_file:
	    csv_file.write('\n'.join(lines))
	    csv_file.write('\n')

	# A workload keeps its dimension file at <workload>/<workload>_dimensions.csv below
	# WORKLOAD_DIR.
	dimension_file = os.path.join(WORKLOAD_DIR, options.workload,
	                              '%s_dimensions.csv' % options.workload)
	# Only the missing-file case prints the error and exits with status 1.
	if not os.path.isfile(dimension_file):
	  print('Dimension file not found: ' + dimension_file)
	  sys.exit(1)

	print('Reading dimension file: ' + dimension_file)
	vector_map = read_dimension_file(dimension_file)
	vectors = []

	# This ordering matters! We need to know the order to apply the proper constraints.
	for dimension_name in KNOWN_DIMENSION_NAMES:
	  vectors.append(vector_map[dimension_name])
	vg = VectorGenerator(vectors)

	# The generated vector files are written next to the dimension file: the pairwise
	# output first, then the exhaustive output, both filtered by the constraints.
	output_dir = os.path.join(WORKLOAD_DIR, options.workload)
	write_vectors_to_csv(output_dir, '%s_pairwise.csv' % options.workload,
	                     vg.generate_pairwise_matrix(is_valid_combination))
	write_vectors_to_csv(output_dir, '%s_exhaustive.csv' % options.workload,
	                     vg.generate_exhaustive_matrix(is_valid_combination))