# tests/query_test/test_decimal_fuzz.py - impala - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 # Generates random decimal numbers and verifies that mathematical
 # operations return correct results under decimal_v2.

 import decimal
 import math
 import pytest
 import random

 from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.test_dimensions import create_single_exec_option_dimension
 from tests.common.test_vector import ImpalaTestDimension, ImpalaTestMatrix

 class TestDecimalFuzz(ImpalaTestSuite):

   # Impala's max precision for decimals is 38, so we should have the same in the tests
   decimal.getcontext().prec = 38

   @classmethod
   def get_workload(cls):
     return 'functional-query'

   @classmethod
   def add_test_dimensions(cls):
     '''Installs a fresh test matrix that holds the single exec option dimension and
     sets 'iterations', the number of fuzz iterations, to 10000.'''
     cls.ImpalaTestMatrix = matrix = ImpalaTestMatrix()
     exec_option_dimension = create_single_exec_option_dimension()
     matrix.add_dimension(exec_option_dimension)
     cls.iterations = 10000

   def weighted_choice(self, options):
     '''Picks one key of 'options' at random, with a probability proportional to its
     weight.

     'options' maps each candidate to a non-negative numeric weight. Returns the chosen
     candidate, or None if 'options' is empty.'''
     # values() and items() exist on both Python 2 and Python 3, unlike itervalues() and
     # iteritems().
     total_weight = sum(options.values())
     numeric_choice = random.uniform(0, total_weight)
     last_choice = None
     for choice, weight in options.items():
       if numeric_choice <= weight:
         return choice
       numeric_choice -= weight
       if weight > 0:
         last_choice = choice
     # Only reached when 'options' is empty or when rounding error left numeric_choice
     # slightly above the final weight. Fall back to the last candidate that had a
     # positive weight.
     return last_choice

   def get_decimal(self):
     '''Generates a random decimal literal together with a decimal type that fits it.

     Returns a 3-tuple (value, precision, scale). 'value' is a string such as '-12.345'.
     'precision' and 'scale' are ints with 1 <= precision <= 38 and
     0 <= scale <= precision. 'value' has at most 'scale' digits after the decimal mark
     and at most 'precision - scale' significant digits before it. The function does not
     always return completely random values, we try to bias it to select more
     interesting values.'''

     def random_precision():
       return random.randint(1, 38)

     def extreme_precision():
       return 38

     precision = self.weighted_choice({random_precision: 0.8, extreme_precision: 0.2})()

     def random_scale():
       return random.randint(0, precision)

     def extreme_scale():
       return random.choice([0, precision])

     scale = self.weighted_choice({random_scale: 0.9, extreme_scale: 0.1})()

     def num_digits_random():
       return random.randint(1, precision)

     def num_digits_all():
       return precision

     def choose_num_digits():
       '''Determines how many digits a value is going to have: usually a random count,
       sometimes as many as the precision allows.'''
       # Shared by random_value() and special_case_decimal_value() so that both use the
       # same 0.8 / 0.2 split.
       return self.weighted_choice({num_digits_random: 0.8, num_digits_all: 0.2})()

     def random_value():
       '''Generates a completely random value.'''
       num_digits = choose_num_digits()
       # The leading digit is never zero, so the value has exactly 'num_digits' digits.
       digits = [random.choice('123456789')]
       digits.extend(random.choice('0123456789') for _ in range(num_digits - 1))
       return ''.join(digits)

     def special_case_binary_value():
       '''Generates a value that looks like 11111... or 10000... in binary number
       system.'''
       # Largest exponent for which 2 ** exponent still fits in 'precision' digits.
       max_exponent = int(precision * math.log(10, 2))

       def exponent_random():
         return random.randint(0, max_exponent)

       def exponent_max():
         return max_exponent

       exponent = self.weighted_choice({exponent_random: 0.8, exponent_max: 0.2})()

       value = 2 ** exponent
       if random.random() < 0.5:
         value -= 1
       return str(value)

     def special_case_decimal_value():
       '''Generates a value that looks like 99999... or 10000... in decimal number
       system.'''
       num_digits = choose_num_digits()
       value = 10 ** num_digits

       # 10 ** precision needs precision + 1 digits, so in that case the value is always
       # reduced to 99999...
       if num_digits == precision or random.random() < 0.5:
         value -= 1

       return str(value)

     value_weights = {}
     value_weights[random_value] = 0.6
     value_weights[special_case_binary_value] = 0.2
     value_weights[special_case_decimal_value] = 0.2
     value = self.weighted_choice(value_weights)()

     # Randomly determine the placement of the decimal mark.
     # The smallest index where the decimal mark can be placed in the number string.
     min_dot_location = max(len(value) - scale, 0)
     # The largest index where the decimal mark can be placed in the number string.
     max_dot_location = min(precision - scale, len(value))
     dot_location = random.randint(min_dot_location, max_dot_location)

     if dot_location == 0:
       value = '0.' + value
     elif dot_location < len(value):
       value = value[:dot_location] + '.' + value[dot_location:]
     # Otherwise the mark would go after the last digit and the value stays an integer.

     if random.random() < 0.5:
       # Negate the number.
       value = '-' + value
     return (value, precision, scale)

   def result_equals(self, expected, actual):
     '''Verify that the expected result is equal to the actual result. We verify equality
     by rounding the expected result to different numbers of places and verifying that the
     actual result is matched in at least one of the cases.

     'expected' is the decimal.Decimal computed in Python and 'actual' is the one obtained
     from the query; either may be None when no value could be obtained. Returns a bool.'''
     if actual == expected:
       return True

     if expected is None:
       # There is no reference value to round, so an actual value can never match.
       return False

     if actual is None:
       # Overflow
       # If the expected result is larger than 10^32 - 1, it's not unreasonable for
       # there to be an overflow in Impala because the minimum scale is 6 and
       # 38 (max precision) - 6 = 32.
       return abs(expected) > decimal.Decimal("9" * 32)

     with decimal.localcontext() as ctx:
       # quantize() raises InvalidOperation when the rounded value needs more digits than
       # the context precision. Leave room for every integer digit of 'expected', 38
       # digits after the dot and one carry digit.
       ctx.prec = max(ctx.prec, max(expected.adjusted(), 0) + 40)
       for num_digits_after_dot in range(39):
         # Reduce the number of digits after the dot in the expected_result to different
         # amounts. If it matches the actual result in at least one of the cases, we
         # consider the actual result to be acceptable.
         truncated_expected = expected.quantize(
             decimal.Decimal("1e-{0}".format(num_digits_after_dot)),
             rounding=decimal.ROUND_HALF_UP)
         if actual == truncated_expected:
           return True
     return False

   def execute_one_decimal_op(self):
     '''Runs one randomly generated binary decimal expression as a query and asserts that
     its result agrees with the same expression evaluated with Python decimals.'''
     operators = ['+', '-', '*', '/', '%']
     op = random.choice(operators)
     lhs, lhs_precision, lhs_scale = self.get_decimal()
     rhs, rhs_precision, rhs_scale = self.get_decimal()

     template = ('select cast({value1} as decimal({precision1},{scale1})) {op} '
         'cast({value2} as decimal({precision2},{scale2}))')
     query = template.format(
         op=op,
         value1=lhs, precision1=lhs_precision, scale1=lhs_scale,
         value2=rhs, precision2=rhs_precision, scale2=rhs_scale)

     # A failing query is recorded as "no result" rather than failing right away.
     try:
       raw_result = self.execute_scalar(query, query_options={'decimal_v2': 'true'})
     except ImpalaBeeswaxException:
       raw_result = None
     result = None if raw_result is None else decimal.Decimal(raw_result)

     # One Python implementation per SQL operator.
     evaluate = {
       '+': lambda a, b: a + b,
       '-': lambda a, b: a - b,
       '*': lambda a, b: a * b,
       '/': lambda a, b: a / b,
       '%': lambda a, b: a % b,
     }

     with decimal.localcontext() as ctx:
       # Set the decimal context to a large precision initially, so that the
       # mathematical operations are performed at a high precision.
       ctx.prec = 80

       try:
         expected_result = evaluate[op](decimal.Decimal(lhs), decimal.Decimal(rhs))
       except (decimal.InvalidOperation, decimal.DivisionByZero):
         # Python cannot produce a value for this expression.
         expected_result = None
       assert self.result_equals(expected_result, result)

   def test_decimal_ops(self, vector):
     for _ in xrange(self.iterations):
       self.execute_one_decimal_op()

   def width_bucket(self, val, min_range, max_range, num_buckets):
     '''Computes in Python the bucket number expected from width_bucket().

     'val', 'min_range' and 'max_range' are decimal literals given as strings and
     'num_buckets' is an int. Returns None if 'min_range' >= 'max_range', 0 if 'val' is
     below 'min_range', 'num_buckets' + 1 if 'val' is at or above 'max_range', and
     otherwise the 1-based index of the equal-width bucket that contains 'val'.'''
     # Multiplying the values by 10**40 guarantees that the numbers can be converted
     # to int without losing information.
     # NOTE(review): this assumes the decimal context precision is at least the number
     # of digits in each input - confirm if the inputs ever exceed 38 digits.
     scale_factor = 10**40
     val_int = int(decimal.Decimal(val) * scale_factor)
     min_range_int = int(decimal.Decimal(min_range) * scale_factor)
     max_range_int = int(decimal.Decimal(max_range) * scale_factor)

     if min_range_int >= max_range_int:
       return None
     if val_int < min_range_int:
       return 0
     if val_int > max_range_int:
       return num_buckets + 1

     range_size = max_range_int - min_range_int
     dist_from_min = val_int - min_range_int
     # Floor division keeps the arithmetic in exact integers. A plain '/' would produce a
     # rounded float on Python 3.
     return (num_buckets * dist_from_min) // range_size + 1

   def execute_one_width_bucket(self):
     '''Runs width_bucket() on randomly generated decimals and asserts that the query
     result equals the bucket number computed by self.width_bucket().

     Nothing is executed when self.width_bucket() returns None. A query error that asks
     for the arguments to be wrapped in a CAST is ignored; any other error is re-raised.'''
     val, val_prec, val_scale = self.get_decimal()
     min_range, min_range_prec, min_range_scale = self.get_decimal()
     max_range, max_range_prec, max_range_scale = self.get_decimal()
     num_buckets = random.randint(1, 2147483647)

     expected_result = self.width_bucket(val, min_range, max_range, num_buckets)
     # Compare against None explicitly: 0 is a legitimate expected bucket (the value is
     # below the range) and must still be verified.
     if expected_result is None:
       return

     query = ('select width_bucket('
         'cast({val} as decimal({val_prec},{val_scale})), '
         'cast({min_range} as decimal({min_range_prec},{min_range_scale})), '
         'cast({max_range} as decimal({max_range_prec},{max_range_scale})), '
         '{num_buckets})')

     query = query.format(val=val, val_prec=val_prec, val_scale=val_scale,
         min_range=min_range, min_range_prec=min_range_prec,
         min_range_scale=min_range_scale,
         max_range=max_range, max_range_prec=max_range_prec,
         max_range_scale=max_range_scale,
         num_buckets=num_buckets)

     try:
       result = self.execute_scalar(query, query_options={'decimal_v2': 'true'})
     except ImpalaBeeswaxException as e:
       if "You need to wrap the arguments in a CAST" not in str(e):
         # A bare raise keeps the original traceback.
         raise
       # Sometimes the decimal inputs are incompatible with each other, so it's ok
       # to ignore this error.
       return
     assert int(result) == expected_result

   def test_width_bucket(self, vector):
     for _ in xrange(self.iterations):
       self.execute_one_width_bucket()
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	# Generates random decimal numbers and verifies that mathematical
	# operations return correct results under decimal_v2.

	import decimal
	import math
	import pytest
	import random

	from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
	from tests.common.impala_test_suite import ImpalaTestSuite
	from tests.common.test_dimensions import create_single_exec_option_dimension
	from tests.common.test_vector import ImpalaTestDimension, ImpalaTestMatrix

	class TestDecimalFuzz(ImpalaTestSuite):
	  '''Generates random decimal numbers and verifies that mathematical operations and
	  width_bucket() return correct results under decimal_v2.'''

	  # Impala's max precision for decimals is 38, so we should have the same in the tests
	  decimal.getcontext().prec = 38

	  @classmethod
	  def get_workload(cls):
	    '''Returns the workload name for this test suite.'''
	    return 'functional-query'

	  @classmethod
	  def add_test_dimensions(cls):
	    '''Installs a fresh test matrix that holds the single exec option dimension and
	    sets 'iterations', the number of fuzz iterations, to 10000.'''
	    cls.ImpalaTestMatrix = ImpalaTestMatrix()
	    cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension())
	    cls.iterations = 10000

	  def weighted_choice(self, options):
	    '''Picks one key of 'options' at random, with a probability proportional to its
	    weight.

	    'options' maps each candidate to a non-negative numeric weight. Returns the chosen
	    candidate, or None if 'options' is empty.'''
	    # values() and items() exist on both Python 2 and Python 3, unlike itervalues()
	    # and iteritems().
	    total_weight = sum(options.values())
	    numeric_choice = random.uniform(0, total_weight)
	    last_choice = None
	    for choice, weight in options.items():
	      if numeric_choice <= weight:
	        return choice
	      numeric_choice -= weight
	      if weight > 0:
	        last_choice = choice
	    # Only reached when 'options' is empty or when rounding error left numeric_choice
	    # slightly above the final weight. Fall back to the last candidate that had a
	    # positive weight.
	    return last_choice

	  def get_decimal(self):
	    '''Generates a random decimal literal together with a decimal type that fits it.

	    Returns a 3-tuple (value, precision, scale). 'value' is a string such as '-12.345'.
	    'precision' and 'scale' are ints with 1 <= precision <= 38 and
	    0 <= scale <= precision. 'value' has at most 'scale' digits after the decimal mark
	    and at most 'precision - scale' significant digits before it. The function does
	    not always return completely random values, we try to bias it to select more
	    interesting values.'''

	    def random_precision():
	      return random.randint(1, 38)

	    def extreme_precision():
	      return 38

	    precision = self.weighted_choice({random_precision: 0.8, extreme_precision: 0.2})()

	    def random_scale():
	      return random.randint(0, precision)

	    def extreme_scale():
	      return random.choice([0, precision])

	    scale = self.weighted_choice({random_scale: 0.9, extreme_scale: 0.1})()

	    def num_digits_random():
	      return random.randint(1, precision)

	    def num_digits_all():
	      return precision

	    def choose_num_digits():
	      '''Determines how many digits a value is going to have: usually a random count,
	      sometimes as many as the precision allows.'''
	      # Shared by random_value() and special_case_decimal_value() so that both use the
	      # same 0.8 / 0.2 split.
	      return self.weighted_choice({num_digits_random: 0.8, num_digits_all: 0.2})()

	    def random_value():
	      '''Generates a completely random value.'''
	      num_digits = choose_num_digits()
	      # The leading digit is never zero, so the value has exactly 'num_digits' digits.
	      digits = [random.choice('123456789')]
	      digits.extend(random.choice('0123456789') for _ in range(num_digits - 1))
	      return ''.join(digits)

	    def special_case_binary_value():
	      '''Generates a value that looks like 11111... or 10000... in binary number
	      system.'''
	      # Largest exponent for which 2 ** exponent still fits in 'precision' digits.
	      max_exponent = int(precision * math.log(10, 2))

	      def exponent_random():
	        return random.randint(0, max_exponent)

	      def exponent_max():
	        return max_exponent

	      exponent = self.weighted_choice({exponent_random: 0.8, exponent_max: 0.2})()

	      value = 2 ** exponent
	      if random.random() < 0.5:
	        value -= 1
	      return str(value)

	    def special_case_decimal_value():
	      '''Generates a value that looks like 99999... or 10000... in decimal number
	      system.'''
	      num_digits = choose_num_digits()
	      value = 10 ** num_digits

	      # 10 ** precision needs precision + 1 digits, so in that case the value is always
	      # reduced to 99999...
	      if num_digits == precision or random.random() < 0.5:
	        value -= 1

	      return str(value)

	    value_weights = {}
	    value_weights[random_value] = 0.6
	    value_weights[special_case_binary_value] = 0.2
	    value_weights[special_case_decimal_value] = 0.2
	    value = self.weighted_choice(value_weights)()

	    # Randomly determine the placement of the decimal mark.
	    # The smallest index where the decimal mark can be placed in the number string.
	    min_dot_location = max(len(value) - scale, 0)
	    # The largest index where the decimal mark can be placed in the number string.
	    max_dot_location = min(precision - scale, len(value))
	    dot_location = random.randint(min_dot_location, max_dot_location)

	    if dot_location == 0:
	      value = '0.' + value
	    elif dot_location < len(value):
	      value = value[:dot_location] + '.' + value[dot_location:]
	    # Otherwise the mark would go after the last digit and the value stays an integer.

	    if random.random() < 0.5:
	      # Negate the number.
	      value = '-' + value
	    return (value, precision, scale)

	  def result_equals(self, expected, actual):
	    '''Verify that the expected result is equal to the actual result. We verify
	    equality by rounding the expected result to different numbers of places and
	    verifying that the actual result is matched in at least one of the cases.

	    'expected' is the decimal.Decimal computed in Python and 'actual' is the one
	    obtained from the query; either may be None when no value could be obtained.
	    Returns a bool.'''
	    if actual == expected:
	      return True

	    if expected is None:
	      # There is no reference value to round, so an actual value can never match.
	      return False

	    if actual is None:
	      # Overflow
	      # If the expected result is larger than 10^32 - 1, it's not unreasonable for
	      # there to be an overflow in Impala because the minimum scale is 6 and
	      # 38 (max precision) - 6 = 32.
	      return abs(expected) > decimal.Decimal("9" * 32)

	    with decimal.localcontext() as ctx:
	      # quantize() raises InvalidOperation when the rounded value needs more digits
	      # than the context precision. Leave room for every integer digit of 'expected',
	      # 38 digits after the dot and one carry digit.
	      ctx.prec = max(ctx.prec, max(expected.adjusted(), 0) + 40)
	      for num_digits_after_dot in range(39):
	        # Reduce the number of digits after the dot in the expected_result to
	        # different amounts. If it matches the actual result in at least one of the
	        # cases, we consider the actual result to be acceptable.
	        truncated_expected = expected.quantize(
	            decimal.Decimal("1e-{0}".format(num_digits_after_dot)),
	            rounding=decimal.ROUND_HALF_UP)
	        if actual == truncated_expected:
	          return True
	    return False

	  def execute_one_decimal_op(self):
	    '''Executes a single query and compares the result to a result that we computed in
	    Python.'''
	    op = random.choice(['+', '-', '*', '/', '%'])
	    value1, precision1, scale1 = self.get_decimal()
	    value2, precision2, scale2 = self.get_decimal()

	    query = ('select cast({value1} as decimal({precision1},{scale1})) {op} '
	        'cast({value2} as decimal({precision2},{scale2}))').format(op=op,
	        value1=value1, precision1=precision1, scale1=scale1,
	        value2=value2, precision2=precision2, scale2=scale2)

	    # A failing query is recorded as "no result" rather than failing right away.
	    try:
	      result = self.execute_scalar(query, query_options={'decimal_v2': 'true'})
	    except ImpalaBeeswaxException:
	      result = None
	    if result is not None:
	      result = decimal.Decimal(result)

	    with decimal.localcontext() as ctx:
	      # Set the decimal context to a large precision initially, so that the
	      # mathematical operations are performed at a high precision.
	      ctx.prec = 80

	      try:
	        if op == '+':
	          expected_result = decimal.Decimal(value1) + decimal.Decimal(value2)
	        elif op == '-':
	          expected_result = decimal.Decimal(value1) - decimal.Decimal(value2)
	        elif op == '*':
	          expected_result = decimal.Decimal(value1) * decimal.Decimal(value2)
	        elif op == '/':
	          expected_result = decimal.Decimal(value1) / decimal.Decimal(value2)
	        elif op == '%':
	          expected_result = decimal.Decimal(value1) % decimal.Decimal(value2)
	        else:
	          assert False
	      except (decimal.InvalidOperation, decimal.DivisionByZero):
	        # Python cannot produce a value for this expression.
	        expected_result = None
	      assert self.result_equals(expected_result, result)

	  def test_decimal_ops(self, vector):
	    '''Runs 'self.iterations' randomly generated decimal operations. 'vector' is not
	    used by this test.'''
	    # range() instead of xrange() so that the loop also runs under Python 3.
	    for _ in range(self.iterations):
	      self.execute_one_decimal_op()

	  def width_bucket(self, val, min_range, max_range, num_buckets):
	    '''Computes in Python the bucket number expected from width_bucket().

	    'val', 'min_range' and 'max_range' are decimal literals given as strings and
	    'num_buckets' is an int. Returns None if 'min_range' >= 'max_range', 0 if 'val' is
	    below 'min_range', 'num_buckets' + 1 if 'val' is at or above 'max_range', and
	    otherwise the 1-based index of the equal-width bucket that contains 'val'.'''
	    # Multiplying the values by 10**40 guarantees that the numbers can be converted
	    # to int without losing information.
	    val_int = int(decimal.Decimal(val) * 10**40)
	    min_range_int = int(decimal.Decimal(min_range) * 10**40)
	    max_range_int = int(decimal.Decimal(max_range) * 10**40)

	    if min_range_int >= max_range_int:
	      return None
	    if val_int < min_range_int:
	      return 0
	    if val_int > max_range_int:
	      return num_buckets + 1

	    range_size = max_range_int - min_range_int
	    dist_from_min = val_int - min_range_int
	    # Floor division keeps the arithmetic in exact integers. A plain '/' would produce
	    # a rounded float on Python 3.
	    return (num_buckets * dist_from_min) // range_size + 1

	  def execute_one_width_bucket(self):
	    '''Runs width_bucket() on randomly generated decimals and asserts that the query
	    result equals the bucket number computed by self.width_bucket().

	    Nothing is executed when self.width_bucket() returns None. A query error that asks
	    for the arguments to be wrapped in a CAST is ignored; any other error is
	    re-raised.'''
	    val, val_prec, val_scale = self.get_decimal()
	    min_range, min_range_prec, min_range_scale = self.get_decimal()
	    max_range, max_range_prec, max_range_scale = self.get_decimal()
	    num_buckets = random.randint(1, 2147483647)

	    expected_result = self.width_bucket(val, min_range, max_range, num_buckets)
	    # Compare against None explicitly: 0 is a legitimate expected bucket (the value is
	    # below the range) and must still be verified.
	    if expected_result is None:
	      return

	    query = ('select width_bucket('
	        'cast({val} as decimal({val_prec},{val_scale})), '
	        'cast({min_range} as decimal({min_range_prec},{min_range_scale})), '
	        'cast({max_range} as decimal({max_range_prec},{max_range_scale})), '
	        '{num_buckets})')

	    query = query.format(val=val, val_prec=val_prec, val_scale=val_scale,
	        min_range=min_range, min_range_prec=min_range_prec,
	        min_range_scale=min_range_scale,
	        max_range=max_range, max_range_prec=max_range_prec,
	        max_range_scale=max_range_scale,
	        num_buckets=num_buckets)

	    try:
	      result = self.execute_scalar(query, query_options={'decimal_v2': 'true'})
	    except ImpalaBeeswaxException as e:
	      if "You need to wrap the arguments in a CAST" not in str(e):
	        # A bare raise keeps the original traceback.
	        raise
	      # Sometimes the decimal inputs are incompatible with each other, so it's ok
	      # to ignore this error.
	      return
	    assert int(result) == expected_result

	  def test_width_bucket(self, vector):
	    '''Runs 'self.iterations' randomly generated width_bucket() checks. 'vector' is
	    not used by this test.'''
	    # range() instead of xrange() so that the loop also runs under Python 3.
	    for _ in range(self.iterations):
	      self.execute_one_width_bucket()