tests/query_test/test_exprs.py - impala - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 import pytest
 import re
 from random import randint

 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.test_dimensions import create_exec_option_dimension
 from tests.common.test_dimensions import create_uncompressed_text_dimension
 from tests.common.test_vector import ImpalaTestDimension
 from tests.util.test_file_parser import QueryTestSectionReader

 class TestExprs(ImpalaTestSuite):
   """Runs the QueryTest/exprs and QueryTest/special-strings test files, both with and
      without expr rewrites."""

   @classmethod
   def get_workload(cls):
     return 'functional-query'

   @classmethod
   def add_test_dimensions(cls):
     super(TestExprs, cls).add_test_dimensions()
     # Test with and without expr rewrites to cover regular expr evaluations
     # as well as constant folding, in particular, timestamp literals.
     cls.ImpalaTestMatrix.add_dimension(
         ImpalaTestDimension('enable_expr_rewrites', 0, 1))
     if cls.exploration_strategy() == 'core':
       # Test with file format that supports codegen
       cls.ImpalaTestMatrix.add_constraint(lambda v:
           v.get_value('table_format').file_format == 'text' and
           v.get_value('table_format').compression_codec == 'none')

   def test_exprs(self, vector):
     """Run QueryTest/exprs and then check that the current database is the one that
        matches the vector's table format."""
     # Copy the 'enable_expr_rewrites' dimension value into the vector's exec options.
     vector.get_value('exec_option')['enable_expr_rewrites'] = \
         vector.get_value('enable_expr_rewrites')
     table_format = vector.get_value('table_format')
     if table_format.file_format == 'avro':
       # TODO: Enable some of these tests for Avro if possible
       # Don't attempt to evaluate timestamp expressions with Avro tables (which don't
       # support a timestamp type).
       pytest.skip("Avro tables don't support the timestamp type used by these exprs")
     if table_format.file_format == 'hbase':
       pytest.xfail("A lot of queries check for NULLs, which hbase does not recognize")
     if table_format.file_format == 'kudu':
       # Can't load LikeTbl without KUDU-1570.
       pytest.xfail("Need support for Kudu tables with nullable PKs (KUDU-1570)")
     self.run_test_case('QueryTest/exprs', vector)

     # This will change the current database to matching table format and then execute
     # select current_database(). An error will be thrown if multiple values are returned.
     current_db = self.execute_scalar('select current_database()', vector=vector)
     assert current_db == QueryTestSectionReader.get_db_name(table_format)

   def test_special_strings(self, vector):
     """Test handling of expressions with "special" strings."""
     # Copy the 'enable_expr_rewrites' dimension value into the vector's exec options.
     vector.get_value('exec_option')['enable_expr_rewrites'] = \
         vector.get_value('enable_expr_rewrites')
     self.run_test_case('QueryTest/special-strings', vector)

 # Tests very deep expression trees and expressions with many children. Impala defines
 # a 'safe' upper bound on the expr depth and the number of expr children in the
 # FE Expr.java and any changes to those limits should be reflected in this test.
 # The expr limits primarily guard against stack overflows or similar problems
 # causing crashes. Therefore, this test succeeds if no Impalads crash.
 class TestExprLimits(ImpalaTestSuite):
   """Runs queries sized around the expr child/depth limits, the maximum statement
      length and the statement expression limit."""
   # Keep these in sync with Expr.java
   EXPR_CHILDREN_LIMIT = 10000
   EXPR_DEPTH_LIMIT = 1000

   # Default maximum statement length (16MB) that test_max_statement_size exceeds and
   # test_statement_expression_limit stays below.
   MAX_STATEMENT_LENGTH_BYTES = 16 * 1024 * 1024
   # Limit value that test_statement_expression_limit expects in the error message.
   STATEMENT_EXPRESSION_LIMIT = 250000

   @classmethod
   def get_workload(cls):
     return 'functional-query'

   @classmethod
   def add_test_dimensions(cls):
     super(TestExprLimits, cls).add_test_dimensions()
     if cls.exploration_strategy() != 'exhaustive':
       # Ensure the test runs with codegen enabled and disabled, even when the
       # exploration strategy is not exhaustive.
       cls.ImpalaTestMatrix.clear_dimension('exec_option')
       cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
           cluster_sizes=[0], disable_codegen_options=[False, True], batch_sizes=[0]))

     # There is no reason to run these tests using all dimensions.
     cls.ImpalaTestMatrix.add_dimension(
         create_uncompressed_text_dimension(cls.get_workload()))

   def test_expr_child_limit(self, vector):
     """Run an IN predicate and a CASE expr whose number of children is derived from
        EXPR_CHILDREN_LIMIT and verify that both execute."""
     # IN predicate listing the literals 0 .. EXPR_CHILDREN_LIMIT - 2.
     in_list = ",".join(str(i) for i in range(self.EXPR_CHILDREN_LIMIT - 1))
     self.__exec_query("select 1 IN(" + in_list + ")")

     # CASE expr with EXPR_CHILDREN_LIMIT / 2 when/then branches. '//' keeps the
     # repeat count an integer regardless of the division semantics in effect.
     when_clauses = " when true then 1" * (self.EXPR_CHILDREN_LIMIT // 2)
     self.__exec_query("select case " + when_clauses + " end")

   def test_expr_depth_limit(self, vector):
     """Run deeply nested exprs sized from EXPR_DEPTH_LIMIT and verify that they
        execute."""
     # Compound predicates
     and_query = "select " + self.__gen_deep_infix_expr("true", " and false")
     self.__exec_query(and_query)
     or_query = "select " + self.__gen_deep_infix_expr("true", " or false")
     self.__exec_query(or_query)

     # Arithmetic expr
     arith_query = "select " + self.__gen_deep_infix_expr("1", " + 1")
     self.__exec_query(arith_query)

     # Nested function calls
     func_query = "select " + self.__gen_deep_func_expr("lower(", "'abc'", ")")
     self.__exec_query(func_query)

     # Casts.
     cast_query = "select " + self.__gen_deep_func_expr("cast(", "1", " as int)")
     self.__exec_query(cast_query)

   def test_under_statement_expression_limit(self):
     """Generate a huge case statement that barely fits within the statement expression
        limit and verify that it runs."""
     # This takes 20+ minutes, so only run it on exhaustive.
     # TODO: Determine whether this needs to run serially. It uses >5 GB of memory.
     if self.exploration_strategy() != 'exhaustive':
       pytest.skip("Only test limit of codegen on exhaustive")
     case = self.__gen_huge_case("int_col", 32, 2, "  ")
     query = "select {0} as huge_case from functional_parquet.alltypes".format(case)
     self.__exec_query(query)

   def test_max_statement_size(self):
     """Generate a huge case statement that exceeds the default 16MB limit and verify
        that it gets rejected."""
     # Raw strings so that '\(' reaches the regex engine without relying on Python
     # passing unrecognised escape sequences through unchanged.
     expected_err_tmpl = (r"Statement length of {0} bytes exceeds the maximum "
         r"statement length \({1} bytes\)")
     size_16mb = self.MAX_STATEMENT_LENGTH_BYTES

     # Case 1: a valid SQL that would parse correctly
     case = self.__gen_huge_case("int_col", 75, 2, "  ")
     query = "select {0} as huge_case from functional.alltypes".format(case)
     err = self.execute_query_expect_failure(self.client, query)
     assert re.search(expected_err_tmpl.format(len(query), size_16mb), str(err))

     # Case 2: a string of 'a' characters that does not parse. This will still fail
     # with the same message, because the check is before parsing.
     invalid_sql = 'a' * (size_16mb + 1)
     err = self.execute_query_expect_failure(self.client, invalid_sql)
     assert re.search(expected_err_tmpl.format(len(invalid_sql), size_16mb), str(err))

   def test_statement_expression_limit(self):
     """Generate a huge case statement that barely fits within the 16MB limit but exceeds
        the statement expression limit. Verify that it fails."""
     case = self.__gen_huge_case("int_col", 66, 2, "  ")
     query = "select {0} as huge_case from functional.alltypes".format(case)
     assert len(query) < self.MAX_STATEMENT_LENGTH_BYTES
     # Raw strings: the regex engine itself interprets '\n' as a newline, so the
     # pattern matches the same text as the non-raw form.
     expected_err_re = (r"Exceeded the statement expression limit \({0}\)\n"
         r"Statement has .* expressions.").format(self.STATEMENT_EXPRESSION_LIMIT)
     err = self.execute_query_expect_failure(self.client, query)
     assert re.search(expected_err_re, str(err))

   def __gen_huge_case(self, col_name, fanout, depth, indent):
     """Return the text of a SQL case expr with 'fanout' when/then branches.

        Each branch condition compares an arithmetic expr over 'col_name' (built from
        random integers) with a random integer. While 'depth' is greater than 0, every
        'then' value is a parenthesised nested case expr generated with depth - 1; at
        depth 0 the 'then' value is the branch index. 'indent' is the whitespace
        written before each 'when' line and before the closing 'end'; nested levels
        get a longer indent."""
     toks = ["case\n"]
     for i in range(fanout):
       add = randint(1, 1000000)
       divisor = randint(1, 10000000)
       mod = randint(0, divisor)
       # Generate a mathematical expr that can't be easily optimised out.
       when_expr = "{0} + {1} % {2} = {3}".format(col_name, add, divisor, mod)
       if depth == 0:
         then_expr = "{0}".format(i)
       else:
         then_expr = "({0})".format(
             self.__gen_huge_case(col_name, fanout, depth - 1, indent + "  "))
       toks.append(indent)
       toks.append("when {0} then {1}\n".format(when_expr, then_expr))
     toks.append(indent)
     toks.append("end")
     return ''.join(toks)

   def __gen_deep_infix_expr(self, prefix, repeat_suffix):
     """Return 'prefix' followed by EXPR_DEPTH_LIMIT - 1 copies of 'repeat_suffix'."""
     return prefix + repeat_suffix * (self.EXPR_DEPTH_LIMIT - 1)

   def __gen_deep_func_expr(self, open_func, base_arg, close_func):
     """Return 'base_arg' wrapped in EXPR_DEPTH_LIMIT - 1 nested 'open_func' ...
        'close_func' pairs."""
     nesting = self.EXPR_DEPTH_LIMIT - 1
     return open_func * nesting + base_arg + close_func * nesting

   def __exec_query(self, sql_str):
     """Execute 'sql_str' and fail the test with an AssertionError if execution raises
        or the result does not report success."""
     try:
       impala_ret = self.execute_query(sql_str)
     except Exception as e:  # consider any exception a failure
       raise AssertionError("Failed to execute query %s: %s" % (sql_str, e))
     # Checked outside the 'try' so that this assertion is not caught and re-wrapped by
     # the handler above, which would repeat the (potentially huge) statement text.
     assert impala_ret.success, "Failed to execute query %s" % (sql_str)

 class TestUtcTimestampFunctions(ImpalaTestSuite):
   """Covers the UTC timestamp functions, i.e. those whose results do not depend on the
      --use_local_tz_for_unix_timestamp_conversions flag. Any test added here should
      also be run by the custom cluster test test_local_tz_conversion.py so that the
      same behavior is verified when the conversion flag is set to true."""

   @classmethod
   def get_workload(cls):
     return 'functional-query'

   @classmethod
   def add_test_dimensions(cls):
     super(TestUtcTimestampFunctions, cls).add_test_dimensions()
     # Run with expr rewrites off and on so that both regular expr evaluation and
     # constant folding (timestamp literals in particular) are exercised.
     rewrite_dimension = ImpalaTestDimension('enable_expr_rewrites', 0, 1)
     cls.ImpalaTestMatrix.add_dimension(rewrite_dimension)
     if cls.exploration_strategy() != 'core':
       return

     # On 'core', restrict the matrix to a file format that supports codegen.
     def is_uncompressed_text(v):
       fmt = v.get_value('table_format')
       return fmt.file_format == 'text' and fmt.compression_codec == 'none'
     cls.ImpalaTestMatrix.add_constraint(is_uncompressed_text)

   def test_utc_functions(self, vector):
     # Copy the 'enable_expr_rewrites' dimension value into the vector's exec options.
     rewrites = vector.get_value('enable_expr_rewrites')
     vector.get_value('exec_option')['enable_expr_rewrites'] = rewrites
     self.run_test_case('QueryTest/utc-timestamp-functions', vector)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	import pytest
	import re
	from random import randint

	from tests.common.impala_test_suite import ImpalaTestSuite
	from tests.common.test_dimensions import create_exec_option_dimension
	from tests.common.test_dimensions import create_uncompressed_text_dimension
	from tests.common.test_vector import ImpalaTestDimension
	from tests.util.test_file_parser import QueryTestSectionReader

	class TestExprs(ImpalaTestSuite):
	  """Runs the QueryTest/exprs and QueryTest/special-strings test files, both with and
	     without expr rewrites."""

	  @classmethod
	  def get_workload(cls):
	    return 'functional-query'

	  @classmethod
	  def add_test_dimensions(cls):
	    super(TestExprs, cls).add_test_dimensions()
	    # Test with and without expr rewrites to cover regular expr evaluations
	    # as well as constant folding, in particular, timestamp literals.
	    cls.ImpalaTestMatrix.add_dimension(
	        ImpalaTestDimension('enable_expr_rewrites', 0, 1))
	    if cls.exploration_strategy() == 'core':
	      # Test with file format that supports codegen
	      cls.ImpalaTestMatrix.add_constraint(lambda v:
	          v.get_value('table_format').file_format == 'text' and
	          v.get_value('table_format').compression_codec == 'none')

	  def test_exprs(self, vector):
	    """Run QueryTest/exprs and then check that the current database is the one that
	       matches the vector's table format."""
	    # Copy the 'enable_expr_rewrites' dimension value into the vector's exec options.
	    vector.get_value('exec_option')['enable_expr_rewrites'] = \
	        vector.get_value('enable_expr_rewrites')
	    table_format = vector.get_value('table_format')
	    if table_format.file_format == 'avro':
	      # TODO: Enable some of these tests for Avro if possible
	      # Don't attempt to evaluate timestamp expressions with Avro tables (which don't
	      # support a timestamp type).
	      pytest.skip("Avro tables don't support the timestamp type used by these exprs")
	    if table_format.file_format == 'hbase':
	      pytest.xfail("A lot of queries check for NULLs, which hbase does not recognize")
	    if table_format.file_format == 'kudu':
	      # Can't load LikeTbl without KUDU-1570.
	      pytest.xfail("Need support for Kudu tables with nullable PKs (KUDU-1570)")
	    self.run_test_case('QueryTest/exprs', vector)

	    # This will change the current database to matching table format and then execute
	    # select current_database(). An error will be thrown if multiple values are returned.
	    current_db = self.execute_scalar('select current_database()', vector=vector)
	    assert current_db == QueryTestSectionReader.get_db_name(table_format)

	  def test_special_strings(self, vector):
	    """Test handling of expressions with "special" strings."""
	    # Copy the 'enable_expr_rewrites' dimension value into the vector's exec options.
	    vector.get_value('exec_option')['enable_expr_rewrites'] = \
	        vector.get_value('enable_expr_rewrites')
	    self.run_test_case('QueryTest/special-strings', vector)

	# Tests very deep expression trees and expressions with many children. Impala defines
	# a 'safe' upper bound on the expr depth and the number of expr children in the
	# FE Expr.java and any changes to those limits should be reflected in this test.
	# The expr limits primarily guard against stack overflows or similar problems
	# causing crashes. Therefore, this test succeeds if no Impalads crash.
	class TestExprLimits(ImpalaTestSuite):
	  """Runs queries sized around the expr child/depth limits, the maximum statement
	     length and the statement expression limit."""
	  # Keep these in sync with Expr.java
	  EXPR_CHILDREN_LIMIT = 10000
	  EXPR_DEPTH_LIMIT = 1000

	  # Default maximum statement length (16MB) that test_max_statement_size exceeds and
	  # test_statement_expression_limit stays below.
	  MAX_STATEMENT_LENGTH_BYTES = 16 * 1024 * 1024
	  # Limit value that test_statement_expression_limit expects in the error message.
	  STATEMENT_EXPRESSION_LIMIT = 250000

	  @classmethod
	  def get_workload(cls):
	    return 'functional-query'

	  @classmethod
	  def add_test_dimensions(cls):
	    super(TestExprLimits, cls).add_test_dimensions()
	    if cls.exploration_strategy() != 'exhaustive':
	      # Ensure the test runs with codegen enabled and disabled, even when the
	      # exploration strategy is not exhaustive.
	      cls.ImpalaTestMatrix.clear_dimension('exec_option')
	      cls.ImpalaTestMatrix.add_dimension(create_exec_option_dimension(
	          cluster_sizes=[0], disable_codegen_options=[False, True], batch_sizes=[0]))

	    # There is no reason to run these tests using all dimensions.
	    cls.ImpalaTestMatrix.add_dimension(
	        create_uncompressed_text_dimension(cls.get_workload()))

	  def test_expr_child_limit(self, vector):
	    """Run an IN predicate and a CASE expr whose number of children is derived from
	       EXPR_CHILDREN_LIMIT and verify that both execute."""
	    # IN predicate listing the literals 0 .. EXPR_CHILDREN_LIMIT - 2.
	    in_list = ",".join(str(i) for i in range(self.EXPR_CHILDREN_LIMIT - 1))
	    self.__exec_query("select 1 IN(" + in_list + ")")

	    # CASE expr with EXPR_CHILDREN_LIMIT / 2 when/then branches. '//' keeps the
	    # repeat count an integer regardless of the division semantics in effect.
	    when_clauses = " when true then 1" * (self.EXPR_CHILDREN_LIMIT // 2)
	    self.__exec_query("select case " + when_clauses + " end")

	  def test_expr_depth_limit(self, vector):
	    """Run deeply nested exprs sized from EXPR_DEPTH_LIMIT and verify that they
	       execute."""
	    # Compound predicates
	    and_query = "select " + self.__gen_deep_infix_expr("true", " and false")
	    self.__exec_query(and_query)
	    or_query = "select " + self.__gen_deep_infix_expr("true", " or false")
	    self.__exec_query(or_query)

	    # Arithmetic expr
	    arith_query = "select " + self.__gen_deep_infix_expr("1", " + 1")
	    self.__exec_query(arith_query)

	    # Nested function calls
	    func_query = "select " + self.__gen_deep_func_expr("lower(", "'abc'", ")")
	    self.__exec_query(func_query)

	    # Casts.
	    cast_query = "select " + self.__gen_deep_func_expr("cast(", "1", " as int)")
	    self.__exec_query(cast_query)

	  def test_under_statement_expression_limit(self):
	    """Generate a huge case statement that barely fits within the statement expression
	       limit and verify that it runs."""
	    # This takes 20+ minutes, so only run it on exhaustive.
	    # TODO: Determine whether this needs to run serially. It uses >5 GB of memory.
	    if self.exploration_strategy() != 'exhaustive':
	      pytest.skip("Only test limit of codegen on exhaustive")
	    case = self.__gen_huge_case("int_col", 32, 2, " ")
	    query = "select {0} as huge_case from functional_parquet.alltypes".format(case)
	    self.__exec_query(query)

	  def test_max_statement_size(self):
	    """Generate a huge case statement that exceeds the default 16MB limit and verify
	       that it gets rejected."""
	    # Raw strings so that '\(' reaches the regex engine without relying on Python
	    # passing unrecognised escape sequences through unchanged.
	    expected_err_tmpl = (r"Statement length of {0} bytes exceeds the maximum "
	        r"statement length \({1} bytes\)")
	    size_16mb = self.MAX_STATEMENT_LENGTH_BYTES

	    # Case 1: a valid SQL that would parse correctly
	    case = self.__gen_huge_case("int_col", 75, 2, " ")
	    query = "select {0} as huge_case from functional.alltypes".format(case)
	    err = self.execute_query_expect_failure(self.client, query)
	    assert re.search(expected_err_tmpl.format(len(query), size_16mb), str(err))

	    # Case 2: a string of 'a' characters that does not parse. This will still fail
	    # with the same message, because the check is before parsing.
	    invalid_sql = 'a' * (size_16mb + 1)
	    err = self.execute_query_expect_failure(self.client, invalid_sql)
	    assert re.search(expected_err_tmpl.format(len(invalid_sql), size_16mb), str(err))

	  def test_statement_expression_limit(self):
	    """Generate a huge case statement that barely fits within the 16MB limit but exceeds
	       the statement expression limit. Verify that it fails."""
	    case = self.__gen_huge_case("int_col", 66, 2, " ")
	    query = "select {0} as huge_case from functional.alltypes".format(case)
	    assert len(query) < self.MAX_STATEMENT_LENGTH_BYTES
	    # Raw strings: the regex engine itself interprets '\n' as a newline, so the
	    # pattern matches the same text as the non-raw form.
	    expected_err_re = (r"Exceeded the statement expression limit \({0}\)\n"
	        r"Statement has .* expressions.").format(self.STATEMENT_EXPRESSION_LIMIT)
	    err = self.execute_query_expect_failure(self.client, query)
	    assert re.search(expected_err_re, str(err))

	  def __gen_huge_case(self, col_name, fanout, depth, indent):
	    """Return the text of a SQL case expr with 'fanout' when/then branches.

	       Each branch condition compares an arithmetic expr over 'col_name' (built from
	       random integers) with a random integer. While 'depth' is greater than 0, every
	       'then' value is a parenthesised nested case expr generated with depth - 1; at
	       depth 0 the 'then' value is the branch index. 'indent' is the whitespace
	       written before each 'when' line and before the closing 'end'; nested levels
	       get a longer indent."""
	    toks = ["case\n"]
	    for i in range(fanout):
	      add = randint(1, 1000000)
	      divisor = randint(1, 10000000)
	      mod = randint(0, divisor)
	      # Generate a mathematical expr that can't be easily optimised out.
	      when_expr = "{0} + {1} % {2} = {3}".format(col_name, add, divisor, mod)
	      if depth == 0:
	        then_expr = "{0}".format(i)
	      else:
	        then_expr = "({0})".format(
	            self.__gen_huge_case(col_name, fanout, depth - 1, indent + " "))
	      toks.append(indent)
	      toks.append("when {0} then {1}\n".format(when_expr, then_expr))
	    toks.append(indent)
	    toks.append("end")
	    return ''.join(toks)

	  def __gen_deep_infix_expr(self, prefix, repeat_suffix):
	    """Return 'prefix' followed by EXPR_DEPTH_LIMIT - 1 copies of 'repeat_suffix'."""
	    return prefix + repeat_suffix * (self.EXPR_DEPTH_LIMIT - 1)

	  def __gen_deep_func_expr(self, open_func, base_arg, close_func):
	    """Return 'base_arg' wrapped in EXPR_DEPTH_LIMIT - 1 nested 'open_func' ...
	       'close_func' pairs."""
	    nesting = self.EXPR_DEPTH_LIMIT - 1
	    return open_func * nesting + base_arg + close_func * nesting

	  def __exec_query(self, sql_str):
	    """Execute 'sql_str' and fail the test with an AssertionError if execution raises
	       or the result does not report success."""
	    try:
	      impala_ret = self.execute_query(sql_str)
	    except Exception as e:  # consider any exception a failure
	      raise AssertionError("Failed to execute query %s: %s" % (sql_str, e))
	    # Checked outside the 'try' so that this assertion is not caught and re-wrapped by
	    # the handler above, which would repeat the (potentially huge) statement text.
	    assert impala_ret.success, "Failed to execute query %s" % (sql_str)

	class TestUtcTimestampFunctions(ImpalaTestSuite):
	  """Covers the UTC timestamp functions, i.e. those whose results do not depend on the
	     --use_local_tz_for_unix_timestamp_conversions flag. Any test added here should
	     also be run by the custom cluster test test_local_tz_conversion.py so that the
	     same behavior is verified when the conversion flag is set to true."""

	  @classmethod
	  def get_workload(cls):
	    return 'functional-query'

	  @classmethod
	  def add_test_dimensions(cls):
	    super(TestUtcTimestampFunctions, cls).add_test_dimensions()
	    # Run with expr rewrites off and on so that both regular expr evaluation and
	    # constant folding (timestamp literals in particular) are exercised.
	    rewrite_dimension = ImpalaTestDimension('enable_expr_rewrites', 0, 1)
	    cls.ImpalaTestMatrix.add_dimension(rewrite_dimension)
	    if cls.exploration_strategy() != 'core':
	      return

	    # On 'core', restrict the matrix to a file format that supports codegen.
	    def is_uncompressed_text(v):
	      fmt = v.get_value('table_format')
	      return fmt.file_format == 'text' and fmt.compression_codec == 'none'
	    cls.ImpalaTestMatrix.add_constraint(is_uncompressed_text)

	  def test_utc_functions(self, vector):
	    # Copy the 'enable_expr_rewrites' dimension value into the vector's exec options.
	    rewrites = vector.get_value('enable_expr_rewrites')
	    vector.get_value('exec_option')['enable_expr_rewrites'] = rewrites
	    self.run_test_case('QueryTest/utc-timestamp-functions', vector)