tests/custom_cluster/test_codegen_cache.py - impala - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 from __future__ import absolute_import, division, print_function
 from builtins import range
 import pytest
 from copy import copy
 from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
 from tests.common.skip import SkipIf, SkipIfNotHdfsMinicluster
 from tests.common.test_result_verifier import assert_codegen_cache_hit
 from tests.util.filesystem_utils import get_fs_path


 @SkipIf.not_hdfs
 @SkipIfNotHdfsMinicluster.scheduling
 class TestCodegenCache(CustomClusterTestSuite):
   """ This test enables the codegen cache and verfies that cache hit and miss counts
   in the runtime profile and metrics are as expected.
   """
   @classmethod
   def setup_class(cls):
     if cls.exploration_strategy() != 'exhaustive':
       pytest.skip('runs only in exhaustive')
     super(TestCodegenCache, cls).setup_class()

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache(self, vector):
     self._test_codegen_cache(vector,
             ("select * from (select * from functional.alltypes "
              + "limit 1000000) t1 where int_col > 10 limit 10"))

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache_int_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where int_col > 0")

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache_tinyint_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where tinyint_col > 0")

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache_bool_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where bool_col > 0")

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache_bigint_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where bigint_col > 0")

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache_float_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where float_col > 0")

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache_double_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where double_col > 0")

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache_date_string_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where date_string_col != ''")

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache_string_col(self, vector):
     self._test_codegen_cache(vector,
       "select * from functional.alltypes where string_col != ''")

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache_poly_func_string_col(self, vector):
     self._test_codegen_cache(vector,
       ("select * from functional.alltypes where "
       + "CHAR_LENGTH(string_col) > 0"))

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   def test_codegen_cache_poly_func_date_string_col(self, vector):
     self._test_codegen_cache(vector,
       ("select * from functional.alltypes where "
       + "CHAR_LENGTH(date_string_col) > 0"))

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   # Test native uda is missed in the codegen cache, as it is disabled.
   def test_codegen_cache_uda_miss(self, vector):
     database = "test_codegen_cache_uda_miss"
     self._load_functions(database)
     self._test_codegen_cache(vector,
       "select test_count(int_col) from functional.alltypestiny", False)

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
   # Test native udf is missed in the codegen cache, as it is disabled.
   def test_codegen_cache_udf_miss(self, vector):
     database = "test_codegen_cache_udf_miss"
     self._load_functions(database)
     self._test_codegen_cache(vector,
       "select sum(identity(bigint_col)) from functional.alltypes", False)

   SYMBOL_EMITTER_TESTS_IMPALAD_ARGS = "--cache_force_single_shard=1 \
       --codegen_symbol_emitter_log_successful_destruction_test_only=1 \
       --codegen_cache_entry_bytes_charge_overhead=10000000 --codegen_cache_capacity=25MB "

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args=SYMBOL_EMITTER_TESTS_IMPALAD_ARGS + "--asm_module_dir=/dev/null",
           disable_log_buffering=True)
   # Regression test for IMPALA-12260.
   def test_codegen_cache_with_asm_module_dir(self, vector):
     self._test_codegen_cache_with_symbol_emitter(vector)

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args=SYMBOL_EMITTER_TESTS_IMPALAD_ARGS + "--perf_map",
           disable_log_buffering=True)
   # Regression test for IMPALA-12260.
   def test_codegen_cache_with_perf_map(self, vector):
     self._test_codegen_cache_with_symbol_emitter(vector)

   def _test_codegen_cache_with_symbol_emitter(self, vector):
     """Regression test for IMPALA-12260. In the test we run two queries. The first query
     produces two entries in the cache, and they both have a 'CodegenSymbolEmitter' as
     event listeners because of the '--asm_module_dir' or '--perf_map' startup flag. The
     second query inserts new entries in the cache - the size of the cache should be such
     that both entries from the first query fit in it but both are evicted during the
     second query.

     When an 'llvm::ExecutionEngine', which is part of the cache entry, is destroyed, it
     frees any remaining object files and notifies the listeners about this, so the
     listeners should be alive at this time. Prior to IMPALA-12260 the
     'CodegenSymbolEmitter's of the cached fragment instances were destroyed at the end of
     the first query, causing a use-after-free (sometimes leading to a crash) during the
     second one.

     The choice of the size of the cache is based on the following:
       - the first query imposes a lower bound on the cache size (both cache entries should
         fit in the cache) AND
       - the second query imposes an upper bound (the cache entries of the first query
         should be evicted during the second query).
     The acceptable values are in the intersection of these two intervals.
     However, code changes and the difference between debug and release builds
     can have a huge effect on the acceptable range. To get around this, we use
     the '--codegen_cache_entry_bytes_charge_overhead' startup flag to
     artificially assign a higher size to the cache entries, compared to which
     the real size, and therefore also changes in the real size, are
     insignificant.

     This test verifies that the use-after-free scenario doesn't happen. We can't rely on
     the crash to detect it because
     1) the crash is not guaranteed to happen, use-after-free is undefined behaviour
     2) the crash may happen well after the query has finished returning results.

     Therefore in 'CodegenSymbolEmitter' we count how many object files have been emitted
     and freed. If the difference is greater than zero at the time of the destruction of
     the 'CodegenSymbolEmitter', the LLVM execution engine to which the symbol emitter is
     subscribed is still alive and will attempt to notify the symbol emitter when it will
     have already been destroyed, leading to use-after-free.

     When the --codegen_symbol_emitter_log_successful_destruction_test_only flag is set to
     true, 'CodegenSymbolEmitter' will log a message when it is being destroyed correctly
     (i.e. when use-after-free will not happen). If we don't have the expected message in
     the logs (after some timeout), the test fails.

     After IMPALA-11805, codegen caching is no longer using the 'llvm::ExecutionEngine',
     instead we use 'CodeGenObjectCache'. While 'CodeGenObjectCache' doesn't impact the
     lifecycle of 'CodegenSymbolEmitter's, the testcase in this context still verifies
     the correct usage of 'CodegenSymbolEmitter's."""

     exec_options = copy(vector.get_value('exec_option'))
     exec_options['exec_single_node_rows_threshold'] = 0

     q1 = """select int_col from functional_parquet.alltypessmall
         order by int_col desc limit 20"""
     q2 = """select t1.bool_col, t1.year, t1.month
          from functional_parquet.alltypes t1
          inner join functional_parquet.alltypessmall t2 on t1.year = t2.year
          group by t1.id, t1.bool_col, t1.smallint_col, t1.bigint_col, t1.float_col,
              t1.double_col, t1.date_string_col, t1.string_col, t1.timestamp_col, t1.year,
              t1.month
          order by t1.id, t1.bool_col, t1.smallint_col, t1.bigint_col, t1.float_col,
              t1.double_col, t1.date_string_col, t1.string_col, t1.timestamp_col, t1.year,
              t1.month"""

     self._check_metric_expect_init()

     symbol_emitter_ok_msg = "Successful destruction of CodegenSymbolEmitter object."

     # ## First query
     self.execute_query_expect_success(self.client, q1, exec_options)
     cache_entries_in_use = self.get_metric('impala.codegen-cache.entries-in-use')
     cache_entries_evicted = self.get_metric('impala.codegen-cache.entries-evicted')
     # Query 1 contains 2 fragments.
     fragments_ran = 2
     assert cache_entries_in_use > 0
     assert self.get_metric('impala.codegen-cache.hits') == 0
     # Initialising the cross-compiled modules also consumes an LLVM executor engine.
     expected_num_msg = fragments_ran + 1
     self.assert_impalad_log_contains("INFO", symbol_emitter_ok_msg, expected_num_msg)

     # ## Second query
     self.execute_query_expect_success(self.client, q2, exec_options)
     assert self.get_metric('impala.codegen-cache.hits') == 0
     # Query 2 contains 4 fragments.
     fragments_ran = fragments_ran + 4
     cache_entries_evicted = self.get_metric('impala.codegen-cache.entries-evicted')
     assert cache_entries_evicted >= cache_entries_in_use
     # Initialising the cross-compiled modules also consumes an LLVM executor engine.
     expected_num_msg = fragments_ran + 1
     self.assert_impalad_log_contains("INFO", symbol_emitter_ok_msg, expected_num_msg)

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB")
   # Regression test for IMPALA-12269. The first query uses one of the codegen'd functions
   # in two objects, so it is added to be jitted twice. For the second query it is added
   # only once. The hash of the function names should be the same in both cases.
   def test_codegen_cache_with_duplicate_fn_names(self, vector):
     exec_options = copy(vector.get_value('exec_option'))
     exec_options['exec_single_node_rows_threshold'] = 0

     q1 = """select int_col, tinyint_col from functional_parquet.alltypessmall
         order by int_col desc limit 20"""
     q2 = """select tinyint_col from functional_parquet.alltypessmall
         order by int_col desc limit 20"""

     self._check_metric_expect_init()
     self.execute_query_expect_success(self.client, q1, exec_options)
     assert self.get_metric('impala.codegen-cache.entries-evicted') == 0

     self.execute_query_expect_success(self.client, q2, exec_options)
     # If the function name hashes of the first and the second query didn't match, there
     # would be no cache hit and the cache entry from the first query would be evicted
     # because the llvm modules of the two queries, hence the cache keys, are identical.
     assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
     assert self.get_metric('impala.codegen-cache.hits') == 1
     # Expect two misses for the two fragments of the first query and one for one of the
     # fragments of the second query.
     assert self.get_metric('impala.codegen-cache.misses') == 3

   def _check_metric_expect_init(self):
     # Verifies that the cache metrics are all zero.
     assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
     assert self.get_metric('impala.codegen-cache.entries-in-use') == 0
     assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') == 0
     assert self.get_metric('impala.codegen-cache.hits') == 0
     assert self.get_metric('impala.codegen-cache.misses') == 0

   def _test_codegen_cache(self, vector, sql, expect_hit=True, expect_num_frag=2):
     # Do not disable codegen.
     exec_options = copy(vector.get_value('exec_option'))
     exec_options['exec_single_node_rows_threshold'] = 0
     self._check_metric_expect_init()
     result = self.execute_query(sql, exec_options)
     assert_codegen_cache_hit(result.runtime_profile, False)
     # expect_num_cache_miss_fragment is 1 iff expect_hit is False, and expect only
     # one fragment codegen cache missing for the case if expect_hit is False.
     expect_num_cache_miss_fragment = 1
     if expect_hit:
         expect_num_cache_miss_fragment = 0
     expect_num_cache_hit = expect_num_frag - expect_num_cache_miss_fragment

     # Verifies that the cache misses > 0, because the look up fails in an empty
     # brandnew cache, then a new entry should be stored successfully, so the in-use
     # entry number and bytes should be larger than 0.
     assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
     assert self.get_metric('impala.codegen-cache.entries-in-use') == expect_num_cache_hit
     assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0
     assert self.get_metric('impala.codegen-cache.hits') == 0
     assert self.get_metric('impala.codegen-cache.misses') == expect_num_cache_hit

     result = self.execute_query(sql, exec_options)
     # Verify again, the expected cache hit should be reflected.
     if expect_hit:
       assert_codegen_cache_hit(result.runtime_profile, True)
     else:
       assert_codegen_cache_hit(result.runtime_profile, False)
     assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
     assert self.get_metric('impala.codegen-cache.entries-in-use') == expect_num_cache_hit
     assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0
     assert self.get_metric('impala.codegen-cache.hits') == expect_num_cache_hit
     assert self.get_metric('impala.codegen-cache.misses') == expect_num_cache_hit

   def _load_functions(self, database):
     create_func_template = """
     use default;
     drop database if exists {database} CASCADE;
     create database {database};
     create aggregate function {database}.test_count(int) returns bigint
     location '{location_uda}' update_fn='CountUpdate';
     create function {database}.identity(boolean) returns boolean
     location '{location_udf}' symbol='Identity';
     create function {database}.identity(bigint) returns bigint
     location '{location_udf}' symbol='Identity';
     use {database};
     """
     location_uda = get_fs_path('/test-warehouse/libudasample.so')
     location_udf = get_fs_path('/test-warehouse/libTestUdfs.so')
     queries = create_func_template.format(database=database,
             location_uda=location_uda, location_udf=location_udf)
     queries = [q for q in queries.split(';') if q.strip()]
     for query in queries:
       if query.strip() == '': continue
       result = self.execute_query_expect_success(self.client, query)
       assert result is not None

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=3,
           impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_udf_crash(self, vector):
       # The testcase would crash if we don't disable the native udf for codegen cache.
       database = "test_codegen_cache_udf_crash"
       self._load_functions(database)
       self.run_test_case('QueryTest/codegen-cache-udf', vector, use_db=database)
       # Even the udf is disabled and the queries are using the udf, there could be
       # other fragments stored to the codegen cache, so we check whether the codegen
       # cache is enabled to other cases.
       assert self.get_metric('impala.codegen-cache.entries-in-use') > 0
       assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0

       # Run multiple times, recreate the udfs, would crash if the udf is reused from
       # the codegen cache.
       for i in range(3):
         # Make the database different
         database = database + "diff"
         self._load_functions(database)
         self.run_test_case('QueryTest/codegen-cache-udf', vector, use_db=database)

   def _test_codegen_cache_timezone_crash_helper(self, database):
     create_db_template = """
     use default;
     drop database if exists {database} CASCADE;
     create database {database};
     create table {database}.alltimezones as select * from functional.alltimezones;
     use {database};
     """
     queries = create_db_template.format(database=database)
     queries = [q for q in queries.split(';') if q.strip()]
     query = "select timezone, utctime, localtime,\
             from_utc_timestamp(utctime,timezone) as\
             impalaresult from alltimezones where\
             localtime != from_utc_timestamp(utctime,timezone)"
     queries.append(query)
     for query in queries:
       if query.strip() == '': continue
       result = self.execute_query_expect_success(self.client, query)
       assert result is not None

   @pytest.mark.execute_serially
   @CustomClusterTestSuite.with_args(cluster_size=1,
           impalad_args="--codegen_cache_capacity=1GB")
   def test_codegen_cache_timezone_crash(self, vector):
       # The testcase tests whether it would crash using the broken builtin function
       # from_utc_timestamp from the codegen cache.
       database = "test_codegen_cache_timezone_crash"
       # Run multiple times, recreate the database each time. Except for the first run,
       # other runs should all hit the cache.
       # Expect won't crash.
       for i in range(5):
         # Make the database different
         self._test_codegen_cache_timezone_crash_helper(database + str(i))
         # During the table creation, there will be one fragment involved, for the
         # query we are going to test, will be two fragments, so totally three
         # fragments involved, should all be cached.
         assert self.get_metric('impala.codegen-cache.entries-in-use') == 3
         assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0
         assert self.get_metric('impala.codegen-cache.hits') == i * 3
         assert self.get_metric('impala.codegen-cache.misses') == 3
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	from __future__ import absolute_import, division, print_function
	from builtins import range
	import pytest
	from copy import copy
	from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
	from tests.common.skip import SkipIf, SkipIfNotHdfsMinicluster
	from tests.common.test_result_verifier import assert_codegen_cache_hit
	from tests.util.filesystem_utils import get_fs_path


	@SkipIf.not_hdfs
	@SkipIfNotHdfsMinicluster.scheduling
	class TestCodegenCache(CustomClusterTestSuite):
	""" This test enables the codegen cache and verfies that cache hit and miss counts
	in the runtime profile and metrics are as expected.
	"""
	@classmethod
	def setup_class(cls):
	if cls.exploration_strategy() != 'exhaustive':
	pytest.skip('runs only in exhaustive')
	super(TestCodegenCache, cls).setup_class()

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache(self, vector):
	self._test_codegen_cache(vector,
	("select * from (select * from functional.alltypes "
	+ "limit 1000000) t1 where int_col > 10 limit 10"))

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache_int_col(self, vector):
	self._test_codegen_cache(vector,
	"select * from functional.alltypes where int_col > 0")

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache_tinyint_col(self, vector):
	self._test_codegen_cache(vector,
	"select * from functional.alltypes where tinyint_col > 0")

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache_bool_col(self, vector):
	self._test_codegen_cache(vector,
	"select * from functional.alltypes where bool_col > 0")

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache_bigint_col(self, vector):
	self._test_codegen_cache(vector,
	"select * from functional.alltypes where bigint_col > 0")

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache_float_col(self, vector):
	self._test_codegen_cache(vector,
	"select * from functional.alltypes where float_col > 0")

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache_double_col(self, vector):
	self._test_codegen_cache(vector,
	"select * from functional.alltypes where double_col > 0")

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache_date_string_col(self, vector):
	self._test_codegen_cache(vector,
	"select * from functional.alltypes where date_string_col != ''")

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache_string_col(self, vector):
	self._test_codegen_cache(vector,
	"select * from functional.alltypes where string_col != ''")

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache_poly_func_string_col(self, vector):
	self._test_codegen_cache(vector,
	("select * from functional.alltypes where "
	+ "CHAR_LENGTH(string_col) > 0"))

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	def test_codegen_cache_poly_func_date_string_col(self, vector):
	self._test_codegen_cache(vector,
	("select * from functional.alltypes where "
	+ "CHAR_LENGTH(date_string_col) > 0"))

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	# Test native uda is missed in the codegen cache, as it is disabled.
	def test_codegen_cache_uda_miss(self, vector):
	database = "test_codegen_cache_uda_miss"
	self._load_functions(database)
	self._test_codegen_cache(vector,
	"select test_count(int_col) from functional.alltypestiny", False)

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB", force_restart=True)
	# Test native udf is missed in the codegen cache, as it is disabled.
	def test_codegen_cache_udf_miss(self, vector):
	database = "test_codegen_cache_udf_miss"
	self._load_functions(database)
	self._test_codegen_cache(vector,
	"select sum(identity(bigint_col)) from functional.alltypes", False)

	SYMBOL_EMITTER_TESTS_IMPALAD_ARGS = "--cache_force_single_shard=1 \
	--codegen_symbol_emitter_log_successful_destruction_test_only=1 \
	--codegen_cache_entry_bytes_charge_overhead=10000000 --codegen_cache_capacity=25MB "

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args=SYMBOL_EMITTER_TESTS_IMPALAD_ARGS + "--asm_module_dir=/dev/null",
	disable_log_buffering=True)
	# Regression test for IMPALA-12260.
	def test_codegen_cache_with_asm_module_dir(self, vector):
	self._test_codegen_cache_with_symbol_emitter(vector)

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args=SYMBOL_EMITTER_TESTS_IMPALAD_ARGS + "--perf_map",
	disable_log_buffering=True)
	# Regression test for IMPALA-12260.
	def test_codegen_cache_with_perf_map(self, vector):
	self._test_codegen_cache_with_symbol_emitter(vector)

	def _test_codegen_cache_with_symbol_emitter(self, vector):
	"""Regression test for IMPALA-12260. In the test we run two queries. The first query
	produces two entries in the cache, and they both have a 'CodegenSymbolEmitter' as
	event listeners because of the '--asm_module_dir' or '--perf_map' startup flag. The
	second query inserts new entries in the cache - the size of the cache should be such
	that both entries from the first query fit in it but both are evicted during the
	second query.

	When an 'llvm::ExecutionEngine', which is part of the cache entry, is destroyed, it
	frees any remaining object files and notifies the listeners about this, so the
	listeners should be alive at this time. Prior to IMPALA-12260 the
	'CodegenSymbolEmitter's of the cached fragment instances were destroyed at the end of
	the first query, causing a use-after-free (sometimes leading to a crash) during the
	second one.

	The choice of the size of the cache is based on the following:
	- the first query imposes a lower bound on the cache size (both cache entries should
	fit in the cache) AND
	- the second query imposes an upper bound (the cache entries of the first query
	should be evicted during the second query).
	The acceptable values are in the intersection of these two intervals.
	However, code changes and the difference between debug and release builds
	can have a huge effect on the acceptable range. To get around this, we use
	the '--codegen_cache_entry_bytes_charge_overhead' startup flag to
	artificially assign a higher size to the cache entries, compared to which
	the real size, and therefore also changes in the real size, are
	insignificant.

	This test verifies that the use-after-free scenario doesn't happen. We can't rely on
	the crash to detect it because
	1) the crash is not guaranteed to happen, use-after-free is undefined behaviour
	2) the crash may happen well after the query has finished returning results.

	Therefore in 'CodegenSymbolEmitter' we count how many object files have been emitted
	and freed. If the difference is greater than zero at the time of the destruction of
	the 'CodegenSymbolEmitter', the LLVM execution engine to which the symbol emitter is
	subscribed is still alive and will attempt to notify the symbol emitter when it will
	have already been destroyed, leading to use-after-free.

	When the --codegen_symbol_emitter_log_successful_destruction_test_only flag is set to
	true, 'CodegenSymbolEmitter' will log a message when it is being destroyed correctly
	(i.e. when use-after-free will not happen). If we don't have the expected message in
	the logs (after some timeout), the test fails.

	After IMPALA-11805, codegen caching is no longer using the 'llvm::ExecutionEngine',
	instead we use 'CodeGenObjectCache'. While 'CodeGenObjectCache' doesn't impact the
	lifecycle of 'CodegenSymbolEmitter's, the testcase in this context still verifies
	the correct usage of 'CodegenSymbolEmitter's."""

	exec_options = copy(vector.get_value('exec_option'))
	exec_options['exec_single_node_rows_threshold'] = 0

	q1 = """select int_col from functional_parquet.alltypessmall
	order by int_col desc limit 20"""
	q2 = """select t1.bool_col, t1.year, t1.month
	from functional_parquet.alltypes t1
	inner join functional_parquet.alltypessmall t2 on t1.year = t2.year
	group by t1.id, t1.bool_col, t1.smallint_col, t1.bigint_col, t1.float_col,
	t1.double_col, t1.date_string_col, t1.string_col, t1.timestamp_col, t1.year,
	t1.month
	order by t1.id, t1.bool_col, t1.smallint_col, t1.bigint_col, t1.float_col,
	t1.double_col, t1.date_string_col, t1.string_col, t1.timestamp_col, t1.year,
	t1.month"""

	self._check_metric_expect_init()

	symbol_emitter_ok_msg = "Successful destruction of CodegenSymbolEmitter object."

	# ## First query
	self.execute_query_expect_success(self.client, q1, exec_options)
	cache_entries_in_use = self.get_metric('impala.codegen-cache.entries-in-use')
	cache_entries_evicted = self.get_metric('impala.codegen-cache.entries-evicted')
	# Query 1 contains 2 fragments.
	fragments_ran = 2
	assert cache_entries_in_use > 0
	assert self.get_metric('impala.codegen-cache.hits') == 0
	# Initialising the cross-compiled modules also consumes an LLVM executor engine.
	expected_num_msg = fragments_ran + 1
	self.assert_impalad_log_contains("INFO", symbol_emitter_ok_msg, expected_num_msg)

	# ## Second query
	self.execute_query_expect_success(self.client, q2, exec_options)
	assert self.get_metric('impala.codegen-cache.hits') == 0
	# Query 2 contains 4 fragments.
	fragments_ran = fragments_ran + 4
	cache_entries_evicted = self.get_metric('impala.codegen-cache.entries-evicted')
	assert cache_entries_evicted >= cache_entries_in_use
	# Initialising the cross-compiled modules also consumes an LLVM executor engine.
	expected_num_msg = fragments_ran + 1
	self.assert_impalad_log_contains("INFO", symbol_emitter_ok_msg, expected_num_msg)

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB")
	# Regression test for IMPALA-12269. The first query uses one of the codegen'd functions
	# in two objects, so it is added to be jitted twice. For the second query it is added
	# only once. The hash of the function names should be the same in both cases.
	def test_codegen_cache_with_duplicate_fn_names(self, vector):
	exec_options = copy(vector.get_value('exec_option'))
	exec_options['exec_single_node_rows_threshold'] = 0

	q1 = """select int_col, tinyint_col from functional_parquet.alltypessmall
	order by int_col desc limit 20"""
	q2 = """select tinyint_col from functional_parquet.alltypessmall
	order by int_col desc limit 20"""

	self._check_metric_expect_init()
	self.execute_query_expect_success(self.client, q1, exec_options)
	assert self.get_metric('impala.codegen-cache.entries-evicted') == 0

	self.execute_query_expect_success(self.client, q2, exec_options)
	# If the function name hashes of the first and the second query didn't match, there
	# would be no cache hit and the cache entry from the first query would be evicted
	# because the llvm modules of the two queries, hence the cache keys, are identical.
	assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
	assert self.get_metric('impala.codegen-cache.hits') == 1
	# Expect two misses for the two fragments of the first query and one for one of the
	# fragments of the second query.
	assert self.get_metric('impala.codegen-cache.misses') == 3

	def _check_metric_expect_init(self):
	# Verifies that the cache metrics are all zero.
	assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
	assert self.get_metric('impala.codegen-cache.entries-in-use') == 0
	assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') == 0
	assert self.get_metric('impala.codegen-cache.hits') == 0
	assert self.get_metric('impala.codegen-cache.misses') == 0

	def _test_codegen_cache(self, vector, sql, expect_hit=True, expect_num_frag=2):
	# Do not disable codegen.
	exec_options = copy(vector.get_value('exec_option'))
	exec_options['exec_single_node_rows_threshold'] = 0
	self._check_metric_expect_init()
	result = self.execute_query(sql, exec_options)
	assert_codegen_cache_hit(result.runtime_profile, False)
	# expect_num_cache_miss_fragment is 1 iff expect_hit is False, and expect only
	# one fragment codegen cache missing for the case if expect_hit is False.
	expect_num_cache_miss_fragment = 1
	if expect_hit:
	expect_num_cache_miss_fragment = 0
	expect_num_cache_hit = expect_num_frag - expect_num_cache_miss_fragment

	# Verifies that the cache misses > 0, because the look up fails in an empty
	# brandnew cache, then a new entry should be stored successfully, so the in-use
	# entry number and bytes should be larger than 0.
	assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
	assert self.get_metric('impala.codegen-cache.entries-in-use') == expect_num_cache_hit
	assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0
	assert self.get_metric('impala.codegen-cache.hits') == 0
	assert self.get_metric('impala.codegen-cache.misses') == expect_num_cache_hit

	result = self.execute_query(sql, exec_options)
	# Verify again, the expected cache hit should be reflected.
	if expect_hit:
	assert_codegen_cache_hit(result.runtime_profile, True)
	else:
	assert_codegen_cache_hit(result.runtime_profile, False)
	assert self.get_metric('impala.codegen-cache.entries-evicted') == 0
	assert self.get_metric('impala.codegen-cache.entries-in-use') == expect_num_cache_hit
	assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0
	assert self.get_metric('impala.codegen-cache.hits') == expect_num_cache_hit
	assert self.get_metric('impala.codegen-cache.misses') == expect_num_cache_hit

	def _load_functions(self, database):
	create_func_template = """
	use default;
	drop database if exists {database} CASCADE;
	create database {database};
	create aggregate function {database}.test_count(int) returns bigint
	location '{location_uda}' update_fn='CountUpdate';
	create function {database}.identity(boolean) returns boolean
	location '{location_udf}' symbol='Identity';
	create function {database}.identity(bigint) returns bigint
	location '{location_udf}' symbol='Identity';
	use {database};
	"""
	location_uda = get_fs_path('/test-warehouse/libudasample.so')
	location_udf = get_fs_path('/test-warehouse/libTestUdfs.so')
	queries = create_func_template.format(database=database,
	location_uda=location_uda, location_udf=location_udf)
	queries = [q for q in queries.split(';') if q.strip()]
	for query in queries:
	if query.strip() == '': continue
	result = self.execute_query_expect_success(self.client, query)
	assert result is not None

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=3,
	impalad_args="--codegen_cache_capacity=1GB")
	def test_codegen_cache_udf_crash(self, vector):
	# The testcase would crash if we don't disable the native udf for codegen cache.
	database = "test_codegen_cache_udf_crash"
	self._load_functions(database)
	self.run_test_case('QueryTest/codegen-cache-udf', vector, use_db=database)
	# Even the udf is disabled and the queries are using the udf, there could be
	# other fragments stored to the codegen cache, so we check whether the codegen
	# cache is enabled to other cases.
	assert self.get_metric('impala.codegen-cache.entries-in-use') > 0
	assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0

	# Run multiple times, recreate the udfs, would crash if the udf is reused from
	# the codegen cache.
	for i in range(3):
	# Make the database different
	database = database + "diff"
	self._load_functions(database)
	self.run_test_case('QueryTest/codegen-cache-udf', vector, use_db=database)

	def _test_codegen_cache_timezone_crash_helper(self, database):
	create_db_template = """
	use default;
	drop database if exists {database} CASCADE;
	create database {database};
	create table {database}.alltimezones as select * from functional.alltimezones;
	use {database};
	"""
	queries = create_db_template.format(database=database)
	queries = [q for q in queries.split(';') if q.strip()]
	query = "select timezone, utctime, localtime,\
	from_utc_timestamp(utctime,timezone) as\
	impalaresult from alltimezones where\
	localtime != from_utc_timestamp(utctime,timezone)"
	queries.append(query)
	for query in queries:
	if query.strip() == '': continue
	result = self.execute_query_expect_success(self.client, query)
	assert result is not None

	@pytest.mark.execute_serially
	@CustomClusterTestSuite.with_args(cluster_size=1,
	impalad_args="--codegen_cache_capacity=1GB")
	def test_codegen_cache_timezone_crash(self, vector):
	# The testcase tests whether it would crash using the broken builtin function
	# from_utc_timestamp from the codegen cache.
	database = "test_codegen_cache_timezone_crash"
	# Run multiple times, recreate the database each time. Except for the first run,
	# other runs should all hit the cache.
	# Expect won't crash.
	for i in range(5):
	# Make the database different
	self._test_codegen_cache_timezone_crash_helper(database + str(i))
	# During the table creation, there will be one fragment involved, for the
	# query we are going to test, will be two fragments, so totally three
	# fragments involved, should all be cached.
	assert self.get_metric('impala.codegen-cache.entries-in-use') == 3
	assert self.get_metric('impala.codegen-cache.entries-in-use-bytes') > 0
	assert self.get_metric('impala.codegen-cache.hits') == i * 3
	assert self.get_metric('impala.codegen-cache.misses') == 3