tests/query_test/test_charcodec.py - impala - Git at Google

 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
 # regarding copyright ownership.  The ASF licenses this file
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
 #
 #   http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.

 from __future__ import absolute_import, division, print_function
 from tests.common.impala_test_suite import ImpalaTestSuite
 from tests.common.test_vector import ImpalaTestDimension
 from tests.common.skip import SkipIfFS
 import codecs
 import os
 import pytest
 import random
 import tempfile
 import shutil
 import sys

 if sys.version_info[0] >= 3:
     unichr = chr  # Python 3 has no unichr(); chr() is used under the same name

 # Hiragana code points 0x3040..0x309E without the problematic symbols
 # (unassigned, deprecated, etc.).
 _hiragana_range = sorted(
     set(range(0x3040, 0x309F))
     - {0x3040, 0x3094, 0x3095, 0x3096, 0x3097, 0x3098, 0x3099, 0x309A, 0x309B, 0x309C})

 # Cyrillic code points 0x0410..0x045E without the problematic symbols
 # (unassigned, deprecated, etc.).
 _cyrillic_range = sorted(
     set(range(0x0410, 0x045F))
     - {0x0450, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458,
        0x0459, 0x045A, 0x045B, 0x045C, 0x045D, 0x045E})

 # Maps an encoding name to the characters random test words are built from.
 _charsets = {
   'gbk': u''.join(map(unichr, range(0x4E00, 0x9FA6))),
   'latin1': u''.join(map(unichr, range(0x20, 0x7F))),
   'shift_jis': u''.join(map(unichr, _hiragana_range)),
   'cp1251': u''.join(map(unichr, range(0x0410, 0x044F))),
   'koi8-r': u''.join(map(unichr, _cyrillic_range))
 }


 def _generate_random_word(charset, min_length=1, max_length=20):
   """Return a random word built from the characters of `charset`.

   The word length is drawn with random.randint(min_length, max_length), then
   every character is picked independently with random.choice(charset).
   """
   word_length = random.randint(min_length, max_length)
   letters = []
   for _ in range(word_length):
     letters.append(random.choice(charset))
   return u''.join(letters)


 def _compare_tables(selfobj, db, utf8_table, encoded_table, row_count):
     """Assert that `utf8_table` and `encoded_table` hold the same `row_count` rows.

     `selfobj` is the object whose `client` attribute executes the queries. The
     count(*) of both tables must equal `row_count`, and the symmetric difference
     of the two tables must be empty. Raises AssertionError otherwise.
     """
     # Compare count(*) of the encoded table with the utf8 table
     count_query = """select count(*) from {}.{}"""
     count_utf8 = selfobj.client.execute(count_query.format(db, utf8_table))
     count_encoded = selfobj.client.execute(count_query.format(db, encoded_table))
     assert int(count_utf8.get_data()) == int(count_encoded.get_data()) == row_count

     # Compare * of the encoded table with the utf8 table.
     # Both EXCEPT operands are parenthesized explicitly: EXCEPT and UNION share
     # the same precedence and are applied left to right, so the unparenthesized
     # form reduces to "encoded except utf8" and misses rows that exist only in
     # the utf8 table.
     result = selfobj.client.execute(
         """(select * from {0}.{1} except select * from {0}.{2})
         union all (select * from {0}.{2} except select * from {0}.{1})"""
         .format(db, utf8_table, encoded_table))
     # Keep this exact comparison: a caller in this file matches " == []" in the
     # text of the resulting AssertionError.
     assert result.data == []


 # Tests with auto-generated data
 class TestCharCodecGen(ImpalaTestSuite):
   @classmethod
   def add_test_dimensions(cls):
     """Add the 'charset' dimension and narrow down the inherited test matrix.

     Only 'gbk' is kept unless the exploration strategy is 'exhaustive'. The
     constraints keep vectors whose table format is uncompressed text and whose
     'disable_codegen' exec option is False.
     """
     super(TestCharCodecGen, cls).add_test_dimensions()
     charset_names = list(_charsets.keys())
     if cls.exploration_strategy() != 'exhaustive':
       # Only run the tests for single 'gbk' encoding in non-exhaustive mode.
       charset_names = [name for name in charset_names if name == 'gbk']
     charset_dimension = ImpalaTestDimension('charset', *charset_names)
     cls.ImpalaTestMatrix.add_dimension(charset_dimension)

     def is_uncompressed_text(v):
       return (v.get_value('table_format').file_format == 'text'
               and v.get_value('table_format').compression_codec == 'none')

     def has_codegen_enabled(v):
       return v.get_value('exec_option')['disable_codegen'] is False

     # There is no reason to run these tests using all dimensions.
     # See IMPALA-14063 for Sequence file format support.
     cls.ImpalaTestMatrix.add_constraint(is_uncompressed_text)
     cls.ImpalaTestMatrix.add_constraint(has_codegen_enabled)

   # Basic Tests
   ####################################################################
   def generate_text_files(self, encoding_name, charset, test_name,
                           num_lines=10000, words_per_line=5, num_files=1,
                           min_word_length=1, max_word_length=20):
     """Write `num_lines` random comma-separated lines into UTF-8 text files.

     The lines are spread over `num_files` files created in a fresh temporary
     directory under $IMPALA_HOME/testdata. Every line holds `words_per_line`
     words built from `charset`, each produced by _generate_random_word() with
     the given word length bounds.

     Returns a tuple (tmp_dir, file_paths, num_lines); removing tmp_dir is left
     to the caller.
     """
     # The returned num_lines is used by callers as the expected row count, so the
     # remainder of the division is spread over the first files instead of being
     # dropped.
     lines_per_file, extra_lines = divmod(num_lines, num_files)
     file_paths = []
     tmp_dir = tempfile.mkdtemp(dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))
     for file_index in range(num_files):
       data_file_path = os.path.join(tmp_dir, "charcodec_{}_{}_utf8_{}.txt"
                                     .format(encoding_name, test_name, file_index))
       file_paths.append(data_file_path)
       lines_in_file = lines_per_file + (1 if file_index < extra_lines else 0)
       with codecs.open(data_file_path, 'w', encoding='utf-8') as data_file:
         for _ in range(lines_in_file):
           words = [_generate_random_word(charset, min_word_length, max_word_length)
                    for _ in range(words_per_line)]
           data_file.write(u','.join(words) + u'\n')
     return tmp_dir, file_paths, num_lines

   def prepare_utf8_test_table(self, db, file_paths, encoding_name, vector):
     """Create the comma-delimited UTF-8 text table and copy `file_paths` into it.

     The table is named "<encoding>_gen_utf8", with dashes removed from the
     encoding name. Returns the table name without the database prefix.
     """
     tbl_name = "{}_gen_utf8".format(encoding_name.replace('-', ''))
     create_stmt = """CREATE TABLE IF NOT EXISTS {}.{} (
         name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
         ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE"""
     self.execute_query(create_stmt.format(db, tbl_name))
     qualified_name = "{0}.{1}".format(db, tbl_name)
     for local_path in file_paths:
       table_location = self._get_table_location(qualified_name, vector)
       self.filesystem_client.copy_from_local(local_path, table_location)
     # remove REFRESH when IMPALA-13749 is fixed
     self.execute_query("""REFRESH {}.{}""".format(db, tbl_name))
     return tbl_name

   def prepare_encoded_test_table(self, db, utf8_table, encoding_name):
     """Create a text table using `encoding_name` and fill it from `utf8_table`.

     The encoding is set through the "serialization.encoding" SERDE property
     before the INSERT OVERWRITE runs. Returns the name of the encoded table.
     """
     encoded_table = "{}_gen".format(encoding_name.replace('-', ''))
     create_stmt = """CREATE TABLE IF NOT EXISTS {}.{} (
         name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
         STORED AS TEXTFILE"""
     set_encoding_stmt = """ALTER TABLE {}.{}
                        SET SERDEPROPERTIES("serialization.encoding"="{}")"""
     refresh_stmt = """REFRESH {}.{}"""
     copy_stmt = """INSERT OVERWRITE TABLE {}.{} SELECT * FROM {}.{}"""
     self.execute_query(create_stmt.format(db, encoded_table))
     self.execute_query(set_encoding_stmt.format(db, encoded_table, encoding_name))
     self.execute_query(refresh_stmt.format(db, encoded_table))
     self.execute_query(copy_stmt.format(db, encoded_table, db, utf8_table))
     return encoded_table

   def test_enc_dec_gen(self, vector, unique_database):
     """Write encoded table with Impala and read it back."""
     db = unique_database
     encoding_name = vector.get_value('charset')
     charset = _charsets[encoding_name]
     tmp_dir, file_paths, row_count = self.generate_text_files(
         encoding_name, charset, "gen")
     try:
       utf8_table = self.prepare_utf8_test_table(db, file_paths, encoding_name, vector)
     finally:
       # Remove the generated files even when loading them fails, so a failed run
       # leaves nothing behind in the temporary directory.
       shutil.rmtree(tmp_dir)
     encoded_table = self.prepare_encoded_test_table(db, utf8_table, encoding_name)
     _compare_tables(self, db, utf8_table, encoded_table, row_count)

   def test_enc_dec_gen_long_words(self, vector, unique_database):
     """Round-trip words of 100 to 1000 characters through an encoded table."""
     db = unique_database
     encoding_name = vector.get_value('charset')
     charset = _charsets[encoding_name]
     tmp_dir, file_paths, row_count = self.generate_text_files(
         encoding_name, charset, "gen", min_word_length=100, max_word_length=1000)
     try:
       utf8_table = self.prepare_utf8_test_table(db, file_paths, encoding_name, vector)
     finally:
       # Remove the generated files even when loading them fails, so a failed run
       # leaves nothing behind in the temporary directory.
       shutil.rmtree(tmp_dir)
     encoded_table = self.prepare_encoded_test_table(db, utf8_table, encoding_name)
     _compare_tables(self, db, utf8_table, encoded_table, row_count)

   # Split-file tests
   ####################################################################
   def test_enc_dec_gen_split(self, vector, unique_database):
     """Test table is split across multiple files."""
     db = unique_database
     encoding_name = vector.get_value('charset')
     charset = _charsets[encoding_name]
     tmp_dir, file_paths, row_count = self.generate_text_files(
         encoding_name, charset, "split", num_lines=10000, words_per_line=5, num_files=5)
     try:
       utf8_table = self.prepare_utf8_test_table(db, file_paths, encoding_name, vector)
     finally:
       # Remove the generated files even when loading them fails, so a failed run
       # leaves nothing behind in the temporary directory.
       shutil.rmtree(tmp_dir)
     encoded_table = self.prepare_encoded_test_table(db, utf8_table, encoding_name)
     _compare_tables(self, db, utf8_table, encoded_table, row_count)

   # Hive + Compression Tests
   ####################################################################
   def prepare_encoded_test_table_compress(self, db, utf8_table, encoding_name, codec):
     """Create an encoded table through Hive and fill it from `utf8_table`.

     `codec` is placed into the org.apache.hadoop.io.compress.<codec>Codec class
     name; the value "None" sets hive.exec.compress.output to false. Returns the
     name of the encoded table.
     """
     encoded_table = "{}_gen_{}".format(encoding_name.replace('-', ''), codec)
     create_stmt = """CREATE TABLE IF NOT EXISTS {}.{} (
         name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
         STORED AS TEXTFILE"""
     self.run_stmt_in_hive(create_stmt.format(db, encoded_table))
     set_encoding_stmt = """ALTER TABLE {}.{}
                           SET SERDEPROPERTIES("serialization.encoding"="{}")"""
     self.run_stmt_in_hive(set_encoding_stmt.format(db, encoded_table, encoding_name))
     if codec == "None":
       compress_output = "false"
     else:
       compress_output = "true"
     insert_script = """set hive.exec.compress.output={};
         set mapreduce.output.fileoutputformat.compress.codec=
         org.apache.hadoop.io.compress.{}Codec;
         INSERT OVERWRITE TABLE {}.{} SELECT * FROM {}.{}
         """
     self.run_stmt_in_hive(insert_script.format(
         compress_output, codec, db, encoded_table, db, utf8_table))
     return encoded_table

   @SkipIfFS.hive
   def test_enc_dec_gen_compress(self, vector, unique_database):
     """Fill encoded tables through Hive with several codecs and compare each one
     with the UTF-8 source table."""
     db = unique_database
     encoding_name = vector.get_value('charset')
     charset = _charsets[encoding_name]

     tmp_dir, file_paths, row_count = self.generate_text_files(
         encoding_name, charset, "compress", num_lines=10000)
     try:
       utf8_table = self.prepare_utf8_test_table(db, file_paths, encoding_name, vector)
     finally:
       # Remove the generated files even when loading them fails, so a failed run
       # leaves nothing behind in the temporary directory.
       shutil.rmtree(tmp_dir)
     # Snappy codec supports streaming, ZStandard does not
     for codec in ["None", "Snappy", "ZStandard"]:
       encoded_table = self.prepare_encoded_test_table_compress(db, utf8_table,
                                                                 encoding_name, codec)
       _compare_tables(self, db, utf8_table, encoded_table, row_count)

   # Partitions Tests
   ####################################################################
   def prepare_utf8_test_table_partitions(self, db, file_paths, encoding_name, vector):
     """Create a partitioned UTF-8 table with one partition per generated file.

     The file at index i is copied into the "part=i" directory below the table
     location. Returns the table name without the database prefix.
     """
     tbl_name = "{}_gen_utf8".format(encoding_name.replace('-', ''))
     create_stmt = """CREATE TABLE IF NOT EXISTS {}.{} (
         name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
         PARTITIONED BY (part STRING)
         ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE"""
     self.execute_query(create_stmt.format(db, tbl_name))
     add_partition_stmt = """ALTER TABLE {}.{} ADD PARTITION (part='{}')"""
     for part_id, local_path in enumerate(file_paths):
       self.execute_query(add_partition_stmt.format(db, tbl_name, part_id))
       table_location = self._get_table_location(
           "{0}.{1}".format(db, tbl_name), vector)
       part_url = os.path.join(table_location, "part={}".format(part_id))
       self.filesystem_client.copy_from_local(local_path, part_url)
     self.execute_query("""REFRESH {}.{}""".format(db, tbl_name))
     return tbl_name

   def prepare_encoded_test_table_partitions(self, db, utf8_table, encoding_name,
                                             file_paths):
     """Create a partitioned encoded table and fill it partition by partition.

     One partition is added per entry of `file_paths` (only its length is used).
     Each partition gets `encoding_name` as its "serialization.encoding" SERDE
     property and is then overwritten with the matching partition of `utf8_table`.
     Returns the name of the encoded table.
     """
     encoded_table = "{}_gen".format(encoding_name.replace('-', ''))
     create_stmt = """CREATE TABLE IF NOT EXISTS {}.{} (
         name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
         PARTITIONED BY (part STRING)
         STORED AS TEXTFILE"""
     self.execute_query(create_stmt.format(db, encoded_table))
     add_partition_stmt = """ALTER TABLE {}.{} ADD PARTITION (part='{}')"""
     set_encoding_stmt = """ALTER TABLE {}.{} PARTITION (part='{}')
                         SET SERDEPROPERTIES("serialization.encoding"="{}")"""
     refresh_stmt = """REFRESH {}.{}"""
     copy_stmt = """INSERT OVERWRITE TABLE {}.{} PARTITION (part='{}')
           SELECT name1, name2, name3, name4, name5 FROM {}.{} WHERE part='{}'"""
     for part_id in range(len(file_paths)):
       self.execute_query(add_partition_stmt.format(db, encoded_table, part_id))
       self.execute_query(
           set_encoding_stmt.format(db, encoded_table, part_id, encoding_name))
       self.execute_query(refresh_stmt.format(db, encoded_table))
       self.execute_query(
           copy_stmt.format(db, encoded_table, part_id, db, utf8_table, part_id))
     return encoded_table

   def test_enc_dec_gen_partitions(self, vector, unique_database):
     """Round-trip generated data through a partitioned encoded table."""
     db = unique_database
     encoding_name = vector.get_value('charset')
     charset = _charsets[encoding_name]
     tmp_dir, file_paths, row_count = self.generate_text_files(
         encoding_name, charset, "partitions", num_lines=10000, num_files=5)
     try:
       utf8_table = self.prepare_utf8_test_table_partitions(
           db, file_paths, encoding_name, vector)
     finally:
       # Remove the generated files even when loading them fails, so a failed run
       # leaves nothing behind in the temporary directory.
       shutil.rmtree(tmp_dir)
     encoded_table = self.prepare_encoded_test_table_partitions(db,
         utf8_table, encoding_name, file_paths)
     _compare_tables(self, db, utf8_table, encoded_table, row_count)


 class TestCharCodecGenMixed(ImpalaTestSuite):
   @classmethod
   def add_test_dimensions(cls):
     """Restrict the inherited test matrix.

     Only vectors whose table format is uncompressed text and whose
     'disable_codegen' exec option is False are kept.
     """
     super(TestCharCodecGenMixed, cls).add_test_dimensions()

     def is_uncompressed_text(v):
       return (v.get_value('table_format').file_format == 'text'
               and v.get_value('table_format').compression_codec == 'none')

     def has_codegen_enabled(v):
       return v.get_value('exec_option')['disable_codegen'] is False

     # There is no reason to run these tests using all dimensions.
     for constraint in (is_uncompressed_text, has_codegen_enabled):
       cls.ImpalaTestMatrix.add_constraint(constraint)

   # Mixed Partitions Tests
   ####################################################################
   def generate_text_files_mixed(self, test_file, num_lines=10000, words_per_line=5,
                                 num_files=1):
     """Write `num_lines` random lines into UTF-8 files, one random charset per file.

     For every file an (encoding name, charset) pair is picked at random from
     _charsets; the file then holds comma-separated lines of `words_per_line`
     random words from that charset. The files are created in a fresh temporary
     directory under $IMPALA_HOME/testdata.

     Returns a tuple (tmp_dir, file_paths, encodings, num_lines), where
     encodings[i] is the encoding name picked for file_paths[i]. Removing tmp_dir
     is left to the caller.
     """
     # The returned num_lines is used by callers as the expected row count, so the
     # remainder of the division is spread over the first files instead of being
     # dropped.
     lines_per_file, extra_lines = divmod(num_lines, num_files)
     file_paths = []
     encodings = []
     tmp_dir = tempfile.mkdtemp(dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))
     for i in range(num_files):
       encoding_name, charset = random.choice(list(_charsets.items()))
       data_file_path = os.path.join(tmp_dir, "charcodec_{}_{}_utf8_{}.txt"
                                     .format(encoding_name, test_file, i))
       encodings.append(encoding_name)
       file_paths.append(data_file_path)
       lines_in_file = lines_per_file + (1 if i < extra_lines else 0)
       with codecs.open(data_file_path, 'w', encoding='utf-8') as data_file:
         for _ in range(lines_in_file):
           words = [_generate_random_word(charset) for _ in range(words_per_line)]
           data_file.write(u','.join(words) + u'\n')
     return tmp_dir, file_paths, encodings, num_lines

   # Partitioned table with different encodings.
   def prepare_utf8_test_table_partitions_mixed(self, db, file_paths, vector):
     """Create the partitioned "mixed_gen_utf8" table, one partition per file.

     The file at index i is copied into the "part=i" directory below the table
     location. Returns the table name without the database prefix.
     """
     tbl_name = "mixed_gen_utf8"
     create_stmt = """CREATE TABLE IF NOT EXISTS {}.{} (
         name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
         PARTITIONED BY (part STRING)
         ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE"""
     self.execute_query(create_stmt.format(db, tbl_name))
     add_partition_stmt = """ALTER TABLE {}.{} ADD PARTITION (part='{}')"""
     for part_id, local_path in enumerate(file_paths):
       self.execute_query(add_partition_stmt.format(db, tbl_name, part_id))
       table_location = self._get_table_location(
           "{0}.{1}".format(db, tbl_name), vector)
       part_url = os.path.join(table_location, "part={}".format(part_id))
       self.filesystem_client.copy_from_local(local_path, part_url)
     self.execute_query("""REFRESH {}.{}""".format(db, tbl_name))
     return tbl_name

   def prepare_encoded_test_table_partitions_mixed(self, db, utf8_table, encodings):
     """Create the partitioned "mixed_gen" table with a separate encoding per partition.

     Partition i gets encodings[i] as its "serialization.encoding" SERDE property
     and is then overwritten with partition i of `utf8_table`. Returns the name of
     the encoded table.
     """
     encoded_table = "mixed_gen"
     create_stmt = """CREATE TABLE IF NOT EXISTS {}.{} (
         name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
         PARTITIONED BY (part STRING)
         STORED AS TEXTFILE"""
     self.execute_query(create_stmt.format(db, encoded_table))
     add_partition_stmt = """ALTER TABLE {}.{} ADD PARTITION (part='{}')"""
     set_encoding_stmt = """ALTER TABLE {}.{} PARTITION (part='{}')
                         SET SERDEPROPERTIES("serialization.encoding"="{}")"""
     refresh_stmt = """REFRESH {}.{}"""
     copy_stmt = """INSERT OVERWRITE TABLE {}.{} PARTITION (part='{}')
           SELECT name1, name2, name3, name4, name5 FROM {}.{} WHERE part='{}'"""
     for part_id, partition_encoding in enumerate(encodings):
       self.execute_query(add_partition_stmt.format(db, encoded_table, part_id))
       self.execute_query(
           set_encoding_stmt.format(db, encoded_table, part_id, partition_encoding))
       self.execute_query(refresh_stmt.format(db, encoded_table))
       self.execute_query(
           copy_stmt.format(db, encoded_table, part_id, db, utf8_table, part_id))
     return encoded_table

   def test_enc_dec_gen_partitions_mixed(self, unique_database, vector):
     """Round-trip generated data through a table whose partitions use randomly
     chosen encodings."""
     db = unique_database
     tmp_dir, file_paths, encodings, row_count = self.generate_text_files_mixed(
         "mixed", num_lines=10000, num_files=5)
     try:
       utf8_table = self.prepare_utf8_test_table_partitions_mixed(db, file_paths, vector)
     finally:
       # Remove the generated files even when loading them fails, so a failed run
       # leaves nothing behind in the temporary directory.
       shutil.rmtree(tmp_dir)
     encoded_table = self.prepare_encoded_test_table_partitions_mixed(db,
         utf8_table, encodings)
     _compare_tables(self, db, utf8_table, encoded_table, row_count)


 class TestCharCodecPreCreated(ImpalaTestSuite):
   @classmethod
   def add_test_dimensions(cls):
     """Add the 'charset' dimension and narrow down the inherited test matrix.

     Only 'gbk' is kept unless the exploration strategy is 'exhaustive'. The
     constraints keep vectors whose table format is uncompressed text and whose
     'disable_codegen' exec option is False.
     """
     super(TestCharCodecPreCreated, cls).add_test_dimensions()
     charset_names = list(_charsets.keys())
     if cls.exploration_strategy() != 'exhaustive':
       # Only run the tests for single 'gbk' encoding in non-exhaustive mode.
       charset_names = [name for name in charset_names if name == 'gbk']
     charset_dimension = ImpalaTestDimension('charset', *charset_names)
     cls.ImpalaTestMatrix.add_dimension(charset_dimension)

     def is_uncompressed_text(v):
       return (v.get_value('table_format').file_format == 'text'
               and v.get_value('table_format').compression_codec == 'none')

     def has_codegen_enabled(v):
       return v.get_value('exec_option')['disable_codegen'] is False

     # There is no reason to run these tests using all dimensions.
     cls.ImpalaTestMatrix.add_constraint(is_uncompressed_text)
     cls.ImpalaTestMatrix.add_constraint(has_codegen_enabled)

   def prepare_test_table(self, vector, db, tbl_name, datafile, encoding=None):
     """Create a single-column text table and load a pre-created data file into it.

     Dashes are removed from both `tbl_name` and `datafile`. When `encoding` is
     given, it is set as the table's "serialization.encoding" SERDE property. The
     data file is read from $IMPALA_HOME/testdata/charcodec. Returns the table
     name with the dashes removed.
     """
     tbl_name = tbl_name.replace('-', '')
     datafile = datafile.replace('-', '')
     create_stmt = """CREATE TABLE IF NOT EXISTS {}.{} (name STRING)
         STORED AS TEXTFILE"""
     self.execute_query(create_stmt.format(db, tbl_name))
     if encoding:
       set_encoding_stmt = """ALTER TABLE {}.{} SET
           SERDEPROPERTIES("serialization.encoding"="{}")"""
       self.execute_query(set_encoding_stmt.format(db, tbl_name, encoding))
     charcodec_dir = os.path.join(os.environ['IMPALA_HOME'], "testdata", "charcodec")
     data_file_path = os.path.join(charcodec_dir, datafile)
     table_location = self._get_table_location("{0}.{1}".format(db, tbl_name), vector)
     self.filesystem_client.copy_from_local(data_file_path, table_location)
     self.execute_query("""REFRESH {}.{}""".format(db, tbl_name))
     return tbl_name

   def test_precreated_files(self, vector, unique_database):
     """Read encoded precreated files."""
     db = unique_database
     enc = vector.get_value('charset')
     utf8_file = enc + '_names_utf8.txt'
     encoded_file = enc + '_names.txt'
     expected_rows = 3

     # Without SERDEPROPERTIES("serialization.encoding") data is read incorrectly
     utf8_table = self.prepare_test_table(
         vector, db, enc + '_names_utf8', utf8_file, None)
     undeclared_table = self.prepare_test_table(
         vector, db, enc + '_names_none', encoded_file, None)
     with pytest.raises(AssertionError) as exc_info:
       _compare_tables(self, db, utf8_table, undeclared_table, expected_rows)
     # The failure has to come from the content comparison, not from the row count.
     failure_text = str(exc_info.value)
     assert " == []" in failure_text

     # With SERDEPROPERTIES("serialization.encoding") data is read correctly
     declared_table = self.prepare_test_table(
         vector, db, enc + '_names', encoded_file, enc)
     _compare_tables(self, db, utf8_table, declared_table, expected_rows)

   def test_precreated_decoding_with_errors(self, vector, unique_database):
     """Expect a conversion error when selecting from the '<enc>_names_error' table."""
     db = unique_database
     enc = vector.get_value('charset')
     # Skip for promiscuous encodings: only 'gbk' and 'shift_jis' are checked.
     if enc not in ('gbk', 'shift_jis'):
       pytest.skip()
     encoded_table = self.prepare_test_table(
         vector, db, enc + '_names_error', enc + '_names_error.txt', enc)
     select_stmt = """select * from {}.{}""".format(db, encoded_table)
     err = self.execute_query_expect_failure(self.client, select_stmt)
     assert "Error during buffer conversion: Conversion failed" in str(err)

   def test_precreated_encoding_with_errors(self, vector, unique_database):
     """Expect a conversion error when inserting functional.binary_tbl values into
     the encoded table."""
     db = unique_database
     enc = vector.get_value('charset')
     # Skip for promiscuous encodings: only 'gbk' and 'shift_jis' are checked.
     if enc not in ('gbk', 'shift_jis'):
       pytest.skip()
     encoded_table = self.prepare_test_table(
         vector, db, enc + '_names_error', enc + '_names_error.txt', enc)
     insert_stmt = """insert overwrite {}.{}
         select cast(binary_col as string) from functional.binary_tbl""".format(
         db, encoded_table)
     err = self.execute_query_expect_failure(self.client, insert_stmt)
     assert "Error during buffer conversion: Conversion failed" in str(err)

   @SkipIfFS.hive
   def test_read_from_hive(self, unique_database, vector):
     """Write table with Impala and read it back with Hive."""
     db = unique_database
     enc = vector.get_value('charset')

     utf8_table = self.prepare_test_table(
         vector, db, enc + '_names_utf8', enc + '_names_utf8.txt', None)
     encoded_table = self.prepare_test_table(
         vector, db, enc + '_names', enc + '_names.txt', enc)
     self.execute_query(
         """insert overwrite {}.{} select * from {}.{}"""
         .format(db, encoded_table, db, utf8_table))

     result_hive = self.run_stmt_in_hive(
         """select name from {}.{}""".format(db, encoded_table))
     result_impala = self.client.execute(
         """select name from {}.{}""".format(db, utf8_table))
     # The first line of the Hive output is dropped
     # (presumably the column header - TODO confirm).
     result_hive_list = result_hive.strip().split('\n')[1:]
     # Neither query has an ORDER BY, so the row order is not guaranteed; compare
     # the rows regardless of the order in which each engine returns them.
     assert sorted(result_hive_list) == sorted(result_impala.data)
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing,
	# software distributed under the License is distributed on an
	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	# KIND, either express or implied. See the License for the
	# specific language governing permissions and limitations
	# under the License.

	from __future__ import absolute_import, division, print_function
	from tests.common.impala_test_suite import ImpalaTestSuite
	from tests.common.test_vector import ImpalaTestDimension
	from tests.common.skip import SkipIfFS
	import codecs
	import os
	import pytest
	import random
	import tempfile
	import shutil
	import sys

	if sys.version_info[0] >= 3:
	    unichr = chr  # Python 3 has no unichr(); chr() is used under the same name

	# Hiragana code points 0x3040..0x309E without the problematic symbols
	# (unassigned, deprecated, etc.).
	_hiragana_range = [codepoint for codepoint in range(0x3040, 0x309F)
	    if codepoint not in
	    {0x3040, 0x3094, 0x3095, 0x3096, 0x3097, 0x3098, 0x3099, 0x309A, 0x309B, 0x309C}]

	# Cyrillic code points 0x0410..0x045E without the problematic symbols
	# (unassigned, deprecated, etc.).
	_cyrillic_range = [codepoint for codepoint in range(0x0410, 0x045F)
	    if codepoint not in
	    {0x0450, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, 0x0458,
	     0x0459, 0x045A, 0x045B, 0x045C, 0x045D, 0x045E}]

	# Maps an encoding name to the characters random test words are built from.
	_charsets = {
	  'gbk': u''.join(unichr(i) for i in range(0x4E00, 0x9FA6)),
	  'latin1': u''.join(unichr(i) for i in range(0x20, 0x7F)),
	  'shift_jis': u''.join(unichr(i) for i in _hiragana_range),
	  'cp1251': u''.join(unichr(i) for i in range(0x0410, 0x044F)),
	  'koi8-r': u''.join(unichr(i) for i in _cyrillic_range)
	}


	def _generate_random_word(charset, min_length=1, max_length=20):
	  """Return a random word built from the characters of `charset`.

	  The word length is drawn with random.randint(min_length, max_length), then
	  every character is picked independently with random.choice(charset).
	  """
	  length = random.randint(min_length, max_length)
	  return u''.join(random.choice(charset) for _ in range(length))


	def _compare_tables(selfobj, db, utf8_table, encoded_table, row_count):
	    """Assert that `utf8_table` and `encoded_table` hold the same `row_count` rows.

	    `selfobj` is the object whose `client` attribute executes the queries. The
	    count(*) of both tables must equal `row_count`, and the symmetric difference
	    of the two tables must be empty. Raises AssertionError otherwise.
	    """
	    # Compare count(*) of the encoded table with the utf8 table
	    count_utf8 = selfobj.client.execute("""select count(*) from {}.{}"""
	        .format(db, utf8_table))
	    count_encoded = selfobj.client.execute("""select count(*) from {}.{}"""
	        .format(db, encoded_table))
	    assert int(count_utf8.get_data()) == int(count_encoded.get_data()) == row_count

	    # Compare * of the encoded table with the utf8 table.
	    # Both EXCEPT operands are parenthesized explicitly: EXCEPT and UNION share
	    # the same precedence and are applied left to right, so the unparenthesized
	    # form reduces to "encoded except utf8" and misses rows that exist only in
	    # the utf8 table.
	    result = selfobj.client.execute(
	        """(select * from {0}.{1} except select * from {0}.{2})
	        union all (select * from {0}.{2} except select * from {0}.{1})"""
	        .format(db, utf8_table, encoded_table))
	    assert result.data == []


	# Tests with auto-generated data
	class TestCharCodecGen(ImpalaTestSuite):
	@classmethod
	def add_test_dimensions(cls):
	super(TestCharCodecGen, cls).add_test_dimensions()
	encodings = list(_charsets.keys())
	# Only run the tests for single 'gbk' encoding in non-exhaustive mode.
	if cls.exploration_strategy() != 'exhaustive':
	encodings = [enc for enc in encodings if enc == 'gbk']
	cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension(
	'charset', *encodings))
	# There is no reason to run these tests using all dimensions.
	# See IMPALA-14063 for Sequence file format support.
	cls.ImpalaTestMatrix.add_constraint(
	lambda v: v.get_value('table_format').file_format == 'text'
	and v.get_value('table_format').compression_codec == 'none')
	cls.ImpalaTestMatrix.add_constraint(
	lambda v: v.get_value('exec_option')['disable_codegen'] is False)

	# Basic Tests
	####################################################################
	def generate_text_files(self, encoding_name, charset, test_name,
	num_lines=10000, words_per_line=5, num_files=1,
	min_word_length=1, max_word_length=20):
	lines_per_file = num_lines // num_files
	file_paths = []
	tmp_dir = tempfile.mkdtemp(dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))
	for file_index in range(num_files):
	data_file_path = os.path.join(tmp_dir, "charcodec_{}_{}_utf8_{}.txt"
	.format(encoding_name, test_name, file_index))
	file_paths.append(data_file_path)
	with codecs.open(data_file_path, 'w', encoding='utf-8') as file:
	for _ in range(lines_per_file):
	words = [_generate_random_word(charset, min_word_length, max_word_length)
	for _ in range(words_per_line)]
	line = u','.join(words)
	file.write(line + u'\n')
	return tmp_dir, file_paths, num_lines

	def prepare_utf8_test_table(self, db, file_paths, encoding_name, vector):
	encoding_name_tbl = encoding_name.replace('-', '')
	tbl_name = "{}_gen_utf8".format(encoding_name_tbl)
	self.execute_query("""CREATE TABLE IF NOT EXISTS {}.{} (
	name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
	ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE"""
	.format(db, tbl_name))
	for file_path in file_paths:
	self.filesystem_client.copy_from_local(file_path,
	self._get_table_location("{0}.{1}".format(db, tbl_name), vector))
	# remove REFRESH when IMPALA-13749 is fixed
	self.execute_query("""REFRESH {}.{}""".format(db, tbl_name))
	return tbl_name

	def prepare_encoded_test_table(self, db, utf8_table, encoding_name):
	encoding_name_tbl = encoding_name.replace('-', '')
	encoded_table = "{}_gen".format(encoding_name_tbl)
	self.execute_query("""CREATE TABLE IF NOT EXISTS {}.{} (
	name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
	STORED AS TEXTFILE""".format(db, encoded_table))
	self.execute_query("""ALTER TABLE {}.{}
	SET SERDEPROPERTIES("serialization.encoding"="{}")"""
	.format(db, encoded_table, encoding_name))
	self.execute_query("""REFRESH {}.{}""".format(db, encoded_table))
	self.execute_query("""INSERT OVERWRITE TABLE {}.{} SELECT * FROM {}.{}"""
	.format(db, encoded_table, db, utf8_table))
	return encoded_table

	def test_enc_dec_gen(self, vector, unique_database):
	"""Write encoded table with Impala and read it back."""
	db = unique_database
	encoding_name = vector.get_value('charset')
	charset = _charsets[encoding_name]
	tmp_dir, file_paths, row_count = self.generate_text_files(
	encoding_name, charset, "gen")
	utf8_table = self.prepare_utf8_test_table(db, file_paths, encoding_name, vector)
	shutil.rmtree(tmp_dir)
	encoded_table = self.prepare_encoded_test_table(db, utf8_table, encoding_name)
	_compare_tables(self, db, utf8_table, encoded_table, row_count)

	def test_enc_dec_gen_long_words(self, vector, unique_database):
	db = unique_database
	encoding_name = vector.get_value('charset')
	charset = _charsets[encoding_name]
	tmp_dir, file_paths, row_count = self.generate_text_files(
	encoding_name, charset, "gen", min_word_length=100, max_word_length=1000)
	utf8_table = self.prepare_utf8_test_table(db, file_paths, encoding_name, vector)
	shutil.rmtree(tmp_dir)
	encoded_table = self.prepare_encoded_test_table(db, utf8_table, encoding_name)
	_compare_tables(self, db, utf8_table, encoded_table, row_count)

	# Split-file tests
	####################################################################
	def test_enc_dec_gen_split(self, vector, unique_database):
	"""Test table is split across multiple files."""
	db = unique_database
	encoding_name = vector.get_value('charset')
	charset = _charsets[encoding_name]
	tmp_dir, file_paths, row_count = self.generate_text_files(
	encoding_name, charset, "split", num_lines=10000, words_per_line=5, num_files=5)
	utf8_table = self.prepare_utf8_test_table(db, file_paths, encoding_name, vector)
	shutil.rmtree(tmp_dir)
	encoded_table = self.prepare_encoded_test_table(db, utf8_table, encoding_name)
	_compare_tables(self, db, utf8_table, encoded_table, row_count)

	# Hive + Compression Tests
	####################################################################
	def prepare_encoded_test_table_compress(self, db, utf8_table, encoding_name, codec):
	encoding_name_tbl = encoding_name.replace('-', '')
	encoded_table = "{}_gen_{}".format(encoding_name_tbl, codec)
	self.run_stmt_in_hive("""CREATE TABLE IF NOT EXISTS {}.{} (
	name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
	STORED AS TEXTFILE""".format(db, encoded_table))
	self.run_stmt_in_hive("""ALTER TABLE {}.{}
	SET SERDEPROPERTIES("serialization.encoding"="{}")"""
	.format(db, encoded_table, encoding_name))
	self.run_stmt_in_hive("""set hive.exec.compress.output={};
	set mapreduce.output.fileoutputformat.compress.codec=
	org.apache.hadoop.io.compress.{}Codec;
	INSERT OVERWRITE TABLE {}.{} SELECT * FROM {}.{}
	""".format("false" if codec == "None" else "true",
	codec, db, encoded_table, db, utf8_table))
	return encoded_table

	@SkipIfFS.hive
	def test_enc_dec_gen_compress(self, vector, unique_database):
	db = unique_database
	encoding_name = vector.get_value('charset')
	charset = _charsets[encoding_name]

	tmp_dir, file_paths, row_count = self.generate_text_files(
	encoding_name, charset, "compress", num_lines=10000)
	utf8_table = self.prepare_utf8_test_table(db, file_paths, encoding_name, vector)
	shutil.rmtree(tmp_dir)
	# Snappy codec supports streaming, ZStandard does not
	for codec in ["None", "Snappy", "ZStandard"]:
	encoded_table = self.prepare_encoded_test_table_compress(db, utf8_table,
	encoding_name, codec)
	_compare_tables(self, db, utf8_table, encoded_table, row_count)

	# Partitions Tests
	####################################################################
	def prepare_utf8_test_table_partitions(self, db, file_paths, encoding_name, vector):
	  """Create a partitioned, comma-delimited text table and load one file per partition.

	  The table is named '<encoding_name without dashes>_gen_utf8'. For every entry of
	  'file_paths' a partition part='<index>' is added and the local file is copied
	  into the 'part=<index>' directory below the table location. The table is
	  refreshed once all files are in place. Returns the unqualified table name.
	  """
	  table = "{}_gen_utf8".format(encoding_name.replace('-', ''))
	  create_stmt = """CREATE TABLE IF NOT EXISTS {}.{} (
	name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
	PARTITIONED BY (part STRING)
	ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE"""
	  self.execute_query(create_stmt.format(db, table))
	  for part_id, local_path in enumerate(file_paths):
	    self.execute_query(
	        """ALTER TABLE {}.{} ADD PARTITION (part='{}')""".format(db, table, part_id))
	    # The table location is looked up again for each partition.
	    table_location = self._get_table_location("{0}.{1}".format(db, table), vector)
	    partition_dir = os.path.join(table_location, "part={}".format(part_id))
	    self.filesystem_client.copy_from_local(local_path, partition_dir)
	  self.execute_query("""REFRESH {}.{}""".format(db, table))
	  return table

	def prepare_encoded_test_table_partitions(self, db, utf8_table, encoding_name,
	    file_paths):
	  """Create a partitioned text table whose partitions store data in 'encoding_name'.

	  The table is named '<encoding_name without dashes>_gen'. For each index in
	  range(len(file_paths)) a partition part='<index>' is added, its
	  "serialization.encoding" SerDe property is set to 'encoding_name', the table is
	  refreshed, and the partition is overwritten with the rows of the same partition
	  of 'utf8_table'. Only the length of 'file_paths' is used. Returns the
	  unqualified table name.
	  """
	  target = "{}_gen".format(encoding_name.replace('-', ''))
	  self.execute_query("""CREATE TABLE IF NOT EXISTS {}.{} (
	name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
	PARTITIONED BY (part STRING)
	STORED AS TEXTFILE""".format(db, target))

	  # Statement templates, filled in once per partition below.
	  add_partition = """ALTER TABLE {}.{} ADD PARTITION (part='{}')"""
	  set_encoding = """ALTER TABLE {}.{} PARTITION (part='{}')
	SET SERDEPROPERTIES("serialization.encoding"="{}")"""
	  refresh = """REFRESH {}.{}"""
	  copy_rows = """INSERT OVERWRITE TABLE {}.{} PARTITION (part='{}')
	SELECT name1, name2, name3, name4, name5 FROM {}.{} WHERE part='{}'"""

	  partition_count = len(file_paths)
	  for part_id in range(partition_count):
	    self.execute_query(add_partition.format(db, target, part_id))
	    self.execute_query(set_encoding.format(db, target, part_id, encoding_name))
	    self.execute_query(refresh.format(db, target))
	    self.execute_query(
	        copy_rows.format(db, target, part_id, db, utf8_table, part_id))
	  return target

	def test_enc_dec_gen_partitions(self, vector, unique_database):
	  """Round-trip generated data through a partitioned table in the charset under test.

	  Five files are generated and loaded as the partitions of a UTF-8 table. An
	  encoded table with the same partitions is filled from it, and the two tables
	  are compared using the generated row count.
	  """
	  db = unique_database
	  encoding_name = vector.get_value('charset')
	  charset = _charsets[encoding_name]
	  tmp_dir, file_paths, row_count = self.generate_text_files(
	      encoding_name, charset, "partitions", num_lines=10000, num_files=5)
	  try:
	    utf8_table = self.prepare_utf8_test_table_partitions(
	        db, file_paths, encoding_name, vector)
	  finally:
	    # The local files are only needed while loading; remove them even if the load
	    # fails so that a failed run does not leave the generated data behind.
	    shutil.rmtree(tmp_dir)
	  # Only the number of files is used from here on, so the deleted paths are fine.
	  encoded_table = self.prepare_encoded_test_table_partitions(db,
	      utf8_table, encoding_name, file_paths)
	  _compare_tables(self, db, utf8_table, encoded_table, row_count)


	class TestCharCodecGenMixed(ImpalaTestSuite):
	  """Round-trip test for a partitioned table whose partitions use different
	  "serialization.encoding" values, driven by randomly generated data."""

	  @classmethod
	  def add_test_dimensions(cls):
	    """Limit the test matrix to uncompressed text with 'disable_codegen' off."""
	    super(TestCharCodecGenMixed, cls).add_test_dimensions()
	    # There is no reason to run these tests using all dimensions.
	    cls.ImpalaTestMatrix.add_constraint(
	        lambda v: v.get_value('table_format').file_format == 'text'
	        and v.get_value('table_format').compression_codec == 'none')
	    cls.ImpalaTestMatrix.add_constraint(
	        lambda v: v.get_value('exec_option')['disable_codegen'] is False)

	  # Mixed Partitions Tests
	  ####################################################################
	  def generate_text_files_mixed(self, test_file, num_lines=10000, words_per_line=5,
	      num_files=1):
	    """Write 'num_files' UTF-8 text files, each based on a randomly chosen charset.

	    Every file receives num_lines // num_files lines, each consisting of
	    'words_per_line' comma-separated words produced by _generate_random_word() for
	    the charset picked for that file. The files are created in a new temporary
	    directory below $IMPALA_HOME/testdata; removing it is up to the caller.

	    Returns (tmp_dir, file_paths, encodings, row_count). encodings[i] is the charset
	    name chosen for file_paths[i], and row_count is the number of lines actually
	    written across all files.
	    """
	    lines_per_file = num_lines // num_files
	    file_paths = []
	    encodings = []
	    # random.choice() needs a sequence; build it once instead of once per file.
	    candidates = list(_charsets.items())
	    tmp_dir = tempfile.mkdtemp(dir=os.path.join(os.environ['IMPALA_HOME'], "testdata"))
	    for i in range(num_files):
	      encoding_name, charset = random.choice(candidates)
	      data_file_path = os.path.join(tmp_dir, "charcodec_{}_{}_utf8_{}.txt"
	          .format(encoding_name, test_file, i))
	      encodings.append(encoding_name)
	      file_paths.append(data_file_path)
	      with codecs.open(data_file_path, 'w', encoding='utf-8') as data_file:
	        for _ in range(lines_per_file):
	          words = [_generate_random_word(charset) for _ in range(words_per_line)]
	          data_file.write(u','.join(words) + u'\n')
	    # The integer division above drops any remainder, so report the number of lines
	    # that were really written rather than the requested 'num_lines'.
	    return tmp_dir, file_paths, encodings, lines_per_file * num_files

	  # Partitioned table with different encodings.
	  def prepare_utf8_test_table_partitions_mixed(self, db, file_paths, vector):
	    """Create table 'mixed_gen_utf8' and load one local file into each partition.

	    A partition part='<index>' is added for every entry of 'file_paths' and the file
	    is copied into the 'part=<index>' directory below the table location. Returns
	    the unqualified table name.
	    """
	    tbl_name = "mixed_gen_utf8"
	    self.execute_query("""CREATE TABLE IF NOT EXISTS {}.{} (
	name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
	PARTITIONED BY (part STRING)
	ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE"""
	        .format(db, tbl_name))
	    for i, file_path in enumerate(file_paths):
	      self.execute_query("""ALTER TABLE {}.{} ADD PARTITION (part='{}')"""
	          .format(db, tbl_name, i))
	      part_url = os.path.join(
	          self._get_table_location("{0}.{1}".format(db, tbl_name), vector),
	          "part={}".format(i))
	      self.filesystem_client.copy_from_local(file_path, part_url)
	    self.execute_query("""REFRESH {}.{}""".format(db, tbl_name))
	    return tbl_name

	  def prepare_encoded_test_table_partitions_mixed(self, db, utf8_table, encodings):
	    """Create table 'mixed_gen' with one partition per entry of 'encodings'.

	    Partition part='<index>' gets encodings[index] as its "serialization.encoding"
	    SerDe property and is then overwritten with the rows of the same partition of
	    'utf8_table'. Returns the unqualified table name.
	    """
	    encoded_table = "mixed_gen"
	    self.execute_query("""CREATE TABLE IF NOT EXISTS {}.{} (
	name1 STRING, name2 STRING, name3 STRING, name4 STRING, name5 STRING)
	PARTITIONED BY (part STRING)
	STORED AS TEXTFILE""".format(db, encoded_table))
	    for i, encoding in enumerate(encodings):
	      self.execute_query("""ALTER TABLE {}.{} ADD PARTITION (part='{}')"""
	          .format(db, encoded_table, i))
	      self.execute_query("""ALTER TABLE {}.{} PARTITION (part='{}')
	SET SERDEPROPERTIES("serialization.encoding"="{}")"""
	          .format(db, encoded_table, i, encoding))
	      self.execute_query("""REFRESH {}.{}""".format(db, encoded_table))
	      self.execute_query("""INSERT OVERWRITE TABLE {}.{} PARTITION (part='{}')
	SELECT name1, name2, name3, name4, name5 FROM {}.{} WHERE part='{}'"""
	          .format(db, encoded_table, i, db, utf8_table, i))
	    return encoded_table

	  def test_enc_dec_gen_partitions_mixed(self, unique_database, vector):
	    """Round-trip generated data through partitions that each use their own charset."""
	    db = unique_database
	    tmp_dir, file_paths, encodings, row_count = self.generate_text_files_mixed(
	        "mixed", num_lines=10000, num_files=5)
	    try:
	      utf8_table = self.prepare_utf8_test_table_partitions_mixed(db, file_paths, vector)
	    finally:
	      # The generated files sit below $IMPALA_HOME/testdata; remove them even when
	      # loading fails so that a failed run does not litter the source tree.
	      shutil.rmtree(tmp_dir)
	    encoded_table = self.prepare_encoded_test_table_partitions_mixed(db,
	        utf8_table, encodings)
	    _compare_tables(self, db, utf8_table, encoded_table, row_count)


	class TestCharCodecPreCreated(ImpalaTestSuite):
	@classmethod
	def add_test_dimensions(cls):
	  """Add the 'charset' dimension and limit the matrix to uncompressed text with
	  'disable_codegen' off."""
	  super(TestCharCodecPreCreated, cls).add_test_dimensions()

	  # Only run the tests for single 'gbk' encoding in non-exhaustive mode.
	  run_all = cls.exploration_strategy() == 'exhaustive'
	  charsets = [name for name in _charsets if run_all or name == 'gbk']
	  cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('charset', *charsets))

	  def uncompressed_text(v):
	    if v.get_value('table_format').file_format != 'text':
	      return False
	    return v.get_value('table_format').compression_codec == 'none'

	  def codegen_not_disabled(v):
	    return v.get_value('exec_option')['disable_codegen'] is False

	  # There is no reason to run these tests using all dimensions.
	  cls.ImpalaTestMatrix.add_constraint(uncompressed_text)
	  cls.ImpalaTestMatrix.add_constraint(codegen_not_disabled)

	def prepare_test_table(self, vector, db, tbl_name, datafile, encoding=None):
	  """Create a single-column text table and load a pre-created data file into it.

	  Dashes are removed from both 'tbl_name' and 'datafile'. The file is read from
	  $IMPALA_HOME/testdata/charcodec and copied to the table location, after which the
	  table is refreshed. If 'encoding' is truthy it is set as the table's
	  "serialization.encoding" SerDe property before the file is copied. Returns the
	  table name with dashes removed.
	  """
	  table = tbl_name.replace('-', '')
	  file_name = datafile.replace('-', '')

	  create_stmt = """CREATE TABLE IF NOT EXISTS {}.{} (name STRING)
	STORED AS TEXTFILE"""
	  self.execute_query(create_stmt.format(db, table))
	  if encoding:
	    alter_stmt = """ALTER TABLE {}.{} SET
	SERDEPROPERTIES("serialization.encoding"="{}")"""
	    self.execute_query(alter_stmt.format(db, table, encoding))

	  local_path = os.path.join(
	      os.environ['IMPALA_HOME'], "testdata", "charcodec", file_name)
	  table_location = self._get_table_location("{0}.{1}".format(db, table), vector)
	  self.filesystem_client.copy_from_local(local_path, table_location)
	  self.execute_query("""REFRESH {}.{}""".format(db, table))
	  return table

	def test_precreated_files(self, vector, unique_database):
	  """Read encoded precreated files."""
	  charset = vector.get_value('charset')
	  encoded_file = charset + '_names.txt'

	  reference_tbl = self.prepare_test_table(
	      vector, unique_database, charset + '_names_utf8', charset + '_names_utf8.txt',
	      None)

	  # Without SERDEPROPERTIES("serialization.encoding") data is read incorrectly
	  undeclared_tbl = self.prepare_test_table(
	      vector, unique_database, charset + '_names_none', encoded_file, None)
	  with pytest.raises(AssertionError) as mismatch:
	    _compare_tables(self, unique_database, reference_tbl, undeclared_tbl, 3)
	  # The comparison has to fail on an assertion whose message contains " == []".
	  assert " == []" in str(mismatch.value)

	  # With SERDEPROPERTIES("serialization.encoding") data is read correctly
	  declared_tbl = self.prepare_test_table(
	      vector, unique_database, charset + '_names', encoded_file, charset)
	  _compare_tables(self, unique_database, reference_tbl, declared_tbl, 3)

	def test_precreated_decoding_with_errors(self, vector, unique_database):
	  """Selecting from a table backed by the '<charset>_names_error.txt' file must fail
	  with a buffer conversion error."""
	  charset = vector.get_value('charset')
	  # Skip for promiscuous encodings: only 'gbk' and 'shift_jis' are exercised here.
	  if charset not in ('gbk', 'shift_jis'):
	    pytest.skip()
	  broken_tbl = self.prepare_test_table(
	      vector, unique_database, charset + '_names_error',
	      charset + '_names_error.txt', charset)
	  query = """select * from {}.{}""".format(unique_database, broken_tbl)
	  failure = self.execute_query_expect_failure(self.client, query)
	  assert "Error during buffer conversion: Conversion failed" in str(failure)

	def test_precreated_encoding_with_errors(self, vector, unique_database):
	  """Inserting binary data cast to STRING into an encoded table must fail with a
	  buffer conversion error."""
	  charset = vector.get_value('charset')
	  # Skip for promiscuous encodings: only 'gbk' and 'shift_jis' are exercised here.
	  if charset not in ('gbk', 'shift_jis'):
	    pytest.skip()
	  target_tbl = self.prepare_test_table(
	      vector, unique_database, charset + '_names_error',
	      charset + '_names_error.txt', charset)
	  insert_stmt = """insert overwrite {}.{}
	select cast(binary_col as string) from functional.binary_tbl"""
	  failure = self.execute_query_expect_failure(
	      self.client, insert_stmt.format(unique_database, target_tbl))
	  assert "Error during buffer conversion: Conversion failed" in str(failure)

	@SkipIfFS.hive
	def test_read_from_hive(self, unique_database, vector):
	  """Write table with Impala and read it back with Hive."""
	  charset = vector.get_value('charset')

	  source_tbl = self.prepare_test_table(
	      vector, unique_database, charset + '_names_utf8', charset + '_names_utf8.txt',
	      None)
	  target_tbl = self.prepare_test_table(
	      vector, unique_database, charset + '_names', charset + '_names.txt', charset)

	  # Replace the pre-created contents with rows written by Impala from the UTF-8 table.
	  copy_stmt = """insert overwrite {}.{} select * from {}.{}"""
	  self.execute_query(
	      copy_stmt.format(unique_database, target_tbl, unique_database, source_tbl))

	  select_names = """select name from {}.{}"""
	  hive_output = self.run_stmt_in_hive(
	      select_names.format(unique_database, target_tbl))
	  impala_result = self.client.execute(
	      select_names.format(unique_database, source_tbl))

	  # The first line of the Hive output is dropped (presumably a header line -
	  # confirm against run_stmt_in_hive) before comparing with Impala's rows.
	  hive_rows = hive_output.strip().split('\n')[1:]
	  assert hive_rows == impala_result.data