| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| |
| import getpass |
| import pytest |
| |
| from tests.common.impala_test_suite import ImpalaTestSuite |
| from tests.common.skip import SkipIfS3, SkipIfADLS, SkipIfIsilon, SkipIfLocal |
| from tests.common.test_dimensions import ( |
| create_single_exec_option_dimension, |
| create_uncompressed_text_dimension) |
| from tests.common.test_vector import ImpalaTestDimension |
| from tests.util.shell_util import exec_process |
| |
| TEST_DB = 'test_encryption_db' |
| |
| # PyWebHdfs takes absolute paths without a leading slash |
| PYWEBHDFS_TMP_DIR = 'tmp/test_encryption_load_data' |
| TMP_DIR = '/%s' % (PYWEBHDFS_TMP_DIR) |
| |
| |
| @SkipIfS3.hdfs_encryption |
| @SkipIfADLS.hdfs_encryption |
| @SkipIfIsilon.hdfs_encryption |
| @SkipIfLocal.hdfs_encryption |
| @pytest.mark.execute_serially |
| class TestHdfsEncryption(ImpalaTestSuite): |
| ''' Tests LOAD DATA commands work between HDFS encryption zones. |
| |
| A test directory is created with data to be loaded into a test table. |
| The test directory and/or the table directory are created as encryption |
| zones with different keys, and the LOAD DATA command is executed. The |
| tests operate on both partitioned and non-partitioned tables. |
| ''' |
| @classmethod |
| def get_workload(self): |
| return 'functional-query' |
| |
| @classmethod |
| def add_test_dimensions(cls): |
| super(TestHdfsEncryption, cls).add_test_dimensions() |
| cls.ImpalaTestMatrix.add_dimension(create_single_exec_option_dimension()) |
| cls.ImpalaTestMatrix.add_dimension( |
| create_uncompressed_text_dimension(cls.get_workload())) |
| |
| PARTITIONED = [True, False] |
| # For 'core', just test loading from a directory that is encrypted. |
| KEY_LOAD_DIR = ["testkey1"] |
| KEY_TBL_DIR = [None] |
| |
| if cls.exploration_strategy() == 'exhaustive': |
| KEY_LOAD_DIR = [None, "testkey1", "testkey2"] |
| KEY_TBL_DIR = [None, "testkey1", "testkey2"] |
| |
| cls.ImpalaTestMatrix.add_dimension( |
| ImpalaTestDimension('partitioned', *PARTITIONED)) |
| cls.ImpalaTestMatrix.add_dimension( |
| ImpalaTestDimension('key_load_dir', *KEY_LOAD_DIR)) |
| cls.ImpalaTestMatrix.add_dimension( |
| ImpalaTestDimension('key_tbl_dir', *KEY_TBL_DIR)) |
| cls.ImpalaTestMatrix.add_constraint(lambda v:\ |
| v.get_value('key_load_dir') is not None or\ |
| v.get_value('key_tbl_dir') is not None) |
| |
| def setup_method(self, method): |
| self.__cleanup() |
| self.client.execute('create database if not exists %s' % TEST_DB) |
| self.client.execute('use %s' % TEST_DB) |
| self.hdfs_client.make_dir(PYWEBHDFS_TMP_DIR) |
| # Few tests depend on the .Trash directory being present. In case it doesn't |
| # exist, we create a random text file and delete it so that hdfs recreates |
| # the hierarchy of trash |
| if not self.hdfs_client.exists("/user/{0}/.Trash/".format(getpass.getuser())): |
| self.hdfs_client.create_file("test-warehouse/random",file_data="random") |
| rc, stdout, stderr = exec_process("hadoop fs -rm /test-warehouse/random") |
| assert rc == 0, 'Error re-creating trash: %s %s' % (stdout, stderr) |
| |
| def teardown_method(self, method): |
| self.__cleanup() |
| # Clean up trash directory so that further tests aren't affected |
| rc, stdout, stderr = exec_process( |
| "hadoop fs -rmr /user/{0}/.Trash/".format(getpass.getuser())) |
| assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr) |
| |
| def create_encryption_zone(self, key, path): |
| """Creates an encryption zone using key 'key' on path 'path'""" |
| rc, stdout, stderr = exec_process( |
| "hdfs crypto -createZone -keyName %s -path %s" % (key, path)) |
| assert rc == 0, 'Error creating encryption zone: %s %s' % (stdout, stderr) |
| |
| def __cleanup(self): |
| self.client.execute('use default') |
| self.client.execute('drop table if exists %s.tbl purge' % TEST_DB) |
| self.client.execute('drop table if exists %s.t1 purge' % TEST_DB) |
| self.cleanup_db(TEST_DB) |
| self.hdfs_client.delete_file_dir(PYWEBHDFS_TMP_DIR, recursive=True) |
| |
| def test_load_data(self, vector): |
| key_tbl_dir = vector.get_value('key_tbl_dir') |
| key_load_dir = vector.get_value('key_load_dir') |
| |
| if vector.get_value('partitioned'): |
| src_file = "/test-warehouse/alltypes/year=2010/month=1/100101.txt" |
| src_tbl_schema = "functional.alltypes" |
| else: |
| src_file = "/test-warehouse/tinytable/data.csv" |
| src_tbl_schema = "functional.tinytable" |
| |
| if key_load_dir is not None: |
| rc, stdout, stderr = exec_process( |
| 'hdfs crypto -createZone -keyName %s -path %s' % (key_load_dir, TMP_DIR)) |
| assert rc == 0, 'Error executing hdfs crypto: %s %s' % (stdout, stderr) |
| |
| # hdfs_client doesn't support copy |
| rc, stdout, stderr = exec_process('hdfs dfs -cp %s %s' % (src_file, TMP_DIR)) |
| assert rc == 0, 'Error executing hdfs cp: %s %s' % (stdout, stderr) |
| |
| self.client.execute('create table tbl like %s' % (src_tbl_schema)) |
| |
| if key_tbl_dir is not None: |
| rc, stdout, stderr = exec_process( |
| 'hdfs crypto -createZone -keyName %s -path /test-warehouse/%s.db/tbl' % |
| (key_tbl_dir, TEST_DB)) |
| assert rc == 0, 'Error executing hdfs crypto: %s %s' % (stdout, stderr) |
| |
| if vector.get_value('partitioned'): |
| # insert a single value to create the partition spec |
| self.client.execute('insert into tbl partition (year=2010, month=1) '\ |
| 'values (0,true,0,0,0,0,0,0,NULL,NULL,NULL)') |
| self.client.execute('load data inpath \'%s\' into table tbl '\ |
| 'partition(year=2010, month=1)' % (TMP_DIR)) |
| else: |
| self.client.execute('load data inpath \'%s\' into table tbl ' % (TMP_DIR)) |
| |
| @SkipIfIsilon.hdfs_encryption |
| @pytest.mark.execute_serially |
| def test_drop_partition_encrypt(self): |
| """Verifies if alter <tbl> drop partition purge works in case |
| where the Trash dir and partition dir are in different encryption |
| zones. Check IMPALA-2310 for details""" |
| self.client.execute("create table {0}.t1(i int) partitioned\ |
| by (j int)".format(TEST_DB)) |
| # Add three partitions (j=1), (j=2), (j=3) to table t1 |
| self.client.execute("alter table {0}.t1 add partition(j=1)".format(TEST_DB)); |
| self.client.execute("alter table {0}.t1 add partition(j=2)".format(TEST_DB)); |
| self.client.execute("alter table {0}.t1 add partition(j=3)".format(TEST_DB)); |
| # Clean up the trash directory to create an encrypted zone |
| rc, stdout, stderr = exec_process( |
| "hadoop fs -rm -r /user/{0}/.Trash/*".format(getpass.getuser())) |
| assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr) |
| # Create the necessary encryption zones |
| self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t1/j=1"\ |
| .format(TEST_DB)) |
| self.create_encryption_zone("testkey2", "/test-warehouse/{0}.db/t1/j=2"\ |
| .format(TEST_DB)) |
| self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t1/j=3"\ |
| .format(TEST_DB)) |
| |
| # HDFS 2.8+ behavior is to create individual trash per encryption zone; |
| # don't create an encryption zone on .Trash in that case, otherwise |
| # recursive trash is created. |
| has_own_trash = self.hdfs_client.exists( |
| "/test-warehouse/{0}.db/t1/j=1/.Trash".format(TEST_DB)) |
| if not has_own_trash: |
| self.create_encryption_zone("testkey2", "/user/{0}/.Trash/".format(\ |
| getpass.getuser())) |
| |
| # Load sample data into the partition directories |
| self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=1/j1.txt"\ |
| .format(TEST_DB), file_data='j1') |
| self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=2/j2.txt"\ |
| .format(TEST_DB), file_data='j2') |
| self.hdfs_client.create_file("test-warehouse/{0}.db/t1/j=3/j3.txt"\ |
| .format(TEST_DB), file_data='j3') |
| |
| # Drop the partition (j=1) without purge and make sure partition directory still |
| # exists. This behavior is expected due to the difference in encryption zones |
| # between the .Trash and the warehouse directory (prior to HDFS 2.8) |
| if not has_own_trash: |
| self.execute_query_expect_failure(self.client, "alter table {0}.t1 drop \ |
| partition(j=1)".format(TEST_DB)); |
| assert self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB)) |
| assert self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".format(TEST_DB)) |
| else: |
| # HDFS 2.8+ behavior succeeds the query and creates trash; the partition removal |
| # ends up destroying the directories which moves this back to the user's trash |
| self.client.execute("alter table {0}.t1 drop partition(j=1)".format(TEST_DB)); |
| assert self.hdfs_client.exists( |
| "/user/{0}/.Trash/Current/test-warehouse/{1}.db/t1/j=1/j1.txt"\ |
| .format(getpass.getuser(), TEST_DB)) |
| assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1/j1.txt".format(TEST_DB)) |
| assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=1".format(TEST_DB)) |
| |
| # Drop the partition j=2 (with purge) and make sure the partition directory is deleted |
| self.client.execute("alter table {0}.t1 drop partition(j=2) purge".format(TEST_DB)) |
| assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2/j2.txt".format(TEST_DB)) |
| assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=2".format(TEST_DB)) |
| # Drop the partition j=3 (with purge) and make sure the partition is deleted |
| # This is the case where the trash directory and partition data directory |
| # are in different encryption zones. Using purge should delete the partition |
| # data pemanently by skipping trash |
| self.client.execute("alter table {0}.t1 drop partition(j=3) purge".format(TEST_DB)) |
| assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=3/j3.txt".format(TEST_DB)) |
| assert not self.hdfs_client.exists("test-warehouse/{0}.db/t1/j=3".format(TEST_DB)) |
| |
| @SkipIfIsilon.hdfs_encryption |
| @pytest.mark.execute_serially |
| def test_drop_table_encrypt(self): |
| """Verifies if drop <table> purge works in a case where Trash directory and table |
| directory in different encryption zones""" |
| self.client.execute("create table {0}.t3(i int)".format(TEST_DB)) |
| |
| # Clean up the trash directory to create an encrypted zone |
| rc, stdout, stderr = exec_process( |
| "hadoop fs -rmr /user/{0}/.Trash/*".format(getpass.getuser())) |
| assert rc == 0, 'Error deleting Trash: %s %s' % (stdout, stderr) |
| # Create table directory and trash directory in different encryption zones |
| self.create_encryption_zone("testkey1", "/test-warehouse/{0}.db/t3".format(TEST_DB)) |
| self.create_encryption_zone("testkey2", "/user/{0}/.Trash/".format(getpass.getuser())) |
| self.client.execute("drop table {0}.t3 purge".format(TEST_DB)) |
| assert not self.hdfs_client.exists("test-warehouse/{0}.db/t3".format(TEST_DB)) |