# blob: 4d7c202461610e06579ddc7718e7f2685ff40634 [file] [log] [blame]
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# Tests for IMPALA-1658
import pytest
import time
from tests.common.custom_cluster_test_suite import CustomClusterTestSuite
class TestHiveParquetTimestampConversion(CustomClusterTestSuite):
  '''Hive writes timestamps in parquet files by first converting values from local time
  to UTC. The conversion was not expected (other file formats don't convert) and a
  startup flag was later added to adjust for this (IMPALA-1658). This file tests that
  the conversion and flag behave as expected.
  '''

  # Local time zone names for which the tests expect the UTC <-> local conversion to
  # leave timestamp values unchanged.
  _UTC_TZ_NAMES = ("UTC", "GMT")

  @classmethod
  def add_test_dimensions(cls):
    '''Constrains the test matrix to parquet tables with compression codec 'none'.'''
    # NOTE(review): super() is anchored at CustomClusterTestSuite rather than at this
    # class, so CustomClusterTestSuite's own add_test_dimensions() is skipped. This
    # looks deliberate -- confirm before changing it.
    super(CustomClusterTestSuite, cls).add_test_dimensions()
    cls.ImpalaTestMatrix.add_constraint(lambda v:
        v.get_value('table_format').file_format == 'parquet' and
        v.get_value('table_format').compression_codec == 'none')

  @staticmethod
  def _local_tz_name():
    '''Returns the name of the local time zone as reported by the 'time' module,
    picking the DST name when DST is currently in effect.'''
    return time.tzname[time.localtime().tm_isdst]

  def check_sanity(self, expect_converted_result):
    '''Checks aggregates over timestamp_col of the Hive-written parquet table.

    The COUNT and COUNT(DISTINCT) values must always be 11000 and 10000. If
    'expect_converted_result' is true, MIN and MAX must differ from the reference
    values "2010-01-01 00:00:00" and "2010-01-10 18:02:05.100000000"; otherwise they
    must be equal to them.
    '''
    result = self.execute_query_expect_success(self.client, """
        SELECT COUNT(timestamp_col), COUNT(DISTINCT timestamp_col),
        MIN(timestamp_col), MAX(timestamp_col)
        FROM functional_parquet.alltypesagg_hive_13_1""")
    data = result.get_data()
    assert len(data) > 0
    # The query is a pure aggregate, so exactly one tab-separated row is expected.
    rows = data.split("\n")
    assert len(rows) == 1
    values = rows[0].split("\t")
    assert len(values) == 4
    assert values[0] == "11000"
    assert values[1] == "10000"
    if expect_converted_result:
      # Doing easy time zone conversion in python seems to require a 3rd party lib,
      # so the only check will be that the value changed in some way.
      assert values[2] != "2010-01-01 00:00:00"
      assert values[3] != "2010-01-10 18:02:05.100000000"
    else:
      assert values[2] == "2010-01-01 00:00:00"
      assert values[3] == "2010-01-10 18:02:05.100000000"

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args("-convert_legacy_hive_parquet_utc_timestamps=true")
  def test_conversion(self, vector):
    '''With the conversion flag enabled, every timestamp read from the Hive-written
    table must match the Impala-written value passed through FROM_UTC_TIMESTAMP for
    the local time zone, and NULL-ness must agree between the two tables.'''
    # Look the zone name up once and reuse it for both the sanity check and the query.
    tz_name = self._local_tz_name()
    self.check_sanity(tz_name not in self._UTC_TZ_NAMES)
    # The value read from the Hive table should be the same as reading a UTC converted
    # value from the Impala table.
    result = self.execute_query_expect_success(self.client, """
        SELECT h.id, h.day, h.timestamp_col, i.timestamp_col
        FROM functional_parquet.alltypesagg_hive_13_1 h
        JOIN functional_parquet.alltypesagg
        i ON i.id = h.id AND i.day = h.day -- serves as a unique key
        WHERE
        (h.timestamp_col IS NULL AND i.timestamp_col IS NOT NULL)
        OR (h.timestamp_col IS NOT NULL AND i.timestamp_col IS NULL)
        OR h.timestamp_col != FROM_UTC_TIMESTAMP(i.timestamp_col, '%s')
        """ % tz_name)
    data = result.get_data()
    # Any returned row is a mismatch between the two tables.
    assert len(data) == 0

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args("-convert_legacy_hive_parquet_utc_timestamps=false")
  def test_no_conversion(self, vector):
    '''With the conversion flag disabled, the Hive-written timestamps are read as
    stored: they differ from the locally converted Impala values unless the local
    zone is named UTC/GMT, and NULL-ness still agrees between the two tables.'''
    self.check_sanity(False)
    # Without conversion all the values will be different.
    tz_name = self._local_tz_name()
    result = self.execute_query_expect_success(self.client, """
        SELECT h.id, h.day, h.timestamp_col, i.timestamp_col
        FROM functional_parquet.alltypesagg_hive_13_1 h
        JOIN functional_parquet.alltypesagg
        i ON i.id = h.id AND i.day = h.day -- serves as a unique key
        WHERE h.timestamp_col != FROM_UTC_TIMESTAMP(i.timestamp_col, '%s')
        """ % tz_name)
    data = result.get_data()
    if tz_name in self._UTC_TZ_NAMES:
      assert len(data) == 0
    else:
      assert len(data.split('\n')) == 10000
    # A value should either stay null or stay not null.
    result = self.execute_query_expect_success(self.client, """
        SELECT h.id, h.day, h.timestamp_col, i.timestamp_col
        FROM functional_parquet.alltypesagg_hive_13_1 h
        JOIN functional_parquet.alltypesagg
        i ON i.id = h.id AND i.day = h.day -- serves as a unique key
        WHERE
        (h.timestamp_col IS NULL AND i.timestamp_col IS NOT NULL)
        OR (h.timestamp_col IS NOT NULL AND i.timestamp_col IS NULL)
        """)
    data = result.get_data()
    assert len(data) == 0