blob: d078cdff598dbba856b5756554d2919336a4eb13 [file] [log] [blame]
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
import os
from subprocess import check_call
from pytest import skip
from tests.beeswax.impala_beeswax import ImpalaBeeswaxException
from tests.common.impala_test_suite import ImpalaTestSuite
from tests.common.skip import (
SkipIfIsilon,
SkipIfS3,
SkipIfADLS,
SkipIfLocal)
from tests.common.test_vector import ImpalaTestDimension
from tests.util.filesystem_utils import WAREHOUSE, get_fs_path
class TestNestedTypes(ImpalaTestSuite):
    """Query tests over nested (complex) types, constrained to the Parquet format."""

    @classmethod
    def get_workload(cls):
        """Return the name of the workload this suite belongs to."""
        return 'functional-query'

    @classmethod
    def add_test_dimensions(cls):
        """Extend the inherited dimensions with a 'parquet' file-format constraint."""
        super(TestNestedTypes, cls).add_test_dimensions()
        cls.ImpalaTestMatrix.add_constraint(
            lambda v: v.get_value('table_format').file_format == 'parquet')

    def test_scanner_basic(self, vector):
        """Queries that do not materialize arrays."""
        self.run_test_case('QueryTest/nested-types-scanner-basic', vector)

    def test_scanner_array_materialization(self, vector):
        """Queries that materialize arrays."""
        self.run_test_case(
            'QueryTest/nested-types-scanner-array-materialization', vector)

    def test_scanner_multiple_materialization(self, vector):
        """Queries that materialize the same array multiple times."""
        self.run_test_case(
            'QueryTest/nested-types-scanner-multiple-materialization', vector)

    def test_scanner_position(self, vector):
        """Queries that materialize the artificial position element."""
        self.run_test_case('QueryTest/nested-types-scanner-position', vector)

    def test_scanner_map(self, vector):
        """Queries that materialize maps.

        Maps look like arrays of key/value structs, so most map functionality is
        already tested by the array tests.
        """
        self.run_test_case('QueryTest/nested-types-scanner-maps', vector)

    def test_runtime(self, vector):
        """Queries that send collections through the execution runtime."""
        self.run_test_case('QueryTest/nested-types-runtime', vector)

    def test_subplan(self, vector):
        """Test subplans with various exec nodes inside it."""
        self.run_test_case('QueryTest/nested-types-subplan', vector)

    def test_with_clause(self, vector):
        """Queries using nested types and with WITH clause."""
        self.run_test_case('QueryTest/nested-types-with-clause', vector)

    @SkipIfLocal.mem_usage_different
    def test_tpch(self, vector):
        """Queries over the larger nested TPCH dataset."""
        self.run_test_case('QueryTest/nested-types-tpch', vector)

    def test_parquet_stats(self, vector):
        """Queries that test evaluation of Parquet row group statistics."""
        self.run_test_case('QueryTest/nested-types-parquet-stats', vector)

    @SkipIfIsilon.hive
    @SkipIfS3.hive
    @SkipIfADLS.hive
    @SkipIfLocal.hive
    def test_upper_case_field_name(self, unique_database):
        """IMPALA-5994: Tests that a Hive-created table with a struct field name with
        upper case characters can be selected."""
        table_name = "%s.upper_case_test" % unique_database
        create_table = (
            "CREATE TABLE %s (s struct<upperCaseName:int>) STORED AS PARQUET"
            % table_name)
        self.run_stmt_in_hive(create_table)
        # The table was created through Hive, so invalidate its metadata before
        # querying it through the Impala client.
        self.client.execute("invalidate metadata %s" % table_name)
        # The field is referenced with a different capitalization than it was
        # declared with.
        self.client.execute("select s.UppercasenamE from %s" % table_name)
        self.client.execute("select s.* from %s" % table_name)

    def test_partitioned_table(self, vector, unique_database):
        """IMPALA-6370: Test that a partitioned table with nested types can be
        scanned."""
        table = "complextypes_partitioned"
        db_table = "{0}.{1}".format(unique_database, table)
        self.client.execute("""
CREATE EXTERNAL TABLE {0} (
id BIGINT,
int_array ARRAY<INT>,
int_array_array ARRAY<ARRAY<INT>>,
int_map MAP<STRING,INT>,
int_map_array ARRAY<MAP<STRING,INT>>,
nested_struct STRUCT<
a:INT,
b:ARRAY<INT>,
c:STRUCT<d:ARRAY<ARRAY<STRUCT<e:INT,f:STRING>>>>,
g:MAP<STRING,STRUCT<h:STRUCT<i:ARRAY<DOUBLE>>>>>
)
PARTITIONED BY (
part int
)
STORED AS PARQUET""".format(db_table))
        # Add multiple partitions pointing to the complextypes_tbl data. Every
        # partition uses the same location, so it is looked up only once.
        data_location = self._get_table_location(
            "functional_parquet.complextypestbl", vector)
        for partition in [1, 2]:
            self.client.execute(
                "ALTER TABLE {0} ADD PARTITION(part={1}) LOCATION '{2}'".format(
                    db_table, partition, data_location))
        self.run_test_case('QueryTest/nested-types-basic-partitioned', vector,
                           unique_database)
class TestParquetArrayEncodings(ImpalaTestSuite):
    """Tests scanning Parquet files with various array encodings under each value
    of the 'parquet_array_resolution' query option."""

    # Local directory holding the Parquet files that the tests copy into tables.
    TESTFILE_DIR = os.path.join(os.environ['IMPALA_HOME'],
                                "testdata/parquet_nested_types_encodings")

    # Values of the 'parquet_array_resolution' test dimension.
    ARRAY_RESOLUTION_POLICIES = [
        "three_level", "two_level", "two_level_then_three_level"]

    # The policies for which the tests below share the same expected results.
    _TWO_LEVEL_POLICIES = ("two_level", "two_level_then_three_level")

    @classmethod
    def get_workload(cls):
        """Return the name of the workload this suite belongs to."""
        return 'functional-query'

    @classmethod
    def add_test_dimensions(cls):
        """Add the 'parquet_array_resolution' dimension and a 'parquet' file-format
        constraint on top of the inherited dimensions."""
        super(TestParquetArrayEncodings, cls).add_test_dimensions()
        cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension(
            'parquet_array_resolution',
            *TestParquetArrayEncodings.ARRAY_RESOLUTION_POLICIES))
        cls.ImpalaTestMatrix.add_constraint(
            lambda v: v.get_value('table_format').file_format == 'parquet')

    def __init_arr_res(self, vector):
        """Return (arr_res, qopts) for the given test vector.

        'arr_res' is the vector's 'parquet_array_resolution' value. 'qopts' is a copy
        of the vector's 'exec_option' dict with 'parquet_array_resolution' set to
        'arr_res'. A copy is made so the vector's own options are not modified.
        """
        arr_res = vector.get_value('parquet_array_resolution')
        qopts = dict(vector.get_value('exec_option'))
        qopts['parquet_array_resolution'] = arr_res
        return (arr_res, qopts)

    # $ parquet-tools schema SingleFieldGroupInList.parquet
    # message SingleFieldGroupInList {
    #   optional group single_element_groups (LIST) {
    #     repeated group single_element_group {
    #       required int64 count;
    #     }
    #   }
    # }
    #
    # $ parquet-tools cat SingleFieldGroupInList.parquet
    # single_element_groups:
    # .single_element_group:
    # ..count = 1234
    # .single_element_group:
    # ..count = 2345
    def test_single_field_group_in_list(self, vector, unique_database):
        """Runs the single-field-group checks on SingleFieldGroupInList.parquet."""
        self.__test_single_field_group_in_list(
            unique_database, "SingleFieldGroupInList",
            "SingleFieldGroupInList.parquet", vector)

    # $ parquet-tools schema AvroSingleFieldGroupInList.parquet
    # message AvroSingleFieldGroupInList {
    #   optional group single_element_groups (LIST) {
    #     repeated group array {
    #       required int64 count;
    #     }
    #   }
    # }
    #
    # $ parquet-tools cat AvroSingleFieldGroupInList.parquet
    # single_element_groups:
    # .array:
    # ..count = 1234
    # .array:
    # ..count = 2345
    def test_avro_single_field_group_in_list(self, vector, unique_database):
        """Runs the single-field-group checks on AvroSingleFieldGroupInList.parquet."""
        self.__test_single_field_group_in_list(
            unique_database, "AvroSingleFieldGroupInList",
            "AvroSingleFieldGroupInList.parquet", vector)

    # $ parquet-tools schema ThriftSingleFieldGroupInList.parquet
    # message ThriftSingleFieldGroupInList {
    #   optional group single_element_groups (LIST) {
    #     repeated group single_element_groups_tuple {
    #       required int64 count;
    #     }
    #   }
    # }
    #
    # $ parquet-tools cat ThriftSingleFieldGroupInList.parquet
    # single_element_groups:
    # .single_element_groups_tuple:
    # ..count = 1234
    # .single_element_groups_tuple:
    # ..count = 2345
    def test_thrift_single_field_group_in_list(self, vector, unique_database):
        """Runs the single-field-group checks on
        ThriftSingleFieldGroupInList.parquet."""
        self.__test_single_field_group_in_list(
            unique_database, "ThriftSingleFieldGroupInList",
            "ThriftSingleFieldGroupInList.parquet", vector)

    def __test_single_field_group_in_list(self, db, tbl, parq_file, vector):
        """Creates table 'db.tbl' over 'parq_file' and checks its array contents.

        The column type and the queries depend on the array resolution policy: the
        two-level policies declare an array of single-field structs, while
        'three_level' declares an array of bigints. Both expect the values 1234 and
        2345 and an array length of 2.
        """
        arr_res, qopts = self.__init_arr_res(vector)
        full_name = "%s.%s" % (db, tbl)
        expected_values = ['1234', '2345']
        if arr_res in self._TWO_LEVEL_POLICIES:
            self._create_test_table(
                db, tbl, parq_file, "col1 array<struct<f1: bigint>>")
            result = self.execute_query(
                "select item.f1 from %s.col1" % full_name, qopts)
            assert result.data == expected_values
            result = self.execute_query(
                "select item.f1 from %s t, t.col1" % full_name, qopts)
            assert result.data == expected_values
        elif arr_res == "three_level":
            self._create_test_table(db, tbl, parq_file, "col1 array<bigint>")
            result = self.execute_query(
                "select item from %s.col1" % full_name, qopts)
            assert result.data == expected_values
            result = self.execute_query(
                "select item from %s t, t.col1" % full_name, qopts)
            assert result.data == expected_values
        result = self.execute_query(
            "select cnt from %s t, (select count(*) cnt from t.col1) v" % full_name,
            qopts)
        assert result.data == ['2']

    # $ parquet-tools schema AvroPrimitiveInList.parquet
    # message AvroPrimitiveInList {
    #   required group list_of_ints (LIST) {
    #     repeated int32 array;
    #   }
    # }
    #
    # $ parquet-tools cat AvroPrimitiveInList.parquet
    # list_of_ints:
    # .array = 34
    # .array = 35
    # .array = 36
    def test_avro_primitive_in_list(self, vector, unique_database):
        """Runs the primitive-in-list checks on AvroPrimitiveInList.parquet."""
        self.__test_primitive_in_list(
            unique_database, "AvroPrimitiveInList",
            "AvroPrimitiveInList.parquet", vector)

    # $ parquet-tools schema ThriftPrimitiveInList.parquet
    # message ThriftPrimitiveInList {
    #   required group list_of_ints (LIST) {
    #     repeated int32 list_of_ints_tuple;
    #   }
    # }
    #
    # $ parquet-tools cat ThriftPrimitiveInList.parquet
    # list_of_ints:
    # .list_of_ints_tuple = 34
    # .list_of_ints_tuple = 35
    # .list_of_ints_tuple = 36
    def test_thrift_primitive_in_list(self, vector, unique_database):
        """Runs the primitive-in-list checks on ThriftPrimitiveInList.parquet."""
        self.__test_primitive_in_list(
            unique_database, "ThriftPrimitiveInList",
            "ThriftPrimitiveInList.parquet", vector)

    def __test_primitive_in_list(self, db, tbl, parq_file, vector):
        """Creates table 'db.tbl' (col1 array<int>) over 'parq_file' and checks it.

        The two-level policies expect the items 34, 35, 36; 'three_level' expects
        three NULL items. All policies expect an array length of 3.
        """
        arr_res, qopts = self.__init_arr_res(vector)
        full_name = "%s.%s" % (db, tbl)
        self._create_test_table(db, tbl, parq_file, "col1 array<int>")
        if arr_res in self._TWO_LEVEL_POLICIES:
            expected_items = ['34', '35', '36']
        elif arr_res == "three_level":
            expected_items = ['NULL', 'NULL', 'NULL']
        else:
            # Unknown policy: only the count check below applies.
            expected_items = None
        if expected_items is not None:
            result = self.execute_query(
                "select item from %s.col1" % full_name, qopts)
            assert result.data == expected_items
            result = self.execute_query(
                "select item from %s t, t.col1" % full_name, qopts)
            assert result.data == expected_items
        result = self.execute_query(
            "select cnt from %s t, (select count(*) cnt from t.col1) v" % full_name,
            qopts)
        assert result.data == ['3']

    # $ parquet-tools schema bad-avro.parquet
    # message org.apache.spark.sql.execution.datasources.parquet.test.avro.AvroArrayOfArray {
    #   required group int_arrays_column (LIST) {
    #     repeated group array (LIST) {
    #       repeated int32 array;
    #     }
    #   }
    # }
    #
    # $ parquet-tools cat bad-avro.parquet
    # int_arrays_column:
    # .array:
    # ..array = 0
    # ..array = 1
    # ..array = 2
    # .array:
    # ..array = 3
    # ..array = 4
    # ..array = 5
    # .array:
    # ..array = 6
    # ..array = 7
    # ..array = 8
    #
    # int_arrays_column:
    # .array:
    # ..array = 0
    # ..array = 1
    # ..array = 2
    # .array:
    # ..array = 3
    # ..array = 4
    # ..array = 5
    # .array:
    # ..array = 6
    # ..array = 7
    # ..array = 8
    #
    # [Same int_arrays_column repeated 8x more]
    def test_avro_array_of_arrays(self, vector, unique_database):
        """Runs the array-of-arrays checks on bad-avro.parquet, whose rows repeat the
        same data 10 times."""
        self.__test_array_of_arrays(
            unique_database, "AvroArrayOfArrays", "bad-avro.parquet", vector, 10)

    # $ parquet-tools schema bad-thrift.parquet
    # message ParquetSchema {
    #   required group intListsColumn (LIST) {
    #     repeated group intListsColumn_tuple (LIST) {
    #       repeated int32 intListsColumn_tuple_tuple;
    #     }
    #   }
    # }
    #
    # $ parquet-tools cat bad-thrift.parquet
    # intListsColumn:
    # .intListsColumn_tuple:
    # ..intListsColumn_tuple_tuple = 0
    # ..intListsColumn_tuple_tuple = 1
    # ..intListsColumn_tuple_tuple = 2
    # .intListsColumn_tuple:
    # ..intListsColumn_tuple_tuple = 3
    # ..intListsColumn_tuple_tuple = 4
    # ..intListsColumn_tuple_tuple = 5
    # .intListsColumn_tuple:
    # ..intListsColumn_tuple_tuple = 6
    # ..intListsColumn_tuple_tuple = 7
    # ..intListsColumn_tuple_tuple = 8
    def test_thrift_array_of_arrays(self, vector, unique_database):
        """Runs the array-of-arrays checks on bad-thrift.parquet (a single row)."""
        self.__test_array_of_arrays(
            unique_database, "ThriftArrayOfArrays", "bad-thrift.parquet", vector, 1)

    def __test_array_of_arrays(self, db, tbl, parq_file, vector, mult):
        """Creates table 'db.tbl' (col1 array<array<int>>) over 'parq_file' and
        checks it.

        'mult' is the number of times the expected per-row results are repeated.
        The two-level policies expect the values 0..8 and nested array lengths of 3.
        For 'three_level', any error raised by the queries must mention an
        incompatible Parquet schema.
        """
        arr_res, qopts = self.__init_arr_res(vector)
        full_name = "%s.%s" % (db, tbl)
        self._create_test_table(db, tbl, parq_file, "col1 array<array<int>>")
        if arr_res in self._TWO_LEVEL_POLICIES:
            flattened = ['0', '1', '2', '3', '4', '5', '6', '7', '8'] * mult
            result = self.execute_query(
                "select item from %s.col1.item" % full_name, qopts)
            assert result.data == flattened
            # Pass qopts here as well so that this query also runs with the array
            # resolution policy under test.
            result = self.execute_query(
                "select a2.item from %s t, t.col1 a1, a1.item a2" % full_name, qopts)
            assert result.data == flattened
            result = self.execute_query(
                "select cnt from %s t, (select count(*) cnt from t.col1) v"
                % full_name, qopts)
            assert result.data == ['3'] * mult
            result = self.execute_query(
                "select cnt from %s t, t.col1 a1, (select count(*) cnt from a1.item) v"
                % full_name, qopts)
            assert result.data == ['3', '3', '3'] * mult
        elif arr_res == "three_level":
            expected_err = "has an incompatible Parquet schema"
            error_queries = [
                "select item from %s.col1.item" % full_name,
                "select cnt from %s t, (select count(*) cnt from t.col1) v"
                % full_name,
            ]
            for query in error_queries:
                # NOTE(review): a query that succeeds is not treated as a failure
                # here; only the message of a raised error is checked. Confirm
                # whether both queries are required to fail before tightening this.
                try:
                    self.execute_query(query, qopts)
                except Exception as e:
                    assert expected_err in str(e)

    # $ parquet-tools schema UnannotatedListOfPrimitives.parquet
    # message UnannotatedListOfPrimitives {
    #   repeated int32 list_of_ints;
    # }
    #
    # $ parquet-tools cat UnannotatedListOfPrimitives.parquet
    # list_of_ints = 34
    # list_of_ints = 35
    # list_of_ints = 36
    def test_unannotated_list_of_primitives(self, vector, unique_database):
        """Checks that an unannotated repeated int32 field reads as array<int> with
        the same results for every array resolution policy."""
        _, qopts = self.__init_arr_res(vector)
        tablename = "UnannotatedListOfPrimitives"
        full_name = "%s.%s" % (unique_database, tablename)
        self._create_test_table(
            unique_database, tablename, "UnannotatedListOfPrimitives.parquet",
            "col1 array<int>")
        result = self.execute_query("select item from %s.col1" % full_name, qopts)
        assert result.data == ['34', '35', '36']
        result = self.execute_query(
            "select item from %s t, t.col1" % full_name, qopts)
        assert result.data == ['34', '35', '36']
        result = self.execute_query(
            "select cnt from %s t, (select count(*) cnt from t.col1) v" % full_name,
            qopts)
        assert result.data == ['3']

    # $ parquet-tools schema UnannotatedListOfGroups.parquet
    # message UnannotatedListOfGroups {
    #   repeated group list_of_points {
    #     required float x;
    #     required float y;
    #   }
    # }
    #
    # $ parquet-tools cat UnannotatedListOfGroups.parquet
    # list_of_points:
    # .x = 1.0
    # .y = 1.0
    # list_of_points:
    # .x = 2.0
    # .y = 2.0
    def test_unannotated_list_of_groups(self, vector, unique_database):
        """Checks that an unannotated repeated group reads as an array of structs
        with the same results for every array resolution policy."""
        _, qopts = self.__init_arr_res(vector)
        tablename = "UnannotatedListOfGroups"
        full_name = "%s.%s" % (unique_database, tablename)
        self._create_test_table(
            unique_database, tablename, "UnannotatedListOfGroups.parquet",
            "col1 array<struct<f1: float, f2: float>>")
        result = self.execute_query("select f1, f2 from %s.col1" % full_name, qopts)
        assert result.data == ['1\t1', '2\t2']
        result = self.execute_query(
            "select f1, f2 from %s t, t.col1" % full_name, qopts)
        assert result.data == ['1\t1', '2\t2']
        result = self.execute_query(
            "select cnt from %s t, (select count(*) cnt from t.col1) v" % full_name,
            qopts)
        assert result.data == ['2']

    # $ parquet-tools schema AmbiguousList_Modern.parquet
    # message org.apache.impala.nested {
    #   required group ambigArray (LIST) {
    #     repeated group list {
    #       required group element {
    #         required group s2 {
    #           optional int32 f21;
    #           optional int32 f22;
    #         }
    #         optional int32 F11;
    #         optional int32 F12;
    #       }
    #     }
    #   }
    # }
    # $ parquet-tools cat AmbiguousList_Modern.parquet
    # ambigArray:
    # .list:
    # ..element:
    # ...s2:
    # ....f21 = 21
    # ....f22 = 22
    # ...F11 = 11
    # ...F12 = 12
    # .list:
    # ..element:
    # ...s2:
    # ....f21 = 210
    # ....f22 = 220
    # ...F11 = 110
    # ...F12 = 120
    #
    # $ parquet-tools schema AmbiguousList_Legacy.parquet
    # message org.apache.impala.nested {
    #   required group ambigArray (LIST) {
    #     repeated group array {
    #       required group s2 {
    #         optional int32 f21;
    #         optional int32 f22;
    #       }
    #       optional int32 F11;
    #       optional int32 F12;
    #     }
    #   }
    # }
    # $ parquet-tools cat AmbiguousList_Legacy.parquet
    # ambigArray:
    # .array:
    # ..s2:
    # ...f21 = 21
    # ...f22 = 22
    # ..F11 = 11
    # ..F12 = 12
    # .array:
    # ..s2:
    # ...f21 = 210
    # ...f22 = 220
    # ..F11 = 110
    # ..F12 = 120
    def test_ambiguous_list(self, vector, unique_database):
        """IMPALA-4725: Tests the schema-resolution behavior with different values for
        the PARQUET_ARRAY_RESOLUTION and PARQUET_FALLBACK_SCHEMA_RESOLUTION query
        options.

        The schema of the Parquet test files is constructed to induce incorrect
        results with index-based resolution and the default
        TWO_LEVEL_THEN_THREE_LEVEL array resolution policy. Regardless of whether the
        Parquet data files use the 2-level or 3-level encoding, incorrect results may
        be returned if the array resolution does not exactly match the data files'.
        The name-based policy generally does not have this problem because it avoids
        traversing incorrect schema paths.
        """
        # The Parquet resolution policy is manually set in the .test files, so one
        # value of the dimension is enough to cover them.
        if vector.get_value('parquet_array_resolution') != "three_level":
            skip("Test only run with three_level")

        # Both tables declare the same column type; only the data file encoding
        # differs.
        ambig_columns = (
            "ambigarray array<struct<s2:struct<f21:int,f22:int>,f11:int,f12:int>>")

        ambig_modern_tbl = "ambig_modern"
        self._create_test_table(unique_database, ambig_modern_tbl,
                                "AmbiguousList_Modern.parquet", ambig_columns)
        self.run_test_case('QueryTest/parquet-ambiguous-list-modern',
                           vector, unique_database)

        ambig_legacy_tbl = "ambig_legacy"
        self._create_test_table(unique_database, ambig_legacy_tbl,
                                "AmbiguousList_Legacy.parquet", ambig_columns)
        self.run_test_case('QueryTest/parquet-ambiguous-list-legacy',
                           vector, unique_database)

    def _create_test_table(self, dbname, tablename, filename, columns):
        """Creates a table in the given database with the given name and columns.
        Copies the file with the given name from TESTFILE_DIR into the table."""
        location = get_fs_path("/test-warehouse/%s.db/%s" % (dbname, tablename))
        self.client.execute(
            "create table %s.%s (%s) stored as parquet location '%s'" %
            (dbname, tablename, columns, location))
        local_path = os.path.join(self.TESTFILE_DIR, filename)
        check_call(["hadoop", "fs", "-put", local_path, location], shell=False)
class TestMaxNestingDepth(ImpalaTestSuite):
    """Tests complex types at, and just above, the maximum nesting depth."""

    # Should be kept in sync with the FE's Type.MAX_NESTING_DEPTH
    MAX_NESTING_DEPTH = 100

    @classmethod
    def get_workload(cls):
        """Return the name of the workload this suite belongs to."""
        return 'functional-query'

    @classmethod
    def add_test_dimensions(cls):
        """Extend the inherited dimensions with a 'parquet' file-format constraint."""
        super(TestMaxNestingDepth, cls).add_test_dimensions()
        cls.ImpalaTestMatrix.add_constraint(
            lambda v: v.get_value('table_format').file_format == 'parquet')

    def test_max_nesting_depth(self, vector, unique_database):
        """Tests that Impala can scan Parquet files having complex types of
        the maximum nesting depth."""
        # Copy the local test data directory into the unique database's directory
        # under the warehouse before running the queries against it.
        check_call(["hdfs", "dfs", "-copyFromLocal",
                    "%s/testdata/max_nesting_depth" % os.environ['IMPALA_HOME'],
                    "%s/%s.db/" % (WAREHOUSE, unique_database)], shell=False)
        self.run_test_case('QueryTest/max-nesting-depth', vector, unique_database)

    @SkipIfIsilon.hive
    @SkipIfS3.hive
    @SkipIfADLS.hive
    @SkipIfLocal.hive
    def test_load_hive_table(self, vector, unique_database):
        """Tests that Impala rejects Hive-created tables with complex types that
        exceed the maximum nesting depth."""
        # Type with a nesting depth of MAX_NESTING_DEPTH + 1
        type_sql = "%sint%s" % ("array<" * self.MAX_NESTING_DEPTH,
                                ">" * self.MAX_NESTING_DEPTH)
        create_table_sql = "CREATE TABLE %s.above_max_depth (f %s) STORED AS PARQUET" % (
            unique_database, type_sql)
        self.run_stmt_in_hive(create_table_sql)
        self.client.execute(
            "invalidate metadata %s.above_max_depth" % unique_database)
        try:
            self.client.execute(
                "explain select 1 from %s.above_max_depth" % unique_database)
        except ImpalaBeeswaxException as e:
            assert "Type exceeds the maximum nesting depth" in str(e)
        else:
            # The statement above is expected to raise; reaching this point means
            # the over-deep type was accepted.
            assert False, "Expected table loading to fail."