blob: 55c7fab283e6b52fcad8561974627a52fcee678f [file] [log] [blame]
# This tests parquet dictionary filtering.
====
---- QUERY
# id: All values pass
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where id < 10000;
---- RESULTS
7300
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# id: Some values pass
# Filters 7/8 row groups
select count(*) from functional_parquet.alltypes where mod(id, 10000) < 20;
---- RESULTS
20
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 23
====
---- QUERY
# id: No values pass
# Filters 8/8 row groups
select count(*) from functional_parquet.alltypes where mod(id,10000) = 7301;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 24
====
---- QUERY
# tinyint_col: All values pass
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where tinyint_col < 10;
---- RESULTS
7300
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# tinyint_col: No values pass
# Filters 8/8 row groups
select count(*) from functional_parquet.alltypes where mod(tinyint_col,50) > 10;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 24
====
---- QUERY
# smallint_col: All values pass
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where smallint_col < 10
---- RESULTS
7300
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# smallint_col: No values pass
# Filters 8/8 row groups
select count(*) from functional_parquet.alltypes where mod(smallint_col,50) > 10;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 24
====
---- QUERY
# int_col: All values pass
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where int_col < 10
---- RESULTS
7300
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# int_col: No values pass
# Filters 8/8 row groups
select count(*) from functional_parquet.alltypes where mod(int_col, 50) > 10;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 24
====
---- QUERY
# bigint_col: All values pass
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where bigint_col < 100
---- RESULTS
7300
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# bigint_col: No values pass
# Filters 8/8 row groups
select count(*) from functional_parquet.alltypes where mod(bigint_col, 500) > 100;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 24
====
---- QUERY
# float_col: All values pass
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where float_col < 10
---- RESULTS
7300
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# float_col: No values pass
# Filters 8/8 row groups
select count(*) from functional_parquet.alltypes where mod(float_col, 100) > 10;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 24
====
---- QUERY
# double_col: All values pass
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where double_col < 100
---- RESULTS
7300
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# double_col: No values pass
# Filters 8/8 row groups
select count(*) from functional_parquet.alltypes where mod(double_col, 100) > 100;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 24
====
---- QUERY
# date_string_col: All values pass
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where date_string_col like '%/%/%';
---- RESULTS
7300
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# date_string_col: Half of the values pass
# Filters 4/8 row groups
select count(*) from functional_parquet.alltypes where date_string_col like '%/10';
---- RESULTS
3650
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 12
====
---- QUERY
# date_string_col: No values pass
# Filters 8/8 row groups
select count(*) from functional_parquet.alltypes where date_string_col = '01/01/11';
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 24
====
---- QUERY
# string_col: All values pass
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where length(string_col) = 1 ;
---- RESULTS
7300
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# string_col: No values pass
# Filters 8/8 row groups
select count(*) from functional_parquet.alltypes where string_col = '10';
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 24
====
---- QUERY
# timestamp_col: All values pass
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where timestamp_col >= '2009-01-01 00:00:00';
---- RESULTS
7300
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# timestamp_col: No values pass
# Note: dictionary filtering currently does not work on timestamps
# Filters 0/8 row groups
select count(*) from functional_parquet.alltypes where timestamp_col = '2009-01-01 00:00:01';
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# date_col: All values pass
# Filters 0/4 row groups
select count(*) from functional_parquet.date_tbl where date_col > '2017-12-31';
---- RESULTS
4
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# date_col: No values pass
# Note: dictionary filtering currently does not work on dates (see IMPALA-4994)
# Filters 0/4 row groups
select count(*) from functional_parquet.date_tbl where date_col = '2017-12-31';
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# TPC-H lineitem.l_orderkey has high cardinality (1500000)
# It always uses mixed encoding (PLAIN_DICTIONARY + PLAIN)
# Verify that no dictionary filtering is used even for a predicate
# that eliminates all rows.
select count(*) from tpch_parquet.lineitem where l_orderkey = 50;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 3
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Verify dictionary filtering on top level of a schema with nested
# data.
select count(*) from tpch_nested_parquet.customer where c_mktsegment = 'COMEDY';
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 4
====
---- QUERY
# Verify that parquet_dictionary_filtering=false turns off dictionary filtering
# This query filters 8/8 row groups when filtering is on. It should now filter
# zero.
set parquet_dictionary_filtering=false;
select count(*) from functional_parquet.alltypes where mod(id,10000) = 7301;
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 24
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Nested dictionary filtering.
#
# Test coverage includes the following dimensions:
# - nested collection type: map, array, struct
# - number of filters and their nesting depth
# - number of projections and their nesting depth
# - required vs. non-required collections (outer vs. inner joins)
# - filter matches: some, none
# - count(*) optimization (exercises special code path)
# - multiple row-groups per file
#
# Map key at depth 1. All required. No matches. Only one row-group is dictionary filtered
# since only one (of two) row-groups is dictionary encoded for that column.
select id from functional_parquet.complextypestbl f, f.int_map m where m.key = 'k5'
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 1
====
---- QUERY
select count(*) from functional_parquet.complextypestbl.int_map m where m.key = 'k5'
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 1
====
---- QUERY
# Map key at depth 1. All required. Matches.
select id from functional_parquet.complextypestbl f, f.int_map m where m.key = 'k1'
---- RESULTS
8
1
2
7
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Map key at depth 1. None required. No matches.
select id from functional_parquet.complextypestbl f left outer join f.int_map m
where m.key = 'k5'
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Map key at depth 1. None required. Matches.
select id from functional_parquet.complextypestbl f left outer join f.int_map m
where m.key = 'k1'
---- RESULTS
8
1
2
7
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Map key at depth 1. All required. No matches. count(*).
select count(*) from functional_parquet.complextypestbl f, f.int_map m where m.key = 'k5'
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 1
---- QUERY
# Map key at depth 1. All required. Matches. count(*)
select count(*) from functional_parquet.complextypestbl f, f.int_map m where m.key = 'k1'
---- RESULTS
4
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Map key at depth 1. None required. No matches. count(*)
select count(*) from functional_parquet.complextypestbl f left outer join f.int_map m
where m.key = 'k5'
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Map key at depth 1. None required. Matches. count(*)
select count(*) from functional_parquet.complextypestbl f left outer join f.int_map m
where m.key = 'k1'
---- RESULTS
4
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Array struct value at depth 2. All required. Matches.
select r_name from tpch_nested_parquet.region r, r.r_nations n where n.n_name = 'FRANCE'
---- RESULTS
'EUROPE'
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 1
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Array struct value at depth 3. All required. No matches.
select c_name from tpch_nested_parquet.customer c, c.c_orders o, o.o_lineitems l
where l.l_returnflag = 'foo'
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 4
====
---- QUERY
# Array struct value at depth 3. All required. No matches.
# Multiple nested values projected.
select c_name, o.o_clerk from tpch_nested_parquet.customer c,
c.c_orders o, o.o_lineitems l
where l.l_returnflag = 'foo'
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 4
====
---- QUERY
# Array struct value at depth 3. Bottom not required. No matches.
# Multiple nested values projected.
select c_name, o.o_clerk from tpch_nested_parquet.customer c,
c.c_orders o left outer join o.o_lineitems l
where l.l_returnflag = 'foo'
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Array struct value at depth 3. Top not required. No matches.
# Multiple nested values projected.
select c_name, o.o_clerk from tpch_nested_parquet.customer c left outer join c.c_orders o,
o.o_lineitems l
where l.l_returnflag = 'foo'
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Array struct value at depth 3. All required. Multiple filters, multiple projections.
# Matches.
select c_name, o.o_clerk from tpch_nested_parquet.customer c,
c.c_orders o, o.o_lineitems l
where l.l_shipdate = '1994-08-19' and
l.l_receiptdate = '1994-08-24' and l.l_shipmode = 'RAIL' and l.l_returnflag = 'R'
---- RESULTS
'Customer#000013873','Clerk#000000554'
'Customer#000049757','Clerk#000000156'
'Customer#000037490','Clerk#000000026'
'Customer#000002836','Clerk#000000577'
'Customer#000004897','Clerk#000000112'
'Customer#000107891','Clerk#000000576'
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 0
---- QUERY
# Array struct value at depth 3. All required. No matches, count(*).
select count(*) from tpch_nested_parquet.customer c, c.c_orders o, o.o_lineitems l
where l.l_returnflag = 'foo'
---- RESULTS
0
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 4
====
---- QUERY
# Array struct values at depths 2 and 3. All required.
# Matches, multiple nested values projected.
select c_name, o.o_comment from tpch_nested_parquet.customer c,
c.c_orders o, o.o_lineitems l
where l.l_returnflag = 'foo' and o.o_clerk = 'foo'
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 4
====
---- QUERY
# Array struct value at depth 2. Not required. No match.
# Illustrates a case that should not be pruned.
select count(*) from tpch_nested_parquet.customer c left outer join
(select * from c.c_orders o where o.o_orderstatus = 'foo') v
---- RESULTS
150000
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 4
aggregation(SUM, NumDictFilteredRowGroups): 0
====
---- QUERY
# Tests dictionary filtering with files with multiple row-groups.
# Expect all results to be filtered.
select l.l_linenumber from functional_parquet.customer_multiblock c,
c.c_orders o, o.o_lineitems l
where l.l_linenumber + 1 < 0;
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 2
---- QUERY
# Tests dictionary filtering with files with multiple row-groups and count(*).
# Expect all results to be filtered.
select count(*) from functional_parquet.customer_multiblock c,
c.c_orders o, o.o_lineitems l
where l.l_linenumber + 1 < 0;
---- RESULTS
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 2
---- QUERY
# Tests dictionary filtering with files with multiple row-groups and count(*).
# Expect same result as obtained with dictionary filtering disabled.
select count(*) from functional_parquet.customer_multiblock c,
c.c_orders o, o.o_lineitems l
where l.l_linenumber > 0;
---- RESULTS
7786
---- RUNTIME_PROFILE
aggregation(SUM, NumRowGroups): 2
aggregation(SUM, NumDictFilteredRowGroups): 0