IMPALA-6538: Fix read path when Parquet min/max statistics contain NaN If the first number in a row group written by Impala is NaN, then Impala writes incorrect statistics in the metadata. This will result in incorrect results when filtering the data. This commit fixes the read path when encountering NaNs in Parquet min/max statistics. If min and max are both NaN, we can't use the statistics at all. If only one of them is NaN, the other still can be used. I added some tests to QueryTest/parqet-stats.test Change-Id: If3897fc1426541239223670812f59e2bed32f455 Reviewed-on: http://gerrit.cloudera.org:8080/9358 Reviewed-by: Tim Armstrong <tarmstrong@cloudera.com> Tested-by: Impala Public Jenkins

commit: 881e00a8bff0469ab7860bcd0d4d4794fb04a4b8 [log] [tgz]
author: Zoltan Borok-Nagy <boroknagyz@cloudera.com> Mon Feb 19 15:26:42 2018 +0100
committer: Impala Public Jenkins <impala-public-jenkins@gerrit.cloudera.org> Thu Feb 22 00:57:46 2018 +0000
tree: 373667e0ad56b31a616286fe1aef5cf0dfed5c28
parent: a25f69355fa6cccaabcea86a434c4ca61ed7c8ac [diff]
diff --git a/be/src/exec/parquet-column-stats.cc b/be/src/exec/parquet-column-stats.cc
index a1d1155..2f6f7fc 100644
--- a/be/src/exec/parquet-column-stats.cc
+++ b/be/src/exec/parquet-column-stats.cc

@@ -18,6 +18,7 @@
 #include "parquet-column-stats.inline.h"
 
 #include <algorithm>
+#include <cmath>
 #include <limits>
 
 #include "common/names.h"
@@ -94,11 +95,13 @@
       return ColumnStats<int64_t>::DecodePlainValue(*stat_value, slot,
           col_chunk.meta_data.type);
     case TYPE_FLOAT:
+      // IMPALA-6527, IMPALA-6538: ignore min/max stats if NaN
       return ColumnStats<float>::DecodePlainValue(*stat_value, slot,
-          col_chunk.meta_data.type);
+          col_chunk.meta_data.type) && !std::isnan(*reinterpret_cast<float*>(slot));
     case TYPE_DOUBLE:
+      // IMPALA-6527, IMPALA-6538: ignore min/max stats if NaN
       return ColumnStats<double>::DecodePlainValue(*stat_value, slot,
-          col_chunk.meta_data.type);
+          col_chunk.meta_data.type) && !std::isnan(*reinterpret_cast<double*>(slot));
     case TYPE_TIMESTAMP:
       return ColumnStats<TimestampValue>::DecodePlainValue(*stat_value, slot,
           col_chunk.meta_data.type);

diff --git a/testdata/data/README b/testdata/data/README
index 2b92a0a..e293c3c 100644
--- a/testdata/data/README
+++ b/testdata/data/README

@@ -148,3 +148,10 @@
     optional int32 int_col;
     optional int64 bigint_col;
   }
+
+min_max_is_nan.parquet:
+Generated by Impala's Parquet writer before the fix for IMPALA-6527. Git hash: 3a049a53
+Created to test the read path for a Parquet file with invalid metadata, namely when
+'max_value' and 'min_value' are both NaN. Contains 2 single-column rows:
+NaN
+42

diff --git a/testdata/data/min_max_is_nan.parquet b/testdata/data/min_max_is_nan.parquet
new file mode 100644
index 0000000..7fedf3a
--- /dev/null
+++ b/testdata/data/min_max_is_nan.parquet
Binary files differ

diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-invalid-minmax-stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-invalid-minmax-stats.test
new file mode 100644
index 0000000..c2044ae
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-invalid-minmax-stats.test

@@ -0,0 +1,32 @@
+====
+---- QUERY
+# IMPALA-6527, IMPALA-6538: NaN values lead to incorrect filtering.
+# When the first value is NaN in a column chunk, Impala might choose it as min_value and
+# max_value for statistics. In this case the min/max filter should be ignored.
+# 'min_max_is_nan' is written by an old writer, therefore it contains invalid statistics.
+select * from min_max_is_nan where val > 0
+---- RESULTS
+42
+====
+---- QUERY
+# IMPALA-6527, IMPALA-6538: NaN values lead to incorrect filtering
+# test equality predicate
+select * from min_max_is_nan where val = 42
+---- RESULTS
+42
+====
+---- QUERY
+# IMPALA-6527: NaN values lead to incorrect filtering
+# test predicate that is true for NaN
+select * from min_max_is_nan where not val >= 0
+---- RESULTS
+NaN
+====
+---- QUERY
+# IMPALA-6527: NaN values lead to incorrect filtering
+# test predicate that is true for NaN
+select * from min_max_is_nan where val != 0
+---- RESULTS
+NaN
+42
+====

diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
index 70b5f27..273dff8 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/parquet-stats.test

@@ -491,3 +491,80 @@
 aggregation(SUM, NumRowGroups): 1
 aggregation(SUM, NumStatsFilteredRowGroups): 0
 ====
+---- QUERY
+# IMPALA-6527, IMPALA-6538: NaN values lead to incorrect filtering.
+# When the first value is NaN in a column chunk, Impala chooses it as min_value and
+# max_value for statistics. In this case the min/max filter should be ignored.
+create table test_nan(val double) stored as parquet;
+insert into test_nan values (cast('NaN' as double)), (42);
+select * from test_nan where val > 0
+---- RESULTS
+42
+====
+---- QUERY
+# IMPALA-6527, IMPALA-6538: NaN values lead to incorrect filtering
+# test equality predicate
+select * from test_nan where val = 42
+---- RESULTS
+42
+====
+---- QUERY
+# IMPALA-6527: NaN values lead to incorrect filtering
+# test predicate that is true for NaN
+select * from test_nan where not val >= 0
+---- RESULTS
+NaN
+====
+---- QUERY
+# IMPALA-6527: NaN values lead to incorrect filtering
+# test predicate that is true for NaN
+select * from test_nan where val != 0
+---- RESULTS
+NaN
+42
+====
+---- QUERY
+# Statistics filtering must not filter out row groups if predicate can be true for NaN
+create table test_nan_true_predicate(val double) stored as parquet;
+insert into test_nan_true_predicate values (10), (20), (cast('NaN' as double));
+select * from test_nan_true_predicate where not val >= 0
+---- RESULTS
+NaN
+====
+---- QUERY
+# NaN is the last element, predicate is true for NaN and value
+select * from test_nan_true_predicate where not val >= 20
+---- RESULTS
+10
+NaN
+====
+---- QUERY
+# NaN is the last element, predicate is true for NaN and value
+select * from test_nan_true_predicate where val != 10
+---- RESULTS
+20
+NaN
+====
+---- QUERY
+# Test the case when NaN is inserted between two values
+# Test predicate true for NaN and false for the values
+create table test_nan_in_the_middle(val double) stored as parquet;
+insert into test_nan_in_the_middle values (10), (cast('NaN' as double)), (20);
+select * from test_nan_in_the_middle where not val >= 0
+---- RESULTS
+NaN
+====
+---- QUERY
+# NaN in the middle, predicate true for NaN and value
+select * from test_nan_in_the_middle where not val >= 20
+---- RESULTS
+10
+NaN
+====
+---- QUERY
+# NaN in the middle, '!=' should return NaN and value
+select * from test_nan_in_the_middle where val != 10
+---- RESULTS
+NaN
+20
+====

diff --git a/tests/query_test/test_parquet_stats.py b/tests/query_test/test_parquet_stats.py
index 00afd09..3f8cd2f 100644
--- a/tests/query_test/test_parquet_stats.py
+++ b/tests/query_test/test_parquet_stats.py

@@ -69,3 +69,19 @@
     # skipped inside a fragment, so we ensure that the tests run in a single fragment.
     vector.get_value('exec_option')['num_nodes'] = 1
     self.run_test_case('QueryTest/parquet-deprecated-stats', vector, unique_database)
+
+  def test_invalid_stats(self, vector, unique_database):
+    """IMPALA-6538" Test that reading parquet files with statistics with invalid
+    'min_value'/'max_value' fields works correctly. 'min_value' and 'max_value' are both
+    NaNs, therefore we need to ignore them"""
+    table_name = 'min_max_is_nan'
+    self.client.execute('create table %s.%s (val double) stored as parquet' %
+                       (unique_database, table_name))
+    table_location = get_fs_path('/test-warehouse/%s.db/%s' %
+                                 (unique_database, table_name))
+    local_file = os.path.join(os.environ['IMPALA_HOME'],
+                              'testdata/data/min_max_is_nan.parquet')
+    assert os.path.isfile(local_file)
+    check_call(['hdfs', 'dfs', '-copyFromLocal', local_file, table_location])
+    self.client.execute('invalidate metadata %s.%s' % (unique_database, table_name))
+    self.run_test_case('QueryTest/parquet-invalid-minmax-stats', vector, unique_database)
\ No newline at end of file
commit	881e00a8bff0469ab7860bcd0d4d4794fb04a4b8	[log] [tgz]
author	Zoltan Borok-Nagy <boroknagyz@cloudera.com>	Mon Feb 19 15:26:42 2018 +0100
committer	Impala Public Jenkins <impala-public-jenkins@gerrit.cloudera.org>	Thu Feb 22 00:57:46 2018 +0000
tree	373667e0ad56b31a616286fe1aef5cf0dfed5c28
parent	a25f69355fa6cccaabcea86a434c4ca61ed7c8ac [diff]