IMPALA-8184: Add timestamp validation to ORC scanner

Hive can write timestamps that are outside Impala's valid
range (Impala: 1400-9999, Hive: 0001-9999). This change adds
validation logic to the ORC scanner that replaces out-of-range
timestamps with NULLs and adds a warning to the query (or fails
the query with the same message if abort_on_error is set).

The logic is very similar to the existing validation in
Parquet. Some differences:
- "time of day" is not checked separately as it doesn't make
  sense with ORC's encoding
- the warning contains only the column id, not the column name

Testing:
- added a simple EE test that scans an existing ORC file

Change-Id: I8ee2ba83a54f93d37e8832e064f2c8418b503490
Reviewed-on: http://gerrit.cloudera.org:8080/14832
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc
index 05e8ea2..69128b9 100644
--- a/be/src/exec/orc-column-readers.cc
+++ b/be/src/exec/orc-column-readers.cc
@@ -191,6 +191,12 @@
   auto slot = reinterpret_cast<TimestampValue*>(GetSlot(tuple));
   *slot = TimestampValue::FromUnixTimeNanos(secs, nanos,
       scanner_->state_->local_time_zone());
+  if (UNLIKELY(!slot->HasDate())) {
+    SetNullSlot(tuple);
+    TErrorCode::type errorCode = TErrorCode::ORC_TIMESTAMP_OUT_OF_RANGE;
+    ErrorMsg msg(errorCode, scanner_->filename(), orc_column_id_);
+    return scanner_->state_->LogOrReturnError(msg);
+  }
   return Status::OK();
 }
 
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 8f0cbaa..3da3061 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -438,6 +438,10 @@
 
   ("AVRO_INVALID_DATE", 144, "Avro file '$0' is corrupt: out of range date value $1 "
    "at offset $2. The valid date range is -719162..2932896 (0001-01-01..9999-12-31)."),
+
+  ("ORC_TIMESTAMP_OUT_OF_RANGE", 145,
+   "ORC file '$0' column '$1' contains an out of range timestamp. "
+   "The valid date range is 1400-01-01..9999-12-31."),
 )
 
 import sys
diff --git a/testdata/data/README b/testdata/data/README
index bbffeb4..8e7e7a3 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -450,4 +450,8 @@
 
 child_table:
 Created manually. Contains four columns. 'seq' column is the primary key of this table. ('id', 'year') form a foreign key referring to parent_table('id', 'year') and 'a' is a
-foreign key referring to parent_table_2's primary column 'a'.
\ No newline at end of file
+foreign key referring to parent_table_2's primary column 'a'.
+
+out_of_range_timestamp.orc:
+Created with Hive. ORC file with a single timestamp column 'ts'.
+Contains one row (1300-01-01 00:00:00), which is before Impala's minimum supported timestamp (1400-01-01).
diff --git a/testdata/data/out_of_range_timestamp.orc b/testdata/data/out_of_range_timestamp.orc
new file mode 100644
index 0000000..268b900
--- /dev/null
+++ b/testdata/data/out_of_range_timestamp.orc
Binary files differ
diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test
new file mode 100644
index 0000000..c39cd21
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-out-of-range-timestamp.test
@@ -0,0 +1,16 @@
+====
+---- QUERY
+SET abort_on_error=1;
+SELECT * FROM out_of_range_timestamp;
+---- CATCH
+ORC file '$NAMENODE/test-warehouse/$DATABASE.db/out_of_range_timestamp/out_of_range_timestamp.orc' column '1' contains an out of range timestamp. The valid date range is 1400-01-01..9999-12-31.
+====
+---- QUERY
+SET abort_on_error=0;
+SELECT * FROM out_of_range_timestamp;
+---- TYPES
+TIMESTAMP
+---- RESULTS
+NULL
+---- ERRORS
+ORC file '$NAMENODE/test-warehouse/$DATABASE.db/out_of_range_timestamp/out_of_range_timestamp.orc' column '1' contains an out of range timestamp. The valid date range is 1400-01-01..9999-12-31.
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index 5b17ecb..ea71911 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1323,6 +1323,17 @@
 
     self.run_test_case('DataErrorsTest/orc-type-checks', vector, unique_database)
 
+  def test_orc_timestamp_out_of_range(self, vector, unique_database):
+      """Test the validation of out-of-range timestamps."""
+      test_files = ["testdata/data/out_of_range_timestamp.orc"]
+      create_table_and_copy_files(self.client, "create table {db}.{tbl} "
+                                               "(ts timestamp) stored as orc",
+                                  unique_database, "out_of_range_timestamp", test_files)
+      new_vector = deepcopy(vector)
+      del new_vector.get_value('exec_option')['abort_on_error']
+      self.run_test_case('DataErrorsTest/orc-out-of-range-timestamp',
+                         new_vector, unique_database)
+
 class TestScannerReservation(ImpalaTestSuite):
   @classmethod
   def get_workload(self):