IMPALA-8801: Date type support for ORC scanner

Implements the read path for the DATE type in the ORC scanner. The
internal representation of a date is an int32 holding the number of
days since the Unix epoch, using the proleptic Gregorian calendar.
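
A minimal illustration of the mapping, based on the DateValue
days-since-epoch constructor and IsValid() check used by the new
OrcDateColumnReader (the concrete values are examples only):

  DateValue epoch(0);            // 1970-01-01 (the Unix epoch)
  DateValue d(18262);            // 18262 days later -> 2020-01-01
  DateValue invalid(100000000);  // far past 9999-12-31: !IsValid();
                                 // the scanner sets the slot to NULL
                                 // and logs or returns an error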

As with the Parquet implementation (IMPALA-7370), this representation
introduces an interoperability issue between Impala and older versions
of Hive (before 3.1). For more details, see the commit message of that
Parquet change.

Change-Id: I672a2cdd2452a46b676e0e36942fd310f55c4956
Reviewed-on: http://gerrit.cloudera.org:8080/14982
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
diff --git a/be/src/exec/hdfs-orc-scanner.h b/be/src/exec/hdfs-orc-scanner.h
index 78cd0a9..d26ca88 100644
--- a/be/src/exec/hdfs-orc-scanner.h
+++ b/be/src/exec/hdfs-orc-scanner.h
@@ -136,6 +136,7 @@
 
  private:
   friend class OrcColumnReader;
+  friend class OrcDateColumnReader;
   friend class OrcStringColumnReader;
   friend class OrcTimestampReader;
   friend class OrcComplexColumnReader;
diff --git a/be/src/exec/orc-column-readers.cc b/be/src/exec/orc-column-readers.cc
index 69128b9..9787d01 100644
--- a/be/src/exec/orc-column-readers.cc
+++ b/be/src/exec/orc-column-readers.cc
@@ -98,6 +98,9 @@
           }
         }
         break;
+      case TYPE_DATE:
+        reader = new OrcDateColumnReader(node, slot_desc, scanner);
+        break;
       default:
         DCHECK(false) << slot_desc->type().DebugString();
     } // end of switch
@@ -200,6 +203,22 @@
   return Status::OK();
 }
 
+Status OrcDateColumnReader::ReadValue(int row_idx, Tuple* tuple, MemPool* pool) {
+  if (IsNull(DCHECK_NOTNULL(batch_), row_idx)) {
+    SetNullSlot(tuple);
+    return Status::OK();
+  }
+  DateValue dv(batch_->data.data()[row_idx]);
+  if (UNLIKELY(!dv.IsValid())) {
+    SetNullSlot(tuple);
+    ErrorMsg msg(TErrorCode::ORC_DATE_OUT_OF_RANGE, scanner_->filename(), orc_column_id_);
+    return scanner_->state_->LogOrReturnError(msg);
+  }
+  DateValue* slot = reinterpret_cast<DateValue*>(GetSlot(tuple));
+  *slot = dv;
+  return Status::OK();
+}
+
 Status OrcDecimal16ColumnReader::ReadValue(int row_idx, Tuple* tuple, MemPool* pool) {
   if (IsNull(DCHECK_NOTNULL(batch_), row_idx)) {
     SetNullSlot(tuple);
diff --git a/be/src/exec/orc-column-readers.h b/be/src/exec/orc-column-readers.h
index 1bc589e..222b82d 100644
--- a/be/src/exec/orc-column-readers.h
+++ b/be/src/exec/orc-column-readers.h
@@ -54,7 +54,7 @@
   /// Create a column reader for the given 'slot_desc' based on the ORC 'node'. We say
   /// the 'slot_desc' and ORC 'node' match iff
   ///     scanner->col_id_path_map_[node->getColumnId()] == slot_desc->col_path
-  /// Caller should guaranteed that 'slot_desc' matches to ORC 'node' or one of its
+  /// Caller should guarantee that 'slot_desc' matches to ORC 'node' or one of its
   /// descendants. If 'node' is a primitive type, 'slot_desc' should match it since
   /// primitive types don't have descendants.
   /// If 'node' is in complex types (struct/array/map) and does not match 'slot_desc',
@@ -220,6 +220,22 @@
   orc::TimestampVectorBatch* batch_ = nullptr;
 };
 
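+/// Reader for DATE columns. ORC encodes a date as the number of days since the
+/// Unix epoch (1970-01-01) and delivers it in a LongVectorBatch. Values outside
+/// Impala's supported DATE range (0001-01-01..9999-12-31) are read as NULL and
+/// reported as ORC_DATE_OUT_OF_RANGE errors.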
+class OrcDateColumnReader : public OrcColumnReader {
+ public:
+  OrcDateColumnReader(const orc::Type* node, const SlotDescriptor* slot_desc,
+      HdfsOrcScanner* scanner)
+      : OrcColumnReader(node, slot_desc, scanner) { }
+
+  void UpdateInputBatch(orc::ColumnVectorBatch* orc_batch) override {
+    batch_ = static_cast<orc::LongVectorBatch*>(orc_batch);
+    DCHECK(batch_ == dynamic_cast<orc::LongVectorBatch*>(orc_batch));
+  }
+
+  Status ReadValue(int row_idx, Tuple* tuple, MemPool* pool) override WARN_UNUSED_RESULT;
+ private:
+  orc::LongVectorBatch* batch_ = nullptr;
+};
+
 template<typename DECIMAL_TYPE>
 class OrcDecimalColumnReader : public OrcColumnReader {
  public:
diff --git a/be/src/exec/orc-metadata-utils.cc b/be/src/exec/orc-metadata-utils.cc
index 27db46d..2add0b5 100644
--- a/be/src/exec/orc-metadata-utils.cc
+++ b/be/src/exec/orc-metadata-utils.cc
@@ -182,6 +182,9 @@
           "Column $0 in ORC file '$1' can't be truncated to table column $2",
           orc_type.toString(), filename_, type.DebugString()));
     }
+    case orc::TypeKind::DATE:
+      if (type.type == TYPE_DATE) return Status::OK();
+      break;
     default: break;
   }
   return Status(Substitute(
diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py
index 3da3061..43ccf21 100755
--- a/common/thrift/generate_error_codes.py
+++ b/common/thrift/generate_error_codes.py
@@ -442,6 +442,10 @@
   ("ORC_TIMESTAMP_OUT_OF_RANGE", 145,
    "ORC file '$0' column '$1' contains an out of range timestamp. "
    "The valid date range is 1400-01-01..9999-12-31."),
+
+  ("ORC_DATE_OUT_OF_RANGE", 146,
+   "ORC file '$0' column '$1' contains an out of range date. "
+   "The valid date range is 0001-01-01..9999-12-31."),
 )
 
 import sys
diff --git a/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java b/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
index 47d9773..5b083ec 100644
--- a/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
+++ b/fe/src/main/java/org/apache/impala/catalog/HdfsFileFormat.java
@@ -34,6 +34,7 @@
  * 4) whether scanning complex types from it is supported
  * 5) whether the file format can skip complex columns in scans and just materialize
  *    scalar typed columns
+ * 6) whether the file format supports the DATE type
  *
  * Important note: Always keep consistent with the classes used in Hive.
  * TODO: Kudu doesn't belong in this list. Either rename this enum or create a separate
@@ -66,7 +67,7 @@
   ORC("org.apache.hadoop.hive.ql.io.orc.OrcInputFormat",
       "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat",
       "org.apache.hadoop.hive.ql.io.orc.OrcSerde",
-      true, true, false),
+      true, true, true),
   KUDU("org.apache.hadoop.hive.kudu.KuduInputFormat",
        "org.apache.hadoop.hive.kudu.KuduOutputFormat",
        "org.apache.hadoop.hive.kudu.KuduSerDe",
diff --git a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
index b61830d..9dff663 100644
--- a/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
+++ b/fe/src/main/java/org/apache/impala/planner/SingleNodePlanner.java
@@ -1389,8 +1389,8 @@
 
   /**
    * Looks for a filesystem-based partition in 'partitions' with no DATE support and
-   * returns the first one it finds. Right now, scanning DATE values is only supported for
-   * TEXT, PARQUET and AVRO fileformats.
+   * returns the first one it finds. Right now, scanning DATE values is supported for
+   * TEXT, PARQUET, AVRO and ORC fileformats.
    *
    * Returns null otherwise.
    */
diff --git a/testdata/data/README b/testdata/data/README
index f46bc9f..9a7b52c 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -321,11 +321,25 @@
    9999-12-31
   10000-01-01 (invalid - date too large)
 
+out_of_range_date.orc:
+Created with a pre-3.1 Hive version (2.1.1) to contain an out-of-range date value.
+It takes advantage of the incompatibility between Hive and Impala: a date before
+1582-10-15 written by Hive (before 3.1) is read back incorrectly by Impala. The values
+I wrote with Hive:
+2019-10-04, 1582-10-15, 0001-01-01, 9999-12-31
+are interpreted by Impala as:
+2019-10-04, 1582-10-15, 0000-12-30 (invalid - date too small), 9999-12-31
+
 hive2_pre_gregorian.parquet:
 Small parquet table with one DATE column, created by Hive 2.1.1.
 Used to demonstrate parquet interoperability issues between Hive and Impala for dates
 before the introduction of Gregorian calendar in 1582-10-15.
 
+hive2_pre_gregorian.orc:
+Same as the above but in ORC format instead of Parquet.
+
 decimals_1_10.parquet:
 Contains two decimal columns, one with precision 1, the other with precision 10.
 I used Hive 2.1.1 with a modified version of Parquet-MR (6901a20) to create tiny,
diff --git a/testdata/data/hive2_pre_gregorian.orc b/testdata/data/hive2_pre_gregorian.orc
new file mode 100644
index 0000000..0bb963e
--- /dev/null
+++ b/testdata/data/hive2_pre_gregorian.orc
Binary files differ
diff --git a/testdata/data/out_of_range_date.orc b/testdata/data/out_of_range_date.orc
new file mode 100644
index 0000000..cc9d138
--- /dev/null
+++ b/testdata/data/out_of_range_date.orc
Binary files differ
diff --git a/testdata/datasets/functional/schema_constraints.csv b/testdata/datasets/functional/schema_constraints.csv
index ab778a3..b5599a1 100644
--- a/testdata/datasets/functional/schema_constraints.csv
+++ b/testdata/datasets/functional/schema_constraints.csv
@@ -230,9 +230,11 @@
 table_name:table_with_header_insert, constraint:restrict_to, table_format:parquet/none/none
 
 # IMPALA-7368/IMPALA-7370/IMPALA-8198 adds DATE support for text, hbase, parquet and avro.
+# IMPALA-8801 adds DATE support for ORC.
 # Other file-formats will be introduced later.
 table_name:date_tbl, constraint:restrict_to, table_format:parquet/none/none
 table_name:date_tbl, constraint:restrict_to, table_format:avro/snap/block
+table_name:date_tbl, constraint:restrict_to, table_format:orc/def/block
 table_name:date_tbl, constraint:restrict_to, table_format:hbase/none/none
 table_name:date_tbl, constraint:restrict_to, table_format:text/none/none
 table_name:date_tbl, constraint:restrict_to, table_format:text/lzo/block
diff --git a/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test
index 7e1a189..83a1540 100644
--- a/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test
+++ b/testdata/workloads/functional-query/queries/DataErrorsTest/orc-type-checks.test
@@ -55,6 +55,16 @@
 Type mismatch: table column BIGINT is map to column timestamp in ORC file
 ====
 ---- QUERY
+select c11 from illtypes_ts_to_date
+---- CATCH
+Type mismatch: table column DATE is map to column timestamp in ORC file
+====
+---- QUERY
+select c2 from illtypes_date_tbl
+---- CATCH
+Type mismatch: table column TIMESTAMP is map to column date in ORC file
+====
+---- QUERY
 select * from safetypes order by c1
 ---- TYPES
 bigint,boolean,smallint,int,bigint,bigint,double,double,char,string,timestamp,int,int
diff --git a/testdata/workloads/functional-query/queries/QueryTest/date-fileformat-support.test b/testdata/workloads/functional-query/queries/QueryTest/date-fileformat-support.test
index d4cd44c..11d3636 100644
--- a/testdata/workloads/functional-query/queries/QueryTest/date-fileformat-support.test
+++ b/testdata/workloads/functional-query/queries/QueryTest/date-fileformat-support.test
@@ -31,10 +31,15 @@
 DATE
 ====
 ---- QUERY
-# Querying orc table is not supported.
+# Querying the ORC table is supported.
 select * from $DATABASE.orc_date_tbl;
----- CATCH
-NotImplementedException: Scanning DATE values in table '$DATABASE.orc_date_tbl' is not supported for fileformat ORC
+---- TYPES
+DATE
+---- RESULTS
+1970-06-12
+2008-11-21
+9999-12-31
+NULL
 ====
 ---- QUERY
 # Inserting text partitions to $DATABASE.date_tbl is OK.
@@ -54,20 +59,12 @@
 date_part=1899-12-31/: 3
 ====
 ---- QUERY
-# Adding orc partition works even though Impala cannot scan/write DATE values in orc.
-# Querying all the partitions fails because of the one orc partition.
+# Adding an ORC partition works even though Impala cannot write the ORC format.
+# Querying all the partitions also works.
 alter table $DATABASE.date_tbl add partition (date_part='2099-12-31')
 location '$NAMENODE/test-warehouse/$DATABASE.db/orc_date_tbl';
 alter table $DATABASE.date_tbl partition (date_part='2099-12-31') set fileformat orc;
-select * from $DATABASE.date_tbl;
----- CATCH
-NotImplementedException: Scanning DATE values in table '$DATABASE.date_tbl' is not supported for fileformat ORC
-====
----- QUERY
-# Querying text, parquet and avro partitions is OK.
-select date_part, date_col
-from $DATABASE.date_tbl
-where date_part != '2099-12-31';
+select date_part, date_col from $DATABASE.date_tbl;
 ---- RESULTS
 0001-01-01,0001-01-01
 0001-01-01,0001-12-31
@@ -104,12 +101,10 @@
 1999-12-31,1970-01-02
 1999-12-31,1224-05-19
 1999-12-31,8543-11-21
+2099-12-31,1970-06-12
+2099-12-31,2008-11-21
+2099-12-31,9999-12-31
+2099-12-31,NULL
 ---- TYPES
 DATE,DATE
 ====
----- QUERY
-# Querying the orc partition separately fails.
-select date_part, date_col from $DATABASE.date_tbl where date_part='2099-12-31';
----- CATCH
-NotImplementedException: Scanning DATE values in table '$DATABASE.date_tbl' is not supported for fileformat ORC
-====
diff --git a/testdata/workloads/functional-query/queries/QueryTest/hive2-pre-gregorian-date-orc.test b/testdata/workloads/functional-query/queries/QueryTest/hive2-pre-gregorian-date-orc.test
new file mode 100644
index 0000000..38189a6
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/hive2-pre-gregorian-date-orc.test
@@ -0,0 +1,17 @@
+====
+---- QUERY
+# Query an ORC table created by Hive 2.1.1 containing the following dates:
+# 1582-10-04, 1582-10-05, 1582-10-06, 1582-10-15, 1582-10-16.
+# Impala will incorrectly read back the dates that precede the introduction of Gregorian
+# calendar (1582-10-15).
+SELECT * FROM $DATABASE.hive2_pre_gregorian_orc;
+---- TYPES
+DATE
+---- RESULTS
+1582-10-14
+1582-10-15
+1582-10-16
+1582-10-15
+1582-10-16
+====
+
diff --git a/testdata/workloads/functional-query/queries/QueryTest/out-of-range-date-orc.test b/testdata/workloads/functional-query/queries/QueryTest/out-of-range-date-orc.test
new file mode 100644
index 0000000..acaf28e
--- /dev/null
+++ b/testdata/workloads/functional-query/queries/QueryTest/out-of-range-date-orc.test
@@ -0,0 +1,20 @@
+====
+---- QUERY
+SELECT * FROM $DATABASE.out_of_range_date_orc;
+---- TYPES
+DATE
+---- RESULTS
+2019-10-04
+1582-10-15
+NULL
+9999-12-31
+---- ERRORS
+ORC file '$NAMENODE/test-warehouse/$DATABASE.db/out_of_range_date_orc/out_of_range_date.orc' column '1' contains an out of range date. The valid date range is 0001-01-01..9999-12-31.
+====
+---- QUERY
+set abort_on_error=1;
+SELECT * FROM $DATABASE.out_of_range_date_orc;
+---- CATCH
+ORC file '$NAMENODE/test-warehouse/$DATABASE.db/out_of_range_date_orc/out_of_range_date.orc' column '1' contains an out of range date. The valid date range is 0001-01-01..9999-12-31.
+====
+
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index b303a62..5f41f08 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -31,6 +31,7 @@
 from subprocess import check_call
 
 from testdata.common import widetable
+from tests.common.file_utils import create_table_and_copy_files
 from tests.common.impala_test_suite import ImpalaTestSuite, LOG
 from tests.common.skip import (
     SkipIf,
@@ -344,7 +345,7 @@
     self.run_test_case('QueryTest/out-of-range-timestamp-abort-on-error',
         vector, unique_database)
 
-  def test_date_out_of_range(self, vector, unique_database):
+  def test_date_out_of_range_parquet(self, vector, unique_database):
     """Test scanning parquet files with an out of range date."""
     create_table_from_parquet(self.client, unique_database, "out_of_range_date")
 
@@ -352,7 +353,7 @@
     del new_vector.get_value('exec_option')['abort_on_error']
     self.run_test_case('QueryTest/out-of-range-date', new_vector, unique_database)
 
-  def test_pre_gregorian_date(self, vector, unique_database):
+  def test_pre_gregorian_date_parquet(self, vector, unique_database):
     """Test date interoperability issues between Impala and Hive 2.1.1 when scanning
        a parquet table that contains dates that precede the introduction of Gregorian
        calendar in 1582-10-15.
@@ -1297,20 +1298,31 @@
     assert total == num_scanners_with_no_reads
 
   def test_type_conversions(self, vector, unique_database):
-    # Create an "illtypes" table whose columns can't match the underlining ORC file's.
+    # Create "illtypes" tables whose columns can't match the underlying ORC file's.
     # Create an "safetypes" table likes above but ORC columns can still fit into it.
-    # Reuse the data files of functional_orc_def.alltypestiny
+    # Reuse the data files of alltypestiny and date_tbl in functional_orc_def.
     tbl_loc = get_fs_path("/test-warehouse/alltypestiny_orc_def")
     self.client.execute("""create external table %s.illtypes (c1 boolean, c2 float,
         c3 boolean, c4 tinyint, c5 smallint, c6 int, c7 boolean, c8 string, c9 int,
         c10 float, c11 bigint) partitioned by (year int, month int) stored as ORC
         location '%s';""" % (unique_database, tbl_loc))
+    self.client.execute("""create external table %s.illtypes_ts_to_date (c1 boolean,
+        c2 float, c3 boolean, c4 tinyint, c5 smallint, c6 int, c7 boolean, c8 string,
+        c9 int, c10 float, c11 date) partitioned by (year int, month int) stored as ORC
+        location '%s';""" % (unique_database, tbl_loc))
     self.client.execute("""create external table %s.safetypes (c1 bigint, c2 boolean,
         c3 smallint, c4 int, c5 bigint, c6 bigint, c7 double, c8 double, c9 char(3),
         c10 varchar(3), c11 timestamp) partitioned by (year int, month int) stored as ORC
         location '%s';""" % (unique_database, tbl_loc))
+    self.client.execute("""create external table %s.illtypes_date_tbl (c1 boolean,
+        c2 timestamp) partitioned by (date_part date) stored as ORC location '%s';"""
+        % (unique_database, "/test-warehouse/date_tbl_orc_def"))
     self.client.execute("alter table %s.illtypes recover partitions" % unique_database)
+    self.client.execute("alter table %s.illtypes_ts_to_date recover partitions"
+        % unique_database)
     self.client.execute("alter table %s.safetypes recover partitions" % unique_database)
+    self.client.execute("alter table %s.illtypes_date_tbl recover partitions"
+        % unique_database)
 
     # Create a decimal table whose precisions don't match the underlining orc files.
     # Reuse the data files of functional_orc_def.decimal_tbl.
@@ -1319,7 +1331,8 @@
         d2 decimal(8,0), d3 decimal(19,10), d4 decimal(20,20), d5 decimal(2,0))
         partitioned by (d6 decimal(9,0)) stored as orc location '%s'"""
         % (unique_database, decimal_loc))
-    self.client.execute("alter table %s.mismatch_decimals recover partitions" % unique_database)
+    self.client.execute("alter table %s.mismatch_decimals recover partitions"
+        % unique_database)
 
     self.run_test_case('DataErrorsTest/orc-type-checks', vector, unique_database)
 
@@ -1355,6 +1368,32 @@
         "root type is boolean (should be struct)")
 
 
+  def test_date_out_of_range_orc(self, vector, unique_database):
+    """Test scanning orc files with an out of range date."""
+    orc_tbl_name = "out_of_range_date_orc"
+    create_sql = "create table %s.%s (d date) stored as orc" % (unique_database,
+        orc_tbl_name)
+    create_table_and_copy_files(self.client, create_sql, unique_database, orc_tbl_name,
+        ["/testdata/data/out_of_range_date.orc"])
+
+    new_vector = deepcopy(vector)
+    del new_vector.get_value('exec_option')['abort_on_error']
+    self.run_test_case('QueryTest/out-of-range-date-orc', new_vector, unique_database)
+
+  def test_pre_gregorian_date_orc(self, vector, unique_database):
+    """Test date interoperability issues between Impala and Hive 2.1.1 when scanning
+       an orc table that contains dates that precede the introduction of Gregorian
+       calendar in 1582-10-15.
+    """
+    orc_tbl_name = "hive2_pre_gregorian_orc"
+    create_sql = "create table %s.%s (d date) stored as orc" % (unique_database,
+        orc_tbl_name)
+    create_table_and_copy_files(self.client, create_sql, unique_database, orc_tbl_name,
+        ["/testdata/data/hive2_pre_gregorian.orc"])
+
+    self.run_test_case('QueryTest/hive2-pre-gregorian-date-orc', vector, unique_database)
+
+
 class TestScannerReservation(ImpalaTestSuite):
   @classmethod
   def get_workload(self):