IMPALA-9277: Catch exception thrown from orc::ColumnSelector::updateSelectedByTypeId orc::ColumnSelector::updateSelectedByTypeId can throw an exception on malformed ORC files. The exception was not caught by Impala, so it caused program termination. The fix is to simply catch the exception and return a parse error instead. Testing: * added corrupt ORC file and e2e test Change-Id: I2f706bc832298cb5089e539b7a818cb86d02199f Reviewed-on: http://gerrit.cloudera.org:8080/14994 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>

commit: 8f448cfc6bb3c101ab4737eecd1e84a858744793 [log] [tgz]
author: Zoltan Borok-Nagy <boroknagyz@cloudera.com> Thu Jan 09 18:16:57 2020 +0100
committer: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Fri Jan 10 17:35:31 2020 +0000
tree: 65045d0d6efd22518e6177f5917479479875eaed
parent: 641e2abf2985972f96a1f27b94758c7bf26e64d5 [diff]
diff --git a/be/src/exec/hdfs-orc-scanner.cc b/be/src/exec/hdfs-orc-scanner.cc
index 0c5739b..164feaa 100644
--- a/be/src/exec/hdfs-orc-scanner.cc
+++ b/be/src/exec/hdfs-orc-scanner.cc

@@ -194,12 +194,19 @@
   // ancestors and children will be selected too.
   // Here we haven't read stripe data yet so no orc::RowReaders are created. To get the
   // selected types we create a temp orc::RowReader (but won't read rows from it).
-  unique_ptr<orc::RowReader> tmp_row_reader =
-      reader_->createRowReader(row_reader_options_);
-  const orc::Type* root_type = &tmp_row_reader->getSelectedType();
-  DCHECK_EQ(root_type->getKind(), orc::TypeKind::STRUCT);
-  orc_root_reader_ = this->obj_pool_.Add(
-      new OrcStructReader(root_type, scan_node_->tuple_desc(), this));
+  try {
+    unique_ptr<orc::RowReader> tmp_row_reader =
+        reader_->createRowReader(row_reader_options_);
+    const orc::Type* root_type = &tmp_row_reader->getSelectedType();
+    DCHECK_EQ(root_type->getKind(), orc::TypeKind::STRUCT);
+    orc_root_reader_ = this->obj_pool_.Add(
+        new OrcStructReader(root_type, scan_node_->tuple_desc(), this));
+  } catch (std::exception& e) {
+    string msg = Substitute("Encountered parse error during schema selection in "
+        "ORC file $0: $1", filename(), e.what());
+    parse_status_ = Status(msg);
+    return parse_status_;
+  }
 
   // Set top-level template tuple.
   template_tuple_ = template_tuple_map_[scan_node_->tuple_desc()];

diff --git a/testdata/data/README b/testdata/data/README
index 8e7e7a3..a8f20c2 100644
--- a/testdata/data/README
+++ b/testdata/data/README

@@ -455,3 +455,6 @@
 out_of_range_timestamp.orc:
 Created with Hive. ORC file with a single timestamp column 'ts'.
 Contains one row (1300-01-01 00:00:00) which is outside Impala's valid time range.
+
+corrupt_schema.orc:
+ORC file from IMPALA-9277, generated by fuzz test. The file contains malformed metadata.

diff --git a/testdata/data/corrupt_schema.orc b/testdata/data/corrupt_schema.orc
new file mode 100644
index 0000000..86d2afe
--- /dev/null
+++ b/testdata/data/corrupt_schema.orc
Binary files differ

diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index ea71911..1d26463 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py

@@ -1334,6 +1334,16 @@
       self.run_test_case('DataErrorsTest/orc-out-of-range-timestamp',
                          new_vector, unique_database)
 
+  def test_invalid_schema(self, vector, unique_database):
+    """Test scanning of ORC file with malformed schema."""
+    test_files = ["testdata/data/corrupt_schema.orc"]
+    create_table_and_copy_files(self.client,
+        "CREATE TABLE {db}.{tbl} (id BIGINT) STORED AS ORC",
+        unique_database, "corrupt_schema", test_files)
+    err = self.execute_query_expect_failure(self.client,
+        "select count(*) from {0}.{1}".format(unique_database, "corrupt_schema"))
+    assert "Encountered parse error during schema selection" in str(err)
+
 class TestScannerReservation(ImpalaTestSuite):
   @classmethod
   def get_workload(self):
commit	8f448cfc6bb3c101ab4737eecd1e84a858744793	[log] [tgz]
author	Zoltan Borok-Nagy <boroknagyz@cloudera.com>	Thu Jan 09 18:16:57 2020 +0100
committer	Impala Public Jenkins <impala-public-jenkins@cloudera.com>	Fri Jan 10 17:35:31 2020 +0000
tree	65045d0d6efd22518e6177f5917479479875eaed
parent	641e2abf2985972f96a1f27b94758c7bf26e64d5 [diff]