IMPALA-9277: Catch exception thrown from orc::ColumnSelector::updateSelectedByTypeId

orc::ColumnSelector::updateSelectedByTypeId can throw an exception on
malformed ORC files. The exception wasn't caught by Impala, so it
caused the process to terminate.

The fix is to catch the exception and return a parse error instead.

Testing:
* Added a corrupt ORC file and an e2e test.

Change-Id: I2f706bc832298cb5089e539b7a818cb86d02199f
Reviewed-on: http://gerrit.cloudera.org:8080/14994
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
diff --git a/be/src/exec/hdfs-orc-scanner.cc b/be/src/exec/hdfs-orc-scanner.cc
index 0c5739b..164feaa 100644
--- a/be/src/exec/hdfs-orc-scanner.cc
+++ b/be/src/exec/hdfs-orc-scanner.cc
@@ -194,12 +194,19 @@
   // ancestors and children will be selected too.
   // Here we haven't read stripe data yet so no orc::RowReaders are created. To get the
   // selected types we create a temp orc::RowReader (but won't read rows from it).
-  unique_ptr<orc::RowReader> tmp_row_reader =
-      reader_->createRowReader(row_reader_options_);
-  const orc::Type* root_type = &tmp_row_reader->getSelectedType();
-  DCHECK_EQ(root_type->getKind(), orc::TypeKind::STRUCT);
-  orc_root_reader_ = this->obj_pool_.Add(
-      new OrcStructReader(root_type, scan_node_->tuple_desc(), this));
+  try {
+    unique_ptr<orc::RowReader> tmp_row_reader =
+        reader_->createRowReader(row_reader_options_);
+    const orc::Type* root_type = &tmp_row_reader->getSelectedType();
+    DCHECK_EQ(root_type->getKind(), orc::TypeKind::STRUCT);
+    orc_root_reader_ = this->obj_pool_.Add(
+        new OrcStructReader(root_type, scan_node_->tuple_desc(), this));
+  } catch (std::exception& e) {
+    string msg = Substitute("Encountered parse error during schema selection in "
+        "ORC file $0: $1", filename(), e.what());
+    parse_status_ = Status(msg);
+    return parse_status_;
+  }
 
   // Set top-level template tuple.
   template_tuple_ = template_tuple_map_[scan_node_->tuple_desc()];
diff --git a/testdata/data/README b/testdata/data/README
index 8e7e7a3..a8f20c2 100644
--- a/testdata/data/README
+++ b/testdata/data/README
@@ -455,3 +455,6 @@
 out_of_range_timestamp.orc:
 Created with Hive. ORC file with a single timestamp column 'ts'.
 Contains one row (1300-01-01 00:00:00) which is outside Impala's valid time range.
+
+corrupt_schema.orc:
+ORC file from IMPALA-9277, generated by fuzz test. The file contains malformed metadata.
diff --git a/testdata/data/corrupt_schema.orc b/testdata/data/corrupt_schema.orc
new file mode 100644
index 0000000..86d2afe
--- /dev/null
+++ b/testdata/data/corrupt_schema.orc
Binary files differ
diff --git a/tests/query_test/test_scanners.py b/tests/query_test/test_scanners.py
index ea71911..1d26463 100644
--- a/tests/query_test/test_scanners.py
+++ b/tests/query_test/test_scanners.py
@@ -1334,6 +1334,16 @@
       self.run_test_case('DataErrorsTest/orc-out-of-range-timestamp',
                          new_vector, unique_database)
 
+  def test_invalid_schema(self, vector, unique_database):
+    """Test scanning of ORC file with malformed schema."""
+    test_files = ["testdata/data/corrupt_schema.orc"]
+    create_table_and_copy_files(self.client,
+        "CREATE TABLE {db}.{tbl} (id BIGINT) STORED AS ORC",
+        unique_database, "corrupt_schema", test_files)
+    err = self.execute_query_expect_failure(self.client,
+        "select count(*) from {0}.{1}".format(unique_database, "corrupt_schema"))
+    assert "Encountered parse error during schema selection" in str(err)
+
 class TestScannerReservation(ImpalaTestSuite):
   @classmethod
   def get_workload(self):