IMPALA-10012: ds_hll_sketch() results ascii codec decoding error fix Although ds_hll_sketch() generates a string value as output, the data is not ASCII-encoded text but a bit sketch; because of this, when the shell gets this data it disconnects while trying to decode it. The issue can be reproduced with a simple method such as calling unhex with an input that produces undecodable bytes. Example: SELECT unhex("aa"); This patch contains a solution where we replace any characters that cannot be decoded as UTF-8 if we run into a UnicodeDecodeError after fetching the data. This solution works with the Thrift 0.9.3 autogenerated gen-py but still fails with Thrift 0.11.0. For Thrift 0.11.0 the error is caught and an error message is printed (this does not work with the beeswax protocol, because it raises a different error (TypeError), which can also occur for other reasons). Testing: -manual testing with these protocols: 'hs2-http', 'hs2', 'beeswax' Change-Id: I0c5f1290356e21aed8ca7f896f953541942aed05 Reviewed-on: http://gerrit.cloudera.org:8080/16418 Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Reviewed-by: Gabor Kaszab <gaborkaszab@cloudera.com>

commit: fe6e6257475b521b3ceeae4f6a898aae6ce3eb94 [log] [tgz]
author: Adam Tamas <tadam@cloudera.com> Thu Jul 30 10:10:25 2020 +0200
committer: Gabor Kaszab <gaborkaszab@cloudera.com> Sat Sep 05 09:42:46 2020 +0000
tree: 1f562b0c513d5dc67c92e5dd9b7d4034c476d81b
parent: b7965d8240f3a5cd6972351b7fe1116e4a0835c1 [diff]
diff --git a/shell/impala_client.py b/shell/impala_client.py
index cb7cb39..19a570f 100755
--- a/shell/impala_client.py
+++ b/shell/impala_client.py

@@ -1095,7 +1095,15 @@
                                         self.fetch_size))
       if rpc_status != RpcStatus.OK:
         raise RPCException()
-      yield [row.split('\t') for row in result.data]
+
+      def split_row_and_decode_if_needed(row):
+        try:
+          return row.split('\t')
+        except UnicodeDecodeError:
+          return row.decode('utf-8', 'replace').split('\t')
+
+      yield [split_row_and_decode_if_needed(row) for row in result.data]
+
       if not result.has_more:
         return
 

diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index 8c18408..6cb9aa7 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py

@@ -1221,6 +1221,15 @@
     except RPCException as e:
       # could not complete the rpc successfully
       print(e, file=sys.stderr)
+    except UnicodeDecodeError as e:
+      # An error occoured possibly during the fetching.
+      # Depending of which protocol is at use it can come from different places.
+      # Possibly occours because we try to display binary data which contains
+      # undecodable elements.
+      if self.last_query_handle is not None:
+        self.imp_client.close_query(self.last_query_handle)
+      print('UnicodeDecodeError : %s \nPlease check for columns containing binary data '
+          'to find the possible source of the error.' % (e,), file=sys.stderr)
     except QueryStateException as e:
       # an exception occurred while executing the query
       if self.last_query_handle is not None:
@@ -1348,7 +1357,7 @@
     if self.readline and self.readline.get_current_history_length() > 0:
       for index in xrange(1, self.readline.get_current_history_length() + 1):
         cmd = self.readline.get_history_item(index)
-        print('[%d]: %s' % (index, cmd), file=sys.stderr)
+        print('[%d]: %s' % (index, cmd.decode('utf-8', 'replace')), file=sys.stderr)
     else:
       print(READLINE_UNAVAILABLE_ERROR, file=sys.stderr)
 

diff --git a/shell/shell_output.py b/shell/shell_output.py
index c6c2e99..31d91a0 100644
--- a/shell/shell_output.py
+++ b/shell/shell_output.py

@@ -35,11 +35,25 @@
 
   def format(self, rows):
     """Returns string containing representation of the table data."""
+
+    def decode_if_needed(row):
+      # Checking if the values in row is decodable. If it is we just give back the row
+      # as it is, if not we generate a new row and give that back instead where the
+      # undecodable parts are swapped out.
+      try:
+        ''.join(str(row))
+        return row
+      except UnicodeDecodeError:
+        new_row = []
+        for entry in row:
+          new_row.append(entry.decode('UTF-8', 'replace'))
+        return new_row
+
     # Clear rows that already exist in the table.
     self.prettytable.clear_rows()
     try:
       for row in rows:
-        self.prettytable.add_row(row)
+        self.prettytable.add_row(decode_if_needed(row))
       return self.prettytable.get_string()
     except Exception as e:
       # beeswax returns each row as a tab separated string. If a string column
@@ -49,7 +63,8 @@
                    "embedded tabs. Reverting to tab delimited text output")
       print(error_msg, file=sys.stderr)
       print('{0}: {1}'.format(type(e), str(e)), file=sys.stderr)
-      return '\n'.join(['\t'.join(row) for row in rows])
+
+      return '\n'.join(['\t'.join(decode_if_needed(row)) for row in rows])
 
 
 class DelimitedOutputFormatter(object):
@@ -77,7 +92,7 @@
                         lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
     for row in rows:
       if sys.version_info.major == 2:
-        row = [val.encode('utf-8') for val in row]
+        row = [val.encode('utf-8', 'replace') for val in row]
       writer.writerow(row)
     rows = temp_buffer.getvalue().rstrip()
     temp_buffer.close()
commit	fe6e6257475b521b3ceeae4f6a898aae6ce3eb94	[log] [tgz]
author	Adam Tamas <tadam@cloudera.com>	Thu Jul 30 10:10:25 2020 +0200
committer	Gabor Kaszab <gaborkaszab@cloudera.com>	Sat Sep 05 09:42:46 2020 +0000
tree	1f562b0c513d5dc67c92e5dd9b7d4034c476d81b
parent	b7965d8240f3a5cd6972351b7fe1116e4a0835c1 [diff]