IMPALA-10012: ds_hll_sketch() results ascii codec decoding error fix

Although ds_hll_sketch() returns a string value, the data is not
ASCII-encoded text but a binary sketch. Because of this, when the
shell receives the data it disconnects while trying to decode it.

The issue can be reproduced easily, for example by calling unhex()
with an input that produces non-ASCII bytes.
Example: SELECT unhex("aa");

This patch contains a solution where we replace any characters that
cannot be decoded as UTF-8 if we run into a UnicodeDecodeError after
fetching the data.

This solution works with the gen-py code autogenerated by Thrift 0.9.3
but still fails with Thrift 0.11.0.

For Thrift 0.11.0 the error is caught and an error message is printed.
This does not work with the beeswax protocol, because it raises a
different error (TypeError), which can occur for other reasons too.

Testing:
- Manual testing with these protocols: 'hs2-http', 'hs2', 'beeswax'

Change-Id: I0c5f1290356e21aed8ca7f896f953541942aed05
Reviewed-on: http://gerrit.cloudera.org:8080/16418
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Reviewed-by: Gabor Kaszab <gaborkaszab@cloudera.com>
diff --git a/shell/impala_client.py b/shell/impala_client.py
index cb7cb39..19a570f 100755
--- a/shell/impala_client.py
+++ b/shell/impala_client.py
@@ -1095,7 +1095,15 @@
                                         self.fetch_size))
       if rpc_status != RpcStatus.OK:
         raise RPCException()
-      yield [row.split('\t') for row in result.data]
+
+      def split_row_and_decode_if_needed(row):
+        try:
+          return row.split('\t')
+        except UnicodeDecodeError:
+          return row.decode('utf-8', 'replace').split('\t')
+
+      yield [split_row_and_decode_if_needed(row) for row in result.data]
+
       if not result.has_more:
         return
 
diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index 8c18408..6cb9aa7 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -1221,6 +1221,15 @@
     except RPCException as e:
       # could not complete the rpc successfully
       print(e, file=sys.stderr)
+    except UnicodeDecodeError as e:
+      # An error occurred, possibly during fetching.
+      # Depending on which protocol is in use, it can come from different places.
+      # It possibly occurs because we try to display binary data which contains
+      # undecodable elements.
+      if self.last_query_handle is not None:
+        self.imp_client.close_query(self.last_query_handle)
+      print('UnicodeDecodeError : %s \nPlease check for columns containing binary data '
+          'to find the possible source of the error.' % (e,), file=sys.stderr)
     except QueryStateException as e:
       # an exception occurred while executing the query
       if self.last_query_handle is not None:
@@ -1348,7 +1357,7 @@
     if self.readline and self.readline.get_current_history_length() > 0:
       for index in xrange(1, self.readline.get_current_history_length() + 1):
         cmd = self.readline.get_history_item(index)
-        print('[%d]: %s' % (index, cmd), file=sys.stderr)
+        print('[%d]: %s' % (index, cmd.decode('utf-8', 'replace')), file=sys.stderr)
     else:
       print(READLINE_UNAVAILABLE_ERROR, file=sys.stderr)
 
diff --git a/shell/shell_output.py b/shell/shell_output.py
index c6c2e99..31d91a0 100644
--- a/shell/shell_output.py
+++ b/shell/shell_output.py
@@ -35,11 +35,25 @@
 
   def format(self, rows):
     """Returns string containing representation of the table data."""
+
+    def decode_if_needed(row):
+      # Check whether the values in the row are decodable. If they are, return the row
+      # as it is; if not, build and return a new row in which the undecodable parts
+      # are replaced.
+      try:
+        ''.join(str(row))
+        return row
+      except UnicodeDecodeError:
+        new_row = []
+        for entry in row:
+          new_row.append(entry.decode('UTF-8', 'replace'))
+        return new_row
+
     # Clear rows that already exist in the table.
     self.prettytable.clear_rows()
     try:
       for row in rows:
-        self.prettytable.add_row(row)
+        self.prettytable.add_row(decode_if_needed(row))
       return self.prettytable.get_string()
     except Exception as e:
       # beeswax returns each row as a tab separated string. If a string column
@@ -49,7 +63,8 @@
                    "embedded tabs. Reverting to tab delimited text output")
       print(error_msg, file=sys.stderr)
       print('{0}: {1}'.format(type(e), str(e)), file=sys.stderr)
-      return '\n'.join(['\t'.join(row) for row in rows])
+
+      return '\n'.join(['\t'.join(decode_if_needed(row)) for row in rows])
 
 
 class DelimitedOutputFormatter(object):
@@ -77,7 +92,7 @@
                         lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
     for row in rows:
       if sys.version_info.major == 2:
-        row = [val.encode('utf-8') for val in row]
+        row = [val.encode('utf-8', 'replace') for val in row]
       writer.writerow(row)
     rows = temp_buffer.getvalue().rstrip()
     temp_buffer.close()