IMPALA-10012: Fix ascii codec decoding error for ds_hll_sketch() results
Although ds_hll_sketch() returns a string value, the data is not
ASCII-encoded text but a binary sketch. Because of this, when the
shell receives the data, it disconnects while trying to decode it.
The issue can be reproduced easily, for example by calling unhex()
with an input that yields non-ASCII bytes.
Example: SELECT unhex("aa");
This patch contains a solution in which any characters that cannot be
decoded as UTF-8 are replaced if we run into a UnicodeDecodeError
after fetching the data.
This solution works with the gen-py code autogenerated by Thrift 0.9.3
but still fails with Thrift 0.11.0.
For Thrift 0.11.0 the error is caught and an error message is printed
(this does not work with the beeswax protocol, because it raises a
different error (TypeError), which can occur for other reasons too).
Testing:
-manual testing with these protocols: 'hs2-http', 'hs2', 'beeswax'
Change-Id: I0c5f1290356e21aed8ca7f896f953541942aed05
Reviewed-on: http://gerrit.cloudera.org:8080/16418
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Reviewed-by: Gabor Kaszab <gaborkaszab@cloudera.com>
diff --git a/shell/impala_client.py b/shell/impala_client.py
index cb7cb39..19a570f 100755
--- a/shell/impala_client.py
+++ b/shell/impala_client.py
@@ -1095,7 +1095,15 @@
self.fetch_size))
if rpc_status != RpcStatus.OK:
raise RPCException()
- yield [row.split('\t') for row in result.data]
+
+ def split_row_and_decode_if_needed(row):
+ try:
+ return row.split('\t')
+ except UnicodeDecodeError:
+ return row.decode('utf-8', 'replace').split('\t')
+
+ yield [split_row_and_decode_if_needed(row) for row in result.data]
+
if not result.has_more:
return
diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index 8c18408..6cb9aa7 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -1221,6 +1221,15 @@
except RPCException as e:
# could not complete the rpc successfully
print(e, file=sys.stderr)
+ except UnicodeDecodeError as e:
+ # An error occurred, possibly during fetching.
+ # Depending on which protocol is in use, it can come from different places.
+ # It possibly occurs because we try to display binary data which contains
+ # undecodable elements.
+ if self.last_query_handle is not None:
+ self.imp_client.close_query(self.last_query_handle)
+ print('UnicodeDecodeError : %s \nPlease check for columns containing binary data '
+ 'to find the possible source of the error.' % (e,), file=sys.stderr)
except QueryStateException as e:
# an exception occurred while executing the query
if self.last_query_handle is not None:
@@ -1348,7 +1357,7 @@
if self.readline and self.readline.get_current_history_length() > 0:
for index in xrange(1, self.readline.get_current_history_length() + 1):
cmd = self.readline.get_history_item(index)
- print('[%d]: %s' % (index, cmd), file=sys.stderr)
+ print('[%d]: %s' % (index, cmd.decode('utf-8', 'replace')), file=sys.stderr)
else:
print(READLINE_UNAVAILABLE_ERROR, file=sys.stderr)
diff --git a/shell/shell_output.py b/shell/shell_output.py
index c6c2e99..31d91a0 100644
--- a/shell/shell_output.py
+++ b/shell/shell_output.py
@@ -35,11 +35,25 @@
def format(self, rows):
"""Returns string containing representation of the table data."""
+
+ def decode_if_needed(row):
+ # Check whether the values in the row are decodable. If they are, return the row
+ # as it is; if not, generate and return a new row instead, in which the
+ # undecodable parts are replaced.
+ try:
+ ''.join(str(row))
+ return row
+ except UnicodeDecodeError:
+ new_row = []
+ for entry in row:
+ new_row.append(entry.decode('UTF-8', 'replace'))
+ return new_row
+
# Clear rows that already exist in the table.
self.prettytable.clear_rows()
try:
for row in rows:
- self.prettytable.add_row(row)
+ self.prettytable.add_row(decode_if_needed(row))
return self.prettytable.get_string()
except Exception as e:
# beeswax returns each row as a tab separated string. If a string column
@@ -49,7 +63,8 @@
"embedded tabs. Reverting to tab delimited text output")
print(error_msg, file=sys.stderr)
print('{0}: {1}'.format(type(e), str(e)), file=sys.stderr)
- return '\n'.join(['\t'.join(row) for row in rows])
+
+ return '\n'.join(['\t'.join(decode_if_needed(row)) for row in rows])
class DelimitedOutputFormatter(object):
@@ -77,7 +92,7 @@
lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
for row in rows:
if sys.version_info.major == 2:
- row = [val.encode('utf-8') for val in row]
+ row = [val.encode('utf-8', 'replace') for val in row]
writer.writerow(row)
rows = temp_buffer.getvalue().rstrip()
temp_buffer.close()