IMPALA-10051: impala-shell exits with ValueError with WITH clauses

When a query contains WITH clause impala-shell tries to identify whether
it is a DML query or not, so that later it can provide appropriate
result messages. Earlier shlex was used to create tokens and assess the
query type based on that. However shlex can misinterpret some query
strings where whitespace charachters are mixed with quotes, because it
splits the string based on whitespace charachters. In some scenarios
'ValueError: No closing quotation' error can occur.

This change moves the tokenization from shlex to sqlparse.

Testing:
 - Added unit test to cover queries that contain mixed whitespaces
   and strings

Change-Id: I442d3bc65b90a55c73c847948d5179a8586d71ad
Reviewed-on: http://gerrit.cloudera.org:8080/16389
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
diff --git a/shell/impala_shell.py b/shell/impala_shell.py
index fbed3c6..6c5ba3b 100755
--- a/shell/impala_shell.py
+++ b/shell/impala_shell.py
@@ -1322,16 +1322,14 @@
   def do_with(self, args):
     """Executes a query with a WITH clause, fetching all rows"""
     query = self._build_query_string(self.last_leading_comment, self.orig_cmd, args)
-    # Use shlex to deal with escape quotes in string literals.
-    # Set posix=False to preserve the quotes.
-    tokens = shlex.split(strip_comments(query.lstrip()), posix=False)
+    # Parse the query with sqlparse to identify if it is a DML query or not.
+    # Because the WITH clause may precede DML or SELECT queries, checking the
+    # first string token is insufficient.
+    parsed = sqlparse.parse(query)[0]
+    query_type = sqlparse.sql.Statement(parsed.tokens).get_type()
     try:
-      # Because the WITH clause may precede DML or SELECT queries,
-      # just checking the first token is insufficient.
-      is_dml = False
-      if any(self.DML_REGEX.match(t) for t in tokens):
-        is_dml = True
-      return self._execute_stmt(query, is_dml=is_dml, print_web_link=True)
+      is_dml = self.DML_REGEX.match(query_type.lower())
+      return self._execute_stmt(query, bool(is_dml), print_web_link=True)
     except ValueError:
       return self._execute_stmt(query, print_web_link=True)
 
diff --git a/tests/shell/test_shell_interactive.py b/tests/shell/test_shell_interactive.py
index c6fe7e0..84cd42d 100755
--- a/tests/shell/test_shell_interactive.py
+++ b/tests/shell/test_shell_interactive.py
@@ -1058,6 +1058,18 @@
       result = p.get_result()
       assert "Fetched 0 row" in result.stderr
 
+  def test_quotes_in_with_clause(self, vector):
+    # IMPALA-10051: This test verifies that the fix prevents ValueErrors caused by
+    # shlex library when quotes and whitespace characters are mixed.
+    p = ImpalaShell(vector)
+    cmd = ("with foo as "
+           "(select *, regexp_replace(string_col,\"[a-zA-Z]\",\"+ \") "
+           "from functional.alltypestiny) "
+           "select * from foo limit 1")
+    p.send_cmd(cmd)
+    result = p.get_result()
+    assert "Fetched 1 row" in result.stderr
+
   def test_http_interactions(self, vector, http_503_server):
     """Test interactions with the http server when using hs2-http protocol.
     Check that the shell prints a good message when the server returns a 503 error."""