IMPALA-9131: Use single quotes around FORMAT clause in CAST

When running a CAST(..FORMAT..) query then the header of the output
shows the value of the FORMAT clause surrounded by double quotes.
However, the SQL way is to use single quotes for strings so this
patch changes the printout from using double quotes to use single
quotes instead.
Additionally, this fixes a bug where it wasn't possible to have a
single quote separator in a format string that was itself surrounded
by single quotes. As a fix the single quote separator can be escaped
by a backslash in this case.
Another additional fix to make the FX modifier case-insensitive.

Change-Id: I3310abfa6f3ccbbe4c437846c6dd05791153e6f7
Reviewed-on: http://gerrit.cloudera.org:8080/14665
Reviewed-by: Attila Jeges <attilaj@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
diff --git a/be/src/runtime/datetime-iso-sql-format-parser.cc b/be/src/runtime/datetime-iso-sql-format-parser.cc
index 5a39573..976f39a 100644
--- a/be/src/runtime/datetime-iso-sql-format-parser.cc
+++ b/be/src/runtime/datetime-iso-sql-format-parser.cc
@@ -216,7 +216,7 @@
   const char* format_end = tok->val + tok->len;
   // Take care of the double escaped quotes.
   if (tok->is_double_escaped && format_end - *format >= 4 &&
-      (strncmp(*format, "\\\\\\\"", 4) == 0 || strncmp(*format, "\\\\\\'", 4) == 0)) {
+      (strncmp(*format, "\\\\\\\"", 4) == 0)) {
     *format += 3;
     return **format;
   }
@@ -237,10 +237,11 @@
   DCHECK(end_pos != nullptr);
   DCHECK(current_tok_idx != nullptr && *current_tok_idx < dt_ctx.toks.size());
   DCHECK(dt_ctx.toks[*current_tok_idx].type == SEPARATOR);
-  if (!IsoSqlFormatTokenizer::IsSeparator(**current_pos)) return false;
+  if (!IsoSqlFormatTokenizer::IsSeparator(current_pos, end_pos, false)) return false;
   // Advance to the end of the separator sequence.
   ++(*current_pos);
-  while (*current_pos < end_pos && IsoSqlFormatTokenizer::IsSeparator(**current_pos)) {
+  while (*current_pos < end_pos &&
+      IsoSqlFormatTokenizer::IsSeparator(current_pos, end_pos, false)) {
     ++(*current_pos);
   }
   // Advance to the end of the separator sequence in the expected tokens list.
@@ -297,7 +298,7 @@
 
   const char* end_pos = start_of_token;
   while (end_pos < start_of_token + max_tok_len &&
-      !IsoSqlFormatTokenizer::IsSeparator(*end_pos)) {
+      !IsoSqlFormatTokenizer::IsSeparator(&end_pos, input_str + input_len, false)) {
     ++end_pos;
   }
   if (end_pos == input_str) return nullptr;
diff --git a/be/src/runtime/datetime-iso-sql-format-tokenizer.cc b/be/src/runtime/datetime-iso-sql-format-tokenizer.cc
index 1f477b0..fa584e2 100644
--- a/be/src/runtime/datetime-iso-sql-format-tokenizer.cc
+++ b/be/src/runtime/datetime-iso-sql-format-tokenizer.cc
@@ -98,7 +98,7 @@
   DCHECK(current_pos != nullptr && *current_pos != nullptr);
   const char* str_begin = dt_ctx_->fmt;
   const char* str_end = dt_ctx_->fmt + dt_ctx_->fmt_len;
-  while (*current_pos < str_end && IsSeparator(**current_pos)) {
+  while (*current_pos < str_end && IsSeparator(current_pos, str_end)) {
     dt_ctx_->toks.push_back(DateTimeFormatToken(SEPARATOR, *current_pos - str_begin, 1,
         *current_pos));
     ++(*current_pos);
@@ -227,15 +227,27 @@
   return SUCCESS;
 }
 
-bool IsoSqlFormatTokenizer::IsSeparator(char c) {
-  return c == '-' || c == ':' || c== ' ' || c == '.' || c == '/' || c== ',' ||
-      c == '\'' || c == ';' ;
+bool IsoSqlFormatTokenizer::IsSeparator(const char** current_pos, const char* str_end,
+    bool read_escaped_single_quotes) {
+  DCHECK(str_end != nullptr);
+  DCHECK(current_pos != nullptr && *current_pos != nullptr && *current_pos < str_end);
+  char c = **current_pos;
+  if (c == '-' || c == ':' || c == ' ' || c == '.' || c == '/' || c == ',' ||
+      c == '\'' || c == ';') {
+    return true;
+  }
+  if (read_escaped_single_quotes && str_end - *current_pos > 1 &&
+      strncmp(*current_pos, "\\'", 2) == 0) {
+    ++(*current_pos);
+    return true;
+  }
+  return false;
 }
 
 void IsoSqlFormatTokenizer::ProcessFXModifier(const char** current_pos) {
   DCHECK(current_pos != nullptr && *current_pos != nullptr);
   DCHECK(*current_pos == dt_ctx_->fmt);
-  if (strncmp(*current_pos, "FX", 2) == 0) {
+  if (strncasecmp(*current_pos, "FX", 2) == 0) {
     dt_ctx_->fx_modifier = true;
     *current_pos += 2;
   }
diff --git a/be/src/runtime/datetime-iso-sql-format-tokenizer.h b/be/src/runtime/datetime-iso-sql-format-tokenizer.h
index 2c3b838..9d973aa 100644
--- a/be/src/runtime/datetime-iso-sql-format-tokenizer.h
+++ b/be/src/runtime/datetime-iso-sql-format-tokenizer.h
@@ -50,8 +50,12 @@
   /// 'dt_ctx_' with metadata about the format string such as tokens found in the string.
   FormatTokenizationResult Tokenize();
 
-  /// Returns true if 'c' is a valid separator
-  static bool IsSeparator(char c);
+  /// Returns true if '*current_pos' points to a valid separator. If
+  /// 'read_escaped_single_quotes' is true then escaped single quotes are also taken as
+  /// valid separators. In this case '*current_pos' is advanced from the escaping
+  /// backslash to the single quote.
+  static bool IsSeparator(const char** current_pos, const char* str_end,
+      bool read_escaped_single_quotes = true);
 private:
   /// Stores metadata about a specific token type.
   struct TokenItem {
diff --git a/fe/src/main/java/org/apache/impala/analysis/CastExpr.java b/fe/src/main/java/org/apache/impala/analysis/CastExpr.java
index 8b40451..67fedec 100644
--- a/fe/src/main/java/org/apache/impala/analysis/CastExpr.java
+++ b/fe/src/main/java/org/apache/impala/analysis/CastExpr.java
@@ -194,6 +194,28 @@
     }
   }
 
+  public String getCastFormatWithEscapedSingleQuotes() {
+    Preconditions.checkNotNull(castFormat_);
+    Preconditions.checkState(!castFormat_.isEmpty());
+    StringBuilder result = new StringBuilder();
+    for(int i = 0; i < castFormat_.length(); ++i) {
+      char currentChar = castFormat_.charAt(i);
+      if (currentChar == '\'') {
+        // Count the preceeding backslashes
+        int backslashCount = 0;
+        int j = i - 1;
+        while (j >= 0 && castFormat_.charAt(j) == '\\') {
+          ++backslashCount;
+          --j;
+        }
+        // If the single quote is not escaped then adds an extra backslash to escape it.
+        if (backslashCount % 2 == 0) result.append('\\');
+      }
+      result.append(currentChar);
+    }
+    return result.toString();
+  }
+
   @Override
   public String toSqlImpl(ToSqlOptions options) {
     if (isImplicit_) {
@@ -206,7 +228,7 @@
     }
     String formatClause = "";
     if (castFormat_ != null && !castFormat_.isEmpty()) {
-      formatClause = " FORMAT \"" + castFormat_ + "\"";
+      formatClause = " FORMAT '" + getCastFormatWithEscapedSingleQuotes() + "'";
     }
     return "CAST(" + getChild(0).toSql(options) + " AS " + targetTypeDef_.toString()
         + formatClause + ")";
diff --git a/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java b/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java
index 17d7f22..5ce1ba2 100644
--- a/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java
+++ b/fe/src/test/java/org/apache/impala/analysis/AnalyzeExprsTest.java
@@ -3290,13 +3290,38 @@
 
   @Test
   public void TestToStringOnCastFormatClause() throws AnalysisException {
-    String cast_str = "CAST('05-01-2017' AS TIMESTAMP FORMAT \"MM-dd-yyyy\")";
+    String cast_str = "CAST('05-01-2017' AS TIMESTAMP FORMAT 'MM-dd-yyyy')";
     SelectStmt select = (SelectStmt) AnalyzesOk("select " + cast_str);
     Assert.assertEquals(cast_str, select.getResultExprs().get(0).toSqlImpl());
 
-    cast_str = "CAST('05-01-2017' AS DATE FORMAT \"MM-dd-yyyy\")";
+    cast_str = "CAST('05-01-2017' AS DATE FORMAT 'MM-dd-yyyy')";
     select = (SelectStmt) AnalyzesOk("select " + cast_str);
     Assert.assertEquals(cast_str, select.getResultExprs().get(0).toSqlImpl());
+
+    // Check that escaped single quotes in the format remain escaped in the printout.
+    cast_str = "CAST('2019-01-01te\\\'xt' AS DATE FORMAT " +
+        "'YYYY\\\'MM\\\'DD\"te\\\'xt\"')";
+    select = (SelectStmt) AnalyzesOk("select " + cast_str);
+    Assert.assertEquals(cast_str, select.getResultExprs().get(0).toSqlImpl());
+
+    // Check that non-escaped single quotes in the format are escaped in the printout.
+    // Also check that the surrounding double quotes around the format string are
+    // replaced with single quotes in the output.
+    cast_str = "select CAST(\"2019-01-02te'xt\" AS DATE FORMAT " +
+        "\"YYYY'MM'DD\\\"te'xt\\\"\")";
+    select = (SelectStmt) AnalyzesOk(cast_str);
+    String expected_str = "CAST('2019-01-02te\\\'xt' AS DATE FORMAT " +
+        "'YYYY\\\'MM\\\'DD\\\"te\\\'xt\\\"')";
+    Assert.assertEquals(expected_str, select.getResultExprs().get(0).toSqlImpl());
+
+    // Check that a single quote in a free text token is escaped in the output even if it
+    // is after an escaped backslash.
+    cast_str = "select CAST(\"2019-01-02te\\'xt\" AS DATE FORMAT " +
+        "\"YYYY-MM-DD\\\"te\\\\'xt\\\"\")";
+    select = (SelectStmt) AnalyzesOk(cast_str);
+    expected_str = "CAST('2019-01-02te\\\'xt' AS DATE FORMAT " +
+        "'YYYY-MM-DD\\\"te\\\\\\'xt\\\"')";
+    Assert.assertEquals(expected_str, select.getResultExprs().get(0).toSqlImpl());
   }
 
 }
diff --git a/tests/query_test/test_cast_with_format.py b/tests/query_test/test_cast_with_format.py
index 5770e0f..88ab090 100644
--- a/tests/query_test/test_cast_with_format.py
+++ b/tests/query_test/test_cast_with_format.py
@@ -74,6 +74,38 @@
         "timestamp FORMAT '-YYYY--MM---DD---')")
     assert result.data == ["2017-05-01 00:00:00"]
 
+    # Loose separator type matching. Checking if the input/format is surrounded by
+    # either single or double quotes.
+    result = self.client.execute(r'''select cast("2017-./,';: 06-01" as '''
+        r'''timestamp FORMAT "YYYY', -MM;:.DD")''')
+    assert result.data == ["2017-06-01 00:00:00"]
+
+    result = self.client.execute(r'''select cast('2017-./,\';: 07-01' as '''
+        r'''timestamp FORMAT "YYYY', -MM;:.DD")''')
+    assert result.data == ["2017-07-01 00:00:00"]
+
+    result = self.client.execute(r'''select cast("2017-./,';: 08-01" as '''
+        r'''timestamp FORMAT 'YYYY\', -MM;:.DD')''')
+    assert result.data == ["2017-08-01 00:00:00"]
+
+    result = self.client.execute(r'''select cast('2017-./,\';: 09-01' as '''
+        r'''timestamp FORMAT 'YYYY\', -MM;:.DD')''')
+    assert result.data == ["2017-09-01 00:00:00"]
+
+    # Escaped double quotes in the input are not taken as the escaping character for the
+    # following single quote.
+    result = self.client.execute(r'''select cast("2013\\'09-01" as '''
+        r'''timestamp FORMAT "YYYY'MM-DD")''')
+    assert result.data == ["NULL"]
+
+    result = self.client.execute(r'''select cast("2013\\\'09-02" as '''
+        r'''timestamp FORMAT "YYYY'MM-DD")''')
+    assert result.data == ["NULL"]
+
+    result = self.client.execute(r'''select cast("2013\\\\'09-03" as '''
+        r'''timestamp FORMAT "YYYY'MM-DD")''')
+    assert result.data == ["NULL"]
+
     # If the input string has unprocessed tokens
     result = self.client.execute("select cast('2017-05-01 12:30' as "
         "timestamp FORMAT 'YYYY-MM-DD')")
@@ -781,6 +813,11 @@
         r'''"YYYY\"\\\"MM\"\\\"DD")''')
     assert "Bad date/time conversion format" in str(err)
 
+    # Free text token where an escaped backslash precedes an escaped single quote.
+    result = self.client.execute(r'''select cast("2010-\\'-02-01" as date format '''
+        r''' 'FXYYYY-"\\\'"-MM-DD') ''')
+    assert result.data == ["2010-02-01"]
+
   def test_fm_fx_modifiers(self):
     # Exact mathcing for the whole format.
     result = self.client.execute("select cast('2001-03-01 03:10:15.123456 -01:30' as "
@@ -800,6 +837,23 @@
         "'FXYYYY-MM-DD ')")
     assert result.data == ["NULL"]
 
+    # Strict matching of single quote separator.
+    result = self.client.execute(r'''select cast('2001\'04-01' as timestamp format'''
+        r''' 'FXYYYY\'MM-DD')''')
+    assert result.data == ["2001-04-01 00:00:00"]
+
+    result = self.client.execute(r'''select cast("2001'04-02" as date format'''
+        r''' 'FXYYYY\'MM-DD')''')
+    assert result.data == ["2001-04-02"]
+
+    result = self.client.execute(r'''select cast('2001\'04-03' as timestamp format'''
+        r''' "FXYYYY'MM-DD")''')
+    assert result.data == ["2001-04-03 00:00:00"]
+
+    result = self.client.execute(r'''select cast("2001'04-04" as date format'''
+        r''' "FXYYYY'MM-DD")''')
+    assert result.data == ["2001-04-04"]
+
     # Strict token length matching.
     result = self.client.execute("select cast('2001-3-05' as timestamp format "
         "'FXYYYY-MM-DD')")
@@ -846,6 +900,10 @@
         "format 'FXFMYYYY-MM-DD HH12:MI:SS')")
     assert result.data == ["2019-03-10 04:15:00"]
 
+    result = self.client.execute("select cast('2004-03-08 03:15:00 AM' as timestamp "
+        "format 'FXYYYY-MM-DD HH12:MI:SS FMP.M.')")
+    assert result.data == ["2004-03-08 03:15:00"]
+
     # Multiple FM modifiers in a format.
     result = self.client.execute("select cast('2001-3-11 3:15:00.12345' as timestamp "
         "format 'FXYYYY-FMMM-DD FMHH12:MI:SS.FMFF')")
@@ -866,7 +924,7 @@
         ''' 'FXYYYY-MMFM"text"DD')''')
     assert result.data == ["NULL"]
 
-    # FM modifier skips the separators and effects the next non-separator token.
+    # FM modifier skips the separators and affects the next non-separator token.
     result = self.client.execute(r'''select cast('1999-10-2' as timestamp format '''
         ''' 'FXYYYY-MMFM-DD')''')
     assert result.data == ["1999-10-02 00:00:00"]
@@ -914,6 +972,11 @@
         "as string format 'FXFMYYYY-FMMM-FMDD')")
     assert result.data == ["1-4-10"]
 
+    # FX and FM modifiers are case-insensitive.
+    result = self.client.execute("select cast('2019-5-10' as date format "
+        "'fxYYYY-fmMM-DD')")
+    assert result.data == ["2019-05-10"]
+
   def test_format_parse_errors(self):
     # Invalid format
     err = self.execute_query_expect_failure(self.client,