be/src/exprs/string-functions-ir.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "exprs/string-functions.h"

 #include <cctype>
 #include <numeric>
 #include <stdint.h>
 #include <re2/re2.h>
 #include <re2/stringpiece.h>

 #include <boost/static_assert.hpp>

 #include "exprs/anyval-util.h"
 #include "exprs/scalar-expr.h"
 #include "gen-cpp/Metrics_types.h"
 #include "gutil/strings/charset.h"
 #include "gutil/strings/substitute.h"
 #include "runtime/string-value.inline.h"
 #include "runtime/tuple-row.h"
 #include "util/bit-util.h"
 #include "util/coding-util.h"
 #include "util/pretty-printer.h"
 #include "util/string-util.h"
 #include "util/ubsan.h"
 #include "util/url-parser.h"

 #include "common/names.h"

 using namespace impala_udf;
 using std::bitset;
 using std::any_of;

 // NOTE: be careful not to use string::append.  It is not performant.
 namespace impala {

 // Error message template used with Substitute(): $0 names the value that is too large
 // and $1 is the formatted size limit.
 const char* ERROR_CHARACTER_LIMIT_EXCEEDED =
   "$0 is larger than allowed limit of $1 character data.";

 // Initialised to 8 << 20, i.e. 8 MiB.
 // NOTE(review): presumably the memory budget handed to RE2 when compiling regular
 // expressions -- confirm against the users of this member.
 uint64_t StringFunctions::re2_mem_limit_ = 8 << 20;

 // This behaves identically to the mysql implementation, namely:
 //  - 1-indexed positions
 //  - supported negative positions (count from the end of the string)
 //  - [optional] len.  No len indicates longest substr possible
 //
 // Returns NULL if any argument is NULL and an empty string if the requested range
 // does not overlap 'str'. When UTF8_MODE is set the work is delegated to
 // Utf8Substring(). The result is built from a pointer into 'str'.
 //
 // All position/length arithmetic is done in 64 bits: 'pos' and 'len' are BIGINTs, and
 // narrowing them to int before range-checking would let out-of-range values wrap
 // around (e.g. a position of 2^32 + 1 would be treated as position 1).
 StringVal StringFunctions::Substring(FunctionContext* context,
     const StringVal& str, const BigIntVal& pos, const BigIntVal& len) {
   if (str.is_null || pos.is_null || len.is_null) return StringVal::null();
   if (context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE)) {
     return Utf8Substring(context, str, pos, len);
   }
   const int64_t str_len = str.len;
   int64_t fixed_pos = pos.val;
   if (fixed_pos < 0) {
     // A negative position counts from the end. Anything further back than the start of
     // the string maps to no valid position; returning here also avoids arithmetic on
     // values close to INT64_MIN.
     if (fixed_pos < -str_len) return StringVal();
     fixed_pos += str_len + 1;
   }
   if (fixed_pos <= 0 || fixed_pos > str_len) return StringVal();
   // fixed_pos is now in [1, str_len], so max_len is in [1, str_len] and fits in an int.
   const int64_t max_len = str_len - fixed_pos + 1;
   const int64_t fixed_len = ::min<int64_t>(len.val, max_len);
   if (fixed_len <= 0) return StringVal();
   return StringVal(str.ptr + fixed_pos - 1, static_cast<int>(fixed_len));
 }

 // Form of substring() without a length: forwards to the overload that takes a length,
 // asking for the longest substring possible.
 StringVal StringFunctions::Substring(FunctionContext* context,
     const StringVal& str, const BigIntVal& pos) {
   // StringVal.len is an int, so INT32_MAX is the largest length worth requesting.
   const BigIntVal longest_possible(INT32_MAX);
   return Substring(context, str, pos, longest_possible);
 }

 // Form of the UTF-8 substring without a length: forwards to the overload that takes a
 // length, asking for the longest substring possible.
 StringVal StringFunctions::Utf8Substring(FunctionContext* context, const StringVal& str,
     const BigIntVal& pos) {
   const BigIntVal longest_possible(INT32_MAX);
   return Utf8Substring(context, str, pos, longest_possible);
 }

 // UTF-8 aware substring(): 'pos' and 'len' count UTF-8 characters, i.e. bytes for which
 // BitUtil::IsUtf8StartByte() is true, rather than raw bytes.
 //  - Returns NULL if any argument is NULL.
 //  - Returns an empty string if 'str' is empty, 'pos' is 0, 'len' is not positive, or
 //    'str' contains fewer than |pos| characters.
 //  - A positive 'pos' is 1-indexed from the start of the string; a negative 'pos'
 //    counts characters from the end of the string.
 // The result is built from a pointer into 'str'.
 StringVal StringFunctions::Utf8Substring(FunctionContext* context, const StringVal& str,
     const BigIntVal& pos, const BigIntVal& len) {
   if (str.is_null || pos.is_null || len.is_null) return StringVal::null();
   if (str.len == 0 || pos.val == 0 || len.val <= 0) return StringVal();

   int byte_pos;
   int utf8_cnt = 0;
   // pos.val starts at 1 (1-indexed positions).
   if (pos.val > 0) {
     // Seek to the start byte of the pos-th UTF-8 character.
     for (byte_pos = 0; utf8_cnt < pos.val && byte_pos < str.len; ++byte_pos) {
       if (BitUtil::IsUtf8StartByte(str.ptr[byte_pos])) ++utf8_cnt;
     }
     // Not enough UTF-8 characters.
     if (utf8_cnt < pos.val) return StringVal();
     // Back to the start byte of the pos-th UTF-8 character. The loop above stepped one
     // byte past it before its condition was re-checked.
     --byte_pos;
     int byte_start = byte_pos;
     // Seek to the end until we get enough UTF-8 characters.
     for (utf8_cnt = 0; utf8_cnt < len.val && byte_pos < str.len; ++byte_pos) {
       if (BitUtil::IsUtf8StartByte(str.ptr[byte_pos])) ++utf8_cnt;
     }
     if (utf8_cnt == len.val) {
       // byte_pos is now just past the start byte of the last requested character. Skip
       // any remaining bytes of that character so that it is included in full.
       while (byte_pos < str.len && !BitUtil::IsUtf8StartByte(str.ptr[byte_pos])) {
         ++byte_pos;
       }
     }
     // If fewer than 'len' characters were available, byte_pos == str.len here and the
     // substring runs to the end of the string.
     return StringVal(str.ptr + byte_start, byte_pos - byte_start);
   }
   // pos.val is negative. Seek from the end of the string.
   // NOTE(review): -pos.val is not representable when pos.val is INT64_MIN.
   // byte_end stays at str.len unless the substring stops before the end of the string.
   int byte_end = str.len;
   utf8_cnt = 0;
   byte_pos = str.len - 1;
   while (utf8_cnt < -pos.val && byte_pos >= 0) {
     if (BitUtil::IsUtf8StartByte(str.ptr[byte_pos])) {
       ++utf8_cnt;
       // Remember the end of the substring's last UTF-8 character. This start byte
       // belongs to the first character after the substring, so it is the exclusive
       // end of the result.
       if (utf8_cnt > 0 && utf8_cnt == -pos.val - len.val) byte_end = byte_pos;
     }
     --byte_pos;
   }
   // Not enough UTF-8 characters.
   if (utf8_cnt < -pos.val) return StringVal();
   // Back to the start byte of the substring's first UTF-8 character.
   ++byte_pos;
   return StringVal(str.ptr + byte_pos, byte_end - byte_pos);
 }

 // This behaves identically to the mysql implementation.
 // Equivalent to Substring() starting at position 1 with length 'len'.
 StringVal StringFunctions::Left(
     FunctionContext* context, const StringVal& str, const BigIntVal& len) {
   const BigIntVal first_position(1);
   return Substring(context, str, first_position, len);
 }

 // This behaves identically to the mysql implementation.
 // Implemented as Substring() with a negative start position and length 'len'.
 StringVal StringFunctions::Right(
     FunctionContext* context, const StringVal& str, const BigIntVal& len) {
   // Clamp the start so that it never lies before the beginning of 'str'; otherwise
   // Substring() would hand back an empty string.
   const int64_t counted_from_end = -len.val;
   const int64_t start_of_string = static_cast<int64_t>(-str.len);
   const int64_t start = ::max(counted_from_end, start_of_string);
   return Substring(context, str, BigIntVal(start), len);
 }

 // Returns a string made of 'len' space characters. A NULL 'len' yields NULL and a
 // non-positive 'len' yields an empty string. A 'len' above StringVal::MAX_LENGTH sets
 // an error on 'context' and yields NULL.
 StringVal StringFunctions::Space(FunctionContext* context, const BigIntVal& len) {
   if (len.is_null) return StringVal::null();
   const int64_t num_spaces = len.val;
   if (num_spaces <= 0) return StringVal();
   if (num_spaces > StringVal::MAX_LENGTH) {
     const auto error_msg = Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
         "space() result",
         PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES));
     context->SetError(error_msg.c_str());
     return StringVal::null();
   }
   StringVal spaces(context, num_spaces);
   // The allocation may have failed.
   if (UNLIKELY(spaces.is_null)) return StringVal::null();
   memset(spaces.ptr, ' ', num_spaces);
   return spaces;
 }

 // Returns 'str' concatenated with itself 'n' times. NULL arguments yield NULL; an empty
 // 'str' or a non-positive 'n' yields an empty string. If 'n' or the resulting length
 // exceeds StringVal::MAX_LENGTH, an error is set on 'context' and NULL is returned.
 StringVal StringFunctions::Repeat(
     FunctionContext* context, const StringVal& str, const BigIntVal& n) {
   if (str.is_null || n.is_null) return StringVal::null();
   if (str.len == 0 || n.val <= 0) return StringVal();
   if (n.val > StringVal::MAX_LENGTH) {
     const auto error_msg = Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
         "Number of repeats in repeat() call",
         PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES));
     context->SetError(error_msg.c_str());
     return StringVal::null();
   }
   // With 'n' bounded above, the product below cannot overflow an int64_t.
   static_assert(numeric_limits<int64_t>::max() / numeric_limits<int>::max()
       >= StringVal::MAX_LENGTH,
       "multiplying StringVal::len with positive int fits in int64_t");
   const int64_t total_len = str.len * n.val;
   if (total_len > StringVal::MAX_LENGTH) {
     const auto error_msg = Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
         "repeat() result",
         PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES));
     context->SetError(error_msg.c_str());
     return StringVal::null();
   }
   StringVal repeated(context, static_cast<int>(total_len));
   if (UNLIKELY(repeated.is_null)) return StringVal::null();
   // Lay down one copy of 'str' after another until the buffer is full.
   uint8_t* const buffer_end = repeated.ptr + total_len;
   for (uint8_t* dst = repeated.ptr; dst != buffer_end; dst += str.len) {
     memcpy(dst, str.ptr, str.len);
   }
   return repeated;
 }

 // Pads 'str' on the left with repetitions of 'pad' until it is 'len' bytes long.
 //  - Returns NULL if any argument is NULL or 'len' is negative.
 //  - If 'len' is not larger than the length of 'str', returns the first 'len' bytes of
 //    'str'.
 //  - If 'pad' is empty, 'str' cannot be extended and is returned with at most its own
 //    length.
 //  - If 'len' exceeds StringVal::MAX_LENGTH, sets an error on 'context' and returns
 //    NULL.
 // Lengths are counted in bytes; UTF8_MODE is not consulted here.
 StringVal StringFunctions::Lpad(FunctionContext* context, const StringVal& str,
     const BigIntVal& len, const StringVal& pad) {
   if (str.is_null || len.is_null || pad.is_null || len.val < 0) return StringVal::null();
   // Corner cases: Shrink the original string, or leave it alone.
   // TODO: Hive seems to go into an infinite loop if pad.len == 0,
   // so we should pay attention to Hive's future solution to be compatible.
   if (len.val <= str.len || pad.len == 0) {
     // Never report more bytes than 'str' actually holds: with an empty pad 'len' may be
     // larger than str.len, and the returned StringVal points into the buffer of 'str'.
     const int64_t kept_len = ::min<int64_t>(len.val, str.len);
     return StringVal(str.ptr, static_cast<int>(kept_len));
   }
   if (len.val > StringVal::MAX_LENGTH) {
     context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
         "lpad() result",
         PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
     return StringVal::null();
   }
   StringVal result(context, len.val);
   if (UNLIKELY(result.is_null)) return StringVal::null();
   // len.val > str.len here, so the prefix is at least one byte long.
   int padded_prefix_len = len.val - str.len;
   int pad_index = 0;
   int result_index = 0;
   uint8_t* ptr = result.ptr;

   // Prepend chars of pad, cycling through 'pad' as often as needed.
   while (result_index < padded_prefix_len) {
     ptr[result_index++] = pad.ptr[pad_index++];
     pad_index = pad_index % pad.len;
   }

   // Append given string.
   memcpy(ptr + result_index, str.ptr, str.len);
   return result;
 }

 // Pads 'str' on the right with repetitions of 'pad' until it is 'len' bytes long.
 //  - Returns NULL if any argument is NULL or 'len' is negative.
 //  - If 'len' is not larger than the length of 'str', returns the first 'len' bytes of
 //    'str'.
 //  - If 'pad' is empty, 'str' cannot be extended and is returned with at most its own
 //    length.
 //  - If 'len' exceeds StringVal::MAX_LENGTH, sets an error on 'context' and returns
 //    NULL.
 // Lengths are counted in bytes; UTF8_MODE is not consulted here.
 StringVal StringFunctions::Rpad(FunctionContext* context, const StringVal& str,
     const BigIntVal& len, const StringVal& pad) {
   if (str.is_null || len.is_null || pad.is_null || len.val < 0) return StringVal::null();
   // Corner cases: Shrink the original string, or leave it alone.
   // TODO: Hive seems to go into an infinite loop if pad->len == 0,
   // so we should pay attention to Hive's future solution to be compatible.
   if (len.val <= str.len || pad.len == 0) {
     // Never report more bytes than 'str' actually holds: with an empty pad 'len' may be
     // larger than str.len, and the returned StringVal points into the buffer of 'str'.
     const int64_t kept_len = ::min<int64_t>(len.val, str.len);
     return StringVal(str.ptr, static_cast<int>(kept_len));
   }
   if (len.val > StringVal::MAX_LENGTH) {
     context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
         "rpad() result",
         PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
     return StringVal::null();
   }

   StringVal result(context, len.val);
   if (UNLIKELY(result.is_null)) return StringVal::null();
   // The original string comes first.
   memcpy(result.ptr, str.ptr, str.len);

   // Append chars of pad until desired length, cycling through 'pad' as often as needed.
   uint8_t* ptr = result.ptr;
   int pad_index = 0;
   int result_len = str.len;
   while (result_len < len.val) {
     ptr[result_len++] = pad.ptr[pad_index++];
     pad_index = pad_index % pad.len;
   }
   return result;
 }

 // Returns the length of 'str': the result of Utf8Length() when UTF8_MODE is set,
 // otherwise the number of bytes. NULL input yields NULL.
 IntVal StringFunctions::Length(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return IntVal::null();
   const bool utf8_mode =
       context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE, 0);
   return utf8_mode ? Utf8Length(context, str) : IntVal(str.len);
 }
 // Returns the number of bytes in 'str' regardless of UTF8_MODE. NULL input yields
 // NULL.
 IntVal StringFunctions::Bytes(FunctionContext* context, const StringVal& str) {
   return str.is_null ? IntVal::null() : IntVal(str.len);
 }

 // Length of a fixed-buffer string argument as computed by
 // StringValue::UnpaddedCharLength() over the declared length of argument 0. NULL input
 // yields NULL.
 IntVal StringFunctions::CharLength(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return IntVal::null();
   const FunctionContext::TypeDesc* arg_type = context->GetArgType(0);
   DCHECK_EQ(arg_type->type, FunctionContext::TYPE_FIXED_BUFFER);
   char* chars = reinterpret_cast<char*>(str.ptr);
   return StringValue::UnpaddedCharLength(chars, arg_type->len);
 }

 // Counts the bytes among the first 'len' bytes at 'ptr' for which
 // BitUtil::IsUtf8StartByte() is true. Returns 0 if 'ptr' is null.
 static int CountUtf8Chars(uint8_t* ptr, int len) {
   int num_chars = 0;
   if (ptr == nullptr) return num_chars;
   for (int remaining = len; remaining > 0; --remaining, ++ptr) {
     if (BitUtil::IsUtf8StartByte(*ptr)) ++num_chars;
   }
   return num_chars;
 }

 // Returns the number of UTF-8 start bytes in 'str' as counted by CountUtf8Chars().
 // NULL input yields NULL.
 IntVal StringFunctions::Utf8Length(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return IntVal::null();
   const int num_chars = CountUtf8Chars(str.ptr, str.len);
   return IntVal(num_chars);
 }

 // Lower-cases 'str', dispatching to LowerUtf8() when UTF8_MODE is set and to
 // LowerAscii() otherwise. NULL input yields NULL.
 StringVal StringFunctions::Lower(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return StringVal::null();
   const bool utf8_mode = context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE);
   return utf8_mode ? LowerUtf8(context, str) : LowerAscii(context, str);
 }

 // Byte-wise lower-casing through ::tolower() into a newly allocated buffer of the same
 // length. Returns NULL if the allocation fails.
 StringVal StringFunctions::LowerAscii(FunctionContext* context, const StringVal& str) {
   // Not in UTF-8 mode, only English alphabetic characters will be converted.
   StringVal lowered(context, str.len);
   if (UNLIKELY(lowered.is_null)) return StringVal::null();
   const uint8_t* src = str.ptr;
   uint8_t* dst = lowered.ptr;
   for (int remaining = str.len; remaining > 0; --remaining) {
     *dst++ = ::tolower(*src++);
   }
   return lowered;
 }

 // Upper-cases 'str', dispatching to UpperUtf8() when UTF8_MODE is set and to
 // UpperAscii() otherwise. NULL input yields NULL.
 StringVal StringFunctions::Upper(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return StringVal::null();
   const bool utf8_mode = context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE);
   return utf8_mode ? UpperUtf8(context, str) : UpperAscii(context, str);
 }

 // Byte-wise upper-casing through ::toupper() into a newly allocated buffer of the same
 // length. Returns NULL if the allocation fails.
 StringVal StringFunctions::UpperAscii(FunctionContext* context, const StringVal& str) {
   // Not in UTF-8 mode, only English alphabetic characters will be converted.
   StringVal uppered(context, str.len);
   if (UNLIKELY(uppered.is_null)) return StringVal::null();
   const uint8_t* src = str.ptr;
   uint8_t* dst = uppered.ptr;
   for (int remaining = str.len; remaining > 0; --remaining) {
     *dst++ = ::toupper(*src++);
   }
   return uppered;
 }

 // Capitalises words: the result equals the input except that the first character of
 // every word is upper-cased and every other character is lower-cased. A NULL input
 // produces NULL. Dispatches to InitCapUtf8() when UTF8_MODE is set and to
 // InitCapAscii() otherwise.
 StringVal StringFunctions::InitCap(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return StringVal::null();
   const bool utf8_mode = context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE);
   return utf8_mode ? InitCapUtf8(context, str) : InitCapAscii(context, str);
 }

 // Byte-wise initcap: bytes for which isspace() is true are copied unchanged and start a
 // new word; the first byte of each word goes through toupper(), all others through
 // tolower(). Returns NULL if the result buffer cannot be allocated.
 StringVal StringFunctions::InitCapAscii(FunctionContext* context, const StringVal& str) {
   StringVal capitalized(context, str.len);
   if (UNLIKELY(capitalized.is_null)) return StringVal::null();
   uint8_t* out = capitalized.ptr;
   bool at_word_start = true;
   for (int pos = 0; pos < str.len; ++pos) {
     const uint8_t ch = str.ptr[pos];
     const bool is_separator = isspace(ch);
     if (is_separator) {
       out[pos] = ch;
     } else if (at_word_start) {
       out[pos] = toupper(ch);
     } else {
       out[pos] = tolower(ch);
     }
     // The byte following a separator begins the next word.
     at_word_start = is_separator;
   }
   return capitalized;
 }

 /// Adds a warning to 'context' about an illegal multi-byte character in 'str'. The
 /// warning lists up to four bytes starting at 'current_idx' in hex together with the
 /// name of the current locale. Used in Utf8CaseConversion().
 static void ReportErrorBytes(FunctionContext* context, const StringVal& str,
     int current_idx) {
   DCHECK_LT(current_idx, str.len);
   stringstream bytes_desc;
   bytes_desc << "[0x" << std::hex
              << static_cast<int>(DCHECK_NOTNULL(str.ptr)[current_idx]);
   // At most three more bytes, without running past the end of the string.
   const int dump_end = (current_idx + 4 < str.len) ? current_idx + 4 : str.len;
   for (int idx = current_idx + 1; idx < dump_end; ++idx) {
     bytes_desc << ", 0x" << std::hex << static_cast<int>(str.ptr[idx]);
   }
   bytes_desc << "]";
   const auto warning = Substitute(
       "Illegal multi-byte character in string. Leading bytes: $0. Current locale: $1",
       bytes_desc.str(), std::locale("").name());
   context->AddWarning(warning.c_str());
 }

 /// Converts string based on the transform function 'fn'. The unit of the conversion is
 /// a wchar_t which is parsed from the multibyte input using std::mbrtowc().
 /// The transform function 'fn' accepts two parameters: the original wide character
 /// (passed as a uint32_t) and a pointer to a flag indicating whether it's the first
 /// character of a word.
 /// After the transformation, the wide character is converted back to bytes using
 /// std::wcrtomb().
 /// Input that std::mbrtowc() rejects is reported through ReportErrorBytes() and
 /// replaced with U+FFFD without being passed to 'fn'. Returns NULL if allocating or
 /// resizing the result buffer fails.
 static StringVal Utf8CaseConversion(FunctionContext* context, const StringVal& str,
     uint32_t (*fn)(uint32_t, bool*)) {
   // Usually the upper/lower cases have the same size in bytes. Here we add 4 bytes
   // buffer in case of illegal Unicodes.
   int max_result_bytes = str.len + 4;
   StringVal result(context, max_result_bytes);
   if (UNLIKELY(result.is_null)) return StringVal::null();
   wchar_t wc;
   int wc_bytes;
   bool word_start = true;
   uint8_t* result_ptr = result.ptr;
   // Separate conversion states for decoding (mbrtowc) and encoding (wcrtomb).
   std::mbstate_t wc_state{};
   std::mbstate_t mb_state{};
   for (int i = 0; i < str.len; i += wc_bytes) {
     // std::mbtowc converts a multibyte sequence to a wide character. It's not
     // thread safe. Here we use std::mbrtowc instead.
     wc_bytes = std::mbrtowc(&wc, reinterpret_cast<char*>(str.ptr + i), str.len - i,
         &wc_state);
     bool needs_conversion = true;
     if (wc_bytes == 0) {
       // std::mbrtowc returns 0 when hitting '\0'. Treat it as a one-byte character so
       // that the loop keeps advancing.
       wc = 0;
       wc_bytes = 1;
     } else if (wc_bytes < 0) {
       // The error values returned by std::mbrtowc become negative once stored in an
       // int.
       ReportErrorBytes(context, str, i);
       // Replace it to the replacement character (U+FFFD)
       wc = 0xFFFD;
       needs_conversion = false;
       // Jump to the next legal UTF-8 start byte.
       wc_bytes = 1;
       while (i + wc_bytes < str.len && !BitUtil::IsUtf8StartByte(str.ptr[i + wc_bytes])) {
         wc_bytes++;
       }
     }
     if (needs_conversion) wc = fn(wc, &word_start);
     // std::wctomb converts a wide character to a multibyte sequence. It's not
     // thread safe. Here we use std::wcrtomb instead.
     int res_bytes = std::wcrtomb(reinterpret_cast<char*>(result_ptr), wc, &mb_state);
     if (res_bytes <= 0) {
       // Nothing was written for this character. Only warn when the character came out
       // of 'fn'; the illegal-input case has already been reported above.
       if (needs_conversion) {
         context->AddWarning(Substitute(
             "Ignored illegal wide character in results: $0. Current locale: $1",
             wc, std::locale("").name()).c_str());
       }
       continue;
     }
     result_ptr += res_bytes;
     // Grow the buffer once fewer than 4 spare bytes are left for the next character.
     // NOTE(review): once max_result_bytes has been clamped to StringVal::MAX_LENGTH the
     // buffer can no longer grow -- confirm how that case is expected to behave.
     if (result_ptr - result.ptr > max_result_bytes - 4) {
       // Double the result buffer for overflow
       max_result_bytes *= 2;
       max_result_bytes = min<int>(StringVal::MAX_LENGTH,
           static_cast<int>(BitUtil::RoundUpToPowerOfTwo(max_result_bytes)));
       // Remember the write position as an offset: result.ptr is re-read after Resize().
       int offset = result_ptr - result.ptr;
       if (UNLIKELY(!result.Resize(context, max_result_bytes))) return StringVal::null();
       result_ptr = result.ptr + offset;
     }
   }
   // Shrink the reported length to the bytes actually produced.
   result.len = result_ptr - result.ptr;
   return result;
 }

 // Lower-cases 'str' one wide character at a time via Utf8CaseConversion() and
 // std::towlower().
 StringVal StringFunctions::LowerUtf8(FunctionContext* context, const StringVal& str) {
   // The word-start flag is not used for plain lower-casing.
   auto to_lower = [](uint32_t wc, bool* /* word_start */) -> uint32_t {
     return std::towlower(wc);
   };
   return Utf8CaseConversion(context, str, to_lower);
 }

 // Upper-cases 'str' one wide character at a time via Utf8CaseConversion() and
 // std::towupper().
 StringVal StringFunctions::UpperUtf8(FunctionContext* context, const StringVal& str) {
   // The word-start flag is not used for plain upper-casing.
   auto to_upper = [](uint32_t wc, bool* /* word_start */) -> uint32_t {
     return std::towupper(wc);
   };
   return Utf8CaseConversion(context, str, to_upper);
 }

 // Initcap over wide characters via Utf8CaseConversion(): characters for which
 // iswspace() is true pass through unchanged and start a new word; the first character
 // of a word is upper-cased and every other character is lower-cased.
 StringVal StringFunctions::InitCapUtf8(FunctionContext* context, const StringVal& str) {
   auto init_cap = [](uint32_t wc, bool* at_word_start) -> uint32_t {
     if (UNLIKELY(iswspace(wc))) {
       *at_word_start = true;
       return wc;
     }
     const bool capitalize = *at_word_start;
     *at_word_start = false;
     const uint32_t converted = capitalize ? std::towupper(wc) : std::towlower(wc);
     return converted;
   };
   return Utf8CaseConversion(context, str, init_cap);
 }

 // State cached by ReplacePrepare() for a constant pattern: the pattern as a StringValue
 // and a StringSearch built over it. 'search' is given the address of 'pattern', so
 // 'pattern' is declared, and therefore initialised, first.
 struct ReplaceContext {
   ReplaceContext(StringVal *pattern_in)
     : pattern(StringValue::FromStringVal(*pattern_in)),
       search(&pattern) {}

   StringValue pattern;
   StringSearch search;
 };

 // Prepare function for Replace(). When argument 1 (the pattern) is a constant, non-NULL
 // and non-empty string, builds a ReplaceContext for it and stores it as the function
 // state for 'scope'. Only acts for FRAGMENT_LOCAL scope.
 void StringFunctions::ReplacePrepare(FunctionContext* context,
     FunctionContext::FunctionStateScope scope) {
   const bool cacheable =
       scope == FunctionContext::FRAGMENT_LOCAL && context->IsArgConstant(1);
   if (!cacheable) return;
   DCHECK_EQ(context->GetArgType(1)->type, FunctionContext::TYPE_STRING);
   StringVal* const_pattern = reinterpret_cast<StringVal*>(context->GetConstantArg(1));
   if (const_pattern->is_null || const_pattern->len == 0) return;

   ReplaceContext* state = context->Allocate<ReplaceContext>();
   // Without memory there is nothing to cache.
   if (state == nullptr) return;
   new (state) ReplaceContext(const_pattern);
   context->SetFunctionState(scope, state);
 }

 // Close function for Replace(): frees the FRAGMENT_LOCAL function state through
 // 'context' and clears the state pointer. Does nothing for other scopes.
 void StringFunctions::ReplaceClose(FunctionContext* context,
     FunctionContext::FunctionStateScope scope) {
   if (scope != FunctionContext::FRAGMENT_LOCAL) return;
   ReplaceContext* state = reinterpret_cast<ReplaceContext*>(
       context->GetFunctionState(FunctionContext::FRAGMENT_LOCAL));
   uint8_t* state_bytes = reinterpret_cast<uint8_t*>(state);
   context->Free(state_bytes);
   context->SetFunctionState(scope, nullptr);
 }

 // Replaces every non-overlapping occurrence of 'pattern' in 'str' with 'replace',
 // scanning from left to right.
 //  - Returns NULL if any argument is NULL.
 //  - Returns 'str' itself if 'pattern' is empty, longer than 'str', or not found.
 //  - Sets an error on 'context' and returns NULL if the result would need more than
 //    StringVal::MAX_LENGTH bytes; also returns NULL if allocating or resizing the
 //    result buffer fails.
 // Uses the StringSearch stored in the FRAGMENT_LOCAL function state when one is
 // present (see ReplacePrepare()); otherwise builds one from 'pattern' for this call.
 StringVal StringFunctions::Replace(FunctionContext* context, const StringVal& str,
     const StringVal& pattern, const StringVal& replace) {
   DCHECK_LE(str.len, StringVal::MAX_LENGTH);
   DCHECK_LE(pattern.len, StringVal::MAX_LENGTH);
   DCHECK_LE(replace.len, StringVal::MAX_LENGTH);
   if (str.is_null || pattern.is_null || replace.is_null) return StringVal::null();
   if (pattern.len == 0 || pattern.len > str.len) return str;

   // StringSearch keeps a pointer to the StringValue object, so it must remain
   // in scope if used.
   StringSearch search;
   StringValue needle;
   const StringSearch *search_ptr;
   const ReplaceContext* rptr = reinterpret_cast<ReplaceContext*>
       (context->GetFunctionState(FunctionContext::FRAGMENT_LOCAL));
   if (UNLIKELY(rptr == nullptr)) {
     // No cached state: build the search for this call only.
     needle = StringValue::FromStringVal(pattern);
     search = StringSearch(&needle);
     search_ptr = &search;
   } else {
     search_ptr = &rptr->search;
   }

   const StringValue haystack = StringValue::FromStringVal(str);
   int64_t match_pos = search_ptr->Search(&haystack);

   // No match?  Skip everything.
   if (match_pos < 0) return str;

   StringValue::SimpleString haystack_s = haystack.ToSimpleString();

   DCHECK_GT(pattern.len, 0);
   DCHECK_GE(haystack_s.len, pattern.len);
   int buffer_space;
   // Change in length caused by a single replacement; negative when the replacement is
   // shorter than the pattern.
   const int delta = replace.len - pattern.len;
   // MAX_LENGTH is unsigned, so convert back to int to do correctly signed compare
   DCHECK_LE(delta, static_cast<int>(StringVal::MAX_LENGTH) - 1);
   if ((delta > 0 && delta < 128) && haystack_s.len <= 128) {
     // Quick estimate for potential matches - this heuristic is needed to win
     // over regexp_replace on expanding patterns.  128 is arbitrarily chosen so
     // we can't massively over-estimate the buffer size.
     // Every position holding the first byte of the pattern counts as a possible match.
     int matches_possible = 0;
     char c = pattern.ptr[0];
     for (int i = 0; i <= haystack_s.len - pattern.len; ++i) {
       if (haystack_s.ptr[i] == c) ++matches_possible;
     }
     buffer_space = haystack_s.len + matches_possible * delta;
   } else {
     // Note - cannot overflow because pattern.len is at least one
     static_assert(StringVal::MAX_LENGTH - 1 + StringVal::MAX_LENGTH <=
         std::numeric_limits<decltype(buffer_space)>::max(),
         "Buffer space computation can overflow");
     // Room for exactly one replacement; grown below if more matches turn up.
     buffer_space = haystack_s.len + delta;
   }

   StringVal result(context, buffer_space);
   // result may be NULL if we went over MAX_LENGTH or the allocation failed.
   if (UNLIKELY(result.is_null)) return result;

   // 'ptr' is the write position in the result; 'consumed' is the number of input bytes
   // that have already been handled.
   uint8_t* ptr = result.ptr;
   int consumed = 0;
   while (match_pos + pattern.len <= haystack_s.len) {
     // Copy in original string
     const int unmatched_bytes = match_pos - consumed;
     memcpy(ptr, &haystack_s.ptr[consumed], unmatched_bytes);
     DCHECK_LE(ptr - result.ptr + unmatched_bytes, buffer_space);
     ptr += unmatched_bytes;

     // Copy in replacement - always safe since we always leave room for one more replace
     DCHECK_LE(ptr - result.ptr + replace.len, buffer_space);
     Ubsan::MemCpy(ptr, replace.ptr, replace.len);
     ptr += replace.len;

     // Don't want to re-match within already replaced pattern
     match_pos += pattern.len;
     consumed = match_pos;

     // Search the remaining input; the returned position is relative to the substring.
     StringValue haystack_substring = haystack.Substring(match_pos);
     int match_pos_in_substring = search_ptr->Search(&haystack_substring);
     if (match_pos_in_substring < 0) break;

     match_pos += match_pos_in_substring;

     // If we had an enlarging pattern, we may need more space
     if (delta > 0) {
       const int bytes_produced = ptr - result.ptr;
       const int bytes_remaining = haystack_s.len - consumed;
       DCHECK_LE(bytes_produced, StringVal::MAX_LENGTH);
       DCHECK_LE(bytes_remaining, StringVal::MAX_LENGTH - 1);
       // Note: by above, cannot overflow
       const int min_output = bytes_produced + bytes_remaining;
       DCHECK_LE(min_output, StringVal::MAX_LENGTH);
       // Also no overflow: min_output <= MAX_LENGTH and delta <= MAX_LENGTH - 1
       const int64_t space_needed = min_output + delta;
       if (UNLIKELY(space_needed > buffer_space)) {
         // Check to see if we can allocate a large enough buffer.
         if (space_needed > StringVal::MAX_LENGTH) {
           context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
               "replace() result",
               PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
           return StringVal::null();
         }
         // Double the buffer size whenever it fills up to amortise cost of resizing.
         // Must compute next power of two using 64-bit math to avoid signed overflow.
         buffer_space = min<int>(StringVal::MAX_LENGTH,
             static_cast<int>(BitUtil::RoundUpToPowerOfTwo(space_needed)));

         // Give up if the allocation fails or we hit an error. This prevents us from
         // continuing to blow past the mem limit.
         if (UNLIKELY(!result.Resize(context, buffer_space) || context->has_error())) {
           return StringVal::null();
         }
         // Don't forget to move the pointer
         ptr = result.ptr + bytes_produced;
       }
     }
   }

   // Copy in remainder and re-adjust size
   const int bytes_remaining = haystack_s.len - consumed;
   result.len = ptr - result.ptr + bytes_remaining;
   DCHECK_LE(result.len, buffer_space);
   memcpy(ptr, &haystack_s.ptr[consumed], bytes_remaining);

   return result;
 }

 // Reverses 'str'. When UTF8_MODE is set the work is delegated to Utf8Reverse();
 // otherwise the bytes are copied in reverse order into a new buffer. Returns NULL for
 // NULL input or if the allocation fails.
 StringVal StringFunctions::Reverse(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return StringVal::null();
   const bool utf8_mode = context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE);
   if (utf8_mode) return Utf8Reverse(context, str);

   StringVal reversed(context, str.len);
   if (UNLIKELY(reversed.is_null)) return StringVal::null();
   BitUtil::ByteSwap(reversed.ptr, str.ptr, str.len);
   return reversed;
 }

 // Reverses the 'len' bytes starting at 'ptr' in place. Does nothing when 'len' is less
 // than 2.
 static inline void InPlaceReverse(uint8_t* ptr, int len) {
   if (len < 2) return;
   uint8_t* front = ptr;
   uint8_t* back = ptr + len - 1;
   // Swap the outermost pair and move both ends inwards until they meet.
   while (front < back) {
     const uint8_t held = *front;
     *front++ = *back;
     *back-- = held;
   }
 }

 // Returns a string with the UTF-8 characters (code points) in reverse order. Note that
 // this function operates on Unicode code points and not user visible characters (or
 // grapheme clusters). This is consistent with other systems, e.g. Hive, SparkSQL.
 // Returns NULL for NULL input or if the result buffer cannot be allocated, and an empty
 // string for empty input.
 StringVal StringFunctions::Utf8Reverse(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return StringVal::null();
   if (str.len == 0) return StringVal();
   StringVal result(context, str.len);
   if (UNLIKELY(result.is_null)) return StringVal::null();
   // First make a copy of the reversed string.
   BitUtil::ByteSwap(result.ptr, str.ptr, str.len);
   // Then reverse bytes inside each UTF-8 character. After the byte-wise reversal every
   // character appears as its continuation bytes followed by its start byte.
   // 'last' is the index of the final byte of the run that still has to be put back in
   // order. It starts at the last valid index (not result.len) so that malformed input
   // whose reversed form ends in continuation bytes can never make InPlaceReverse()
   // touch the byte past the end of the buffer.
   int last = result.len - 1;
   for (int i = result.len - 1; i >= 0; --i) {
     if (BitUtil::IsUtf8StartByte(result.ptr[i])) {
       // Only reverse bytes of a UTF-8 character: [i + 1, last] holds the bytes that
       // follow this start byte, and a run of a single byte needs no work.
       if (last - i > 1) InPlaceReverse(result.ptr + i + 1, last - i);
       last = i;
     }
   }
   // Bytes [0, last] form the first character of the result (or, for input without any
   // start byte, the whole buffer).
   if (last > 0) InPlaceReverse(result.ptr, last + 1);
   return result;
 }

 // Byte-wise translate(): each byte of 'str' that occurs in 'src' is replaced with the
 // byte at the same index in 'dst', or dropped when 'dst' is too short to have that
 // index. Bytes that do not occur in 'src' are copied unchanged. The first occurrence
 // in 'src' decides. Returns NULL if any argument is NULL or the allocation fails.
 StringVal StringFunctions::Translate(FunctionContext* context, const StringVal& str,
     const StringVal& src, const StringVal& dst) {
   if (str.is_null || src.is_null || dst.is_null) return StringVal::null();
   StringVal translated(context, str.len);
   if (UNLIKELY(translated.is_null)) return translated;

   // TODO: if we know src and dst are constant, we can prebuild a conversion
   // table to remove the inner loop.
   int out_len = 0;
   for (int pos = 0; pos < str.len; ++pos) {
     const uint8_t ch = str.ptr[pos];
     // Find the first index in 'src' that holds this byte.
     int src_idx = 0;
     while (src_idx < src.len && src.ptr[src_idx] != ch) ++src_idx;

     if (src_idx >= src.len) {
       // Not mentioned in 'src': keep the byte as it is.
       translated.ptr[out_len++] = ch;
     } else if (src_idx < dst.len) {
       translated.ptr[out_len++] = dst.ptr[src_idx];
     }
     // Otherwise src[src_idx] doesn't map to any char in dst and the char is dropped.
   }
   translated.len = out_len;
   return translated;
 }

 // Rebuilds the set of characters to trim from 'chars_to_trim'. Outside UTF-8 mode each
 // byte is recorded in the single-byte set. In UTF-8 mode the input is split into
 // characters by the size reported for each lead byte: one-byte characters go into the
 // single-byte set, and for longer characters a pointer into 'chars_to_trim' is stored
 // in the list matching the character's size.
 void StringFunctions::TrimContext::Reset(const StringVal& chars_to_trim) {
   single_byte_chars_.reset();
   double_byte_chars_.clear();
   triple_byte_chars_.clear();
   quadruple_byte_chars_.clear();

   const size_t num_bytes = chars_to_trim.len;
   if (!utf8_mode_) {
     for (size_t idx = 0; idx < num_bytes; ++idx) {
       single_byte_chars_.set(chars_to_trim.ptr[idx], true);
     }
     return;
   }

   size_t offset = 0;
   while (offset < num_bytes) {
     uint8_t* const ch = &chars_to_trim.ptr[offset];
     size_t char_size = BitUtil::NumBytesInUtf8Encoding(*ch);

     // A lead byte that announces more bytes than are left may indicate an illegal
     // UTF-8 character. Clamp the size to the remaining bytes so that later accesses
     // through the stored pointers always stay inside 'chars_to_trim'.
     if (UNLIKELY(offset + char_size > num_bytes)) char_size = num_bytes - offset;

     switch (char_size) {
       case 1:
         single_byte_chars_.set(*ch, true);
         break;
       case 2:
         double_byte_chars_.push_back(ch);
         break;
       case 3:
         triple_byte_chars_.push_back(ch);
         break;
       case 4:
         quadruple_byte_chars_.push_back(ch);
         break;
       default:
         DCHECK(false);
         break;
     }
     offset += char_size;
   }
 }

 // Returns true if the character made of the 'len' bytes at 'utf8_char' is in the set of
 // characters to trim. One-byte characters are looked up in the single-byte set; longer
 // ones are compared byte-wise against the stored characters of the same size.
 bool StringFunctions::TrimContext::Contains(const uint8_t* utf8_char, int len) const {
   if (len == 1) return single_byte_chars_.test(*utf8_char);

   const auto same_bytes = [utf8_char, len](const uint8_t* candidate) {
     return memcmp(candidate, utf8_char, len) == 0;
   };
   switch (len) {
     case 2:
       return any_of(double_byte_chars_.begin(), double_byte_chars_.end(), same_bytes);
     case 3:
       return any_of(triple_byte_chars_.begin(), triple_byte_chars_.end(), same_bytes);
     case 4:
       return any_of(
           quadruple_byte_chars_.begin(), quadruple_byte_chars_.end(), same_bytes);
     default:
       DCHECK(false);
       return false;
   }
 }

 // Prepare function for the trim family: reads UTF8_MODE from the function context and
 // forwards to DoTrimPrepare().
 void StringFunctions::TrimPrepare(FunctionContext* context,
     FunctionContext::FunctionStateScope scope) {
   const bool use_utf8 = context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE);
   DoTrimPrepare(context, scope, use_utf8);
 }

 // Prepare function that forwards to DoTrimPrepare() with UTF-8 mode always enabled.
 void StringFunctions::Utf8TrimPrepare(FunctionContext* context,
     FunctionContext::FunctionStateScope scope) {
   constexpr bool kUtf8Mode = true;
   DoTrimPrepare(context, scope, kUtf8Mode);
 }

 // Creates the TrimContext for a trim function and stores it as THREAD_LOCAL function
 // state (released in TrimClose()). The context is pre-populated when the set of
 // characters to trim is known up front: a single space when only one argument was
 // given, or the second argument when it is a non-NULL constant.
 void StringFunctions::DoTrimPrepare(FunctionContext* context,
     FunctionContext::FunctionStateScope scope, bool utf8_mode) {
   if (scope != FunctionContext::THREAD_LOCAL) return;
   TrimContext* state = new TrimContext(utf8_mode);
   context->SetFunctionState(scope, state);

   // There can be either 1 or 2 arguments.
   const int num_args = context->GetNumArgs();
   DCHECK(num_args == 1 || num_args == 2);
   if (num_args == 1) {
     // The caller didn't specify the set of characters to trim, which means that only
     // whitespace is trimmed.
     state->Reset(StringVal(" "));
     return;
   }
   if (!context->IsArgConstant(1)) return;
   DCHECK_EQ(context->GetArgType(1)->type, FunctionContext::TYPE_STRING);
   StringVal* trim_arg = reinterpret_cast<StringVal*>(context->GetConstantArg(1));
   // We shouldn't peek into Null StringVals
   if (!trim_arg->is_null) state->Reset(*trim_arg);
 }

 // Close function for the trim family: deletes the TrimContext held in the THREAD_LOCAL
 // function state and clears the state pointer. Does nothing for other scopes.
 void StringFunctions::TrimClose(
     FunctionContext* context, FunctionContext::FunctionStateScope scope) {
   if (scope != FunctionContext::THREAD_LOCAL) return;
   void* raw_state = context->GetFunctionState(scope);
   delete reinterpret_cast<TrimContext*>(raw_state);
   context->SetFunctionState(scope, nullptr);
 }

 // Shared implementation of the trim functions. 'D' selects which side(s) to trim and
 // IS_IMPLICIT_WHITESPACE says that the caller did not pass an explicit set of
 // characters. Returns NULL for a NULL 'str', and 'str' unchanged when a non-constant
 // 'chars_to_trim' is NULL. The result is built from a pointer into 'str'.
 template <StringFunctions::TrimPosition D, bool IS_IMPLICIT_WHITESPACE>
 StringVal StringFunctions::DoTrimString(FunctionContext* ctx,
     const StringVal& str, const StringVal& chars_to_trim) {
   if (str.is_null) return StringVal::null();
   TrimContext* state = reinterpret_cast<TrimContext*>(
       ctx->GetFunctionState(FunctionContext::THREAD_LOCAL));

   // A non-constant 'chars_to_trim' can differ between calls, so the TrimContext has to
   // be rebuilt from the current value.
   if (!IS_IMPLICIT_WHITESPACE && !ctx->IsArgConstant(1)) {
     if (chars_to_trim.is_null) return str;
     state->Reset(chars_to_trim);
   }

   // UTF-8 characters in UTF-8 mode are handled by DoUtf8TrimString().
   if (state->utf8_mode()) return DoUtf8TrimString<D>(str, *state);

   // Otherwise, we continue to maintain the old behavior and trim byte by byte.
   int32_t first = 0;
   int32_t last = str.len - 1;
   if constexpr (D == LEADING || D == BOTH) {
     // Advance past every leading byte that is in the trim set.
     for (; first < str.len && state->Contains(str.ptr[first]); ++first) {
     }
   }
   if constexpr (D == TRAILING || D == BOTH) {
     // Step back over every trailing byte that is in the trim set.
     for (; last >= first && state->Contains(str.ptr[last]); --last) {
     }
   }
   const int32_t trimmed_len = last - first + 1;
   return StringVal(str.ptr + first, trimmed_len);
 }

 // UTF-8 aware variant of the trim loop. Walks 'str' one encoded character at a time
 // from the side(s) selected by D and stops at the first character for which
 // 'trim_ctx.Contains()' is false. An empty 'str' is returned as is. The result points
 // into the buffer of 'str'.
 template <StringFunctions::TrimPosition D>
 StringVal StringFunctions::DoUtf8TrimString(const StringVal& str,
     const TrimContext& trim_ctx) {
   if (UNLIKELY(str.len == 0)) return str;

   const uint8_t* head = str.ptr;
   const uint8_t* tail = str.ptr + str.len;
   if constexpr (D == LEADING || D == BOTH) {
     // Advance 'head' past every leading character that belongs to the trim set.
     while (head < tail) {
       size_t width = BitUtil::NumBytesInUtf8Encoding(*head);
       // The lead byte may announce more bytes than are left; clamp to the remainder.
       if (UNLIKELY(head + width > tail)) width = tail - head;
       if (!trim_ctx.Contains(head, width)) break;
       head += width;
     }
   }
   if constexpr (D == TRAILING || D == BOTH) {
     // Pull 'tail' back over every trailing character that belongs to the trim set.
     while (head < tail) {
       const int last_char_offset = FindUtf8PosBackward(head, tail - head, 0);
       DCHECK_NE(last_char_offset, -1);
       const uint8_t* last_char = head + last_char_offset;
       if (!trim_ctx.Contains(last_char, tail - last_char)) break;
       tail = last_char;
     }
   }

   return StringVal(const_cast<uint8_t*>(head), tail - head);
 }

 // One-argument trim: removes the implicit trim set from both ends of 'str'. With
 // IS_IMPLICIT_WHITESPACE == true, DoTrimString() does not read the third argument.
 StringVal StringFunctions::Trim(FunctionContext* context, const StringVal& str) {
   const StringVal implicit_chars(" ");
   return DoTrimString<BOTH, true>(context, str, implicit_chars);
 }

 // One-argument ltrim: removes the implicit trim set from the start of 'str'. With
 // IS_IMPLICIT_WHITESPACE == true, DoTrimString() does not read the third argument.
 StringVal StringFunctions::Ltrim(FunctionContext* context, const StringVal& str) {
   const StringVal implicit_chars(" ");
   return DoTrimString<LEADING, true>(context, str, implicit_chars);
 }

 // One-argument rtrim: removes the implicit trim set from the end of 'str'. With
 // IS_IMPLICIT_WHITESPACE == true, DoTrimString() does not read the third argument.
 StringVal StringFunctions::Rtrim(FunctionContext* context, const StringVal& str) {
   const StringVal implicit_chars(" ");
   return DoTrimString<TRAILING, true>(context, str, implicit_chars);
 }

 // Two-argument ltrim: removes leading characters of 'str' that are in 'chars_to_trim'.
 StringVal StringFunctions::LTrimString(FunctionContext* ctx,
     const StringVal& str, const StringVal& chars_to_trim) {
   constexpr bool kImplicitWhitespace = false;
   return DoTrimString<LEADING, kImplicitWhitespace>(ctx, str, chars_to_trim);
 }

 // Two-argument rtrim: removes trailing characters of 'str' that are in 'chars_to_trim'.
 StringVal StringFunctions::RTrimString(FunctionContext* ctx,
     const StringVal& str, const StringVal& chars_to_trim) {
   constexpr bool kImplicitWhitespace = false;
   return DoTrimString<TRAILING, kImplicitWhitespace>(ctx, str, chars_to_trim);
 }

 // Two-argument btrim: removes characters in 'chars_to_trim' from both ends of 'str'.
 StringVal StringFunctions::BTrimString(FunctionContext* ctx,
     const StringVal& str, const StringVal& chars_to_trim) {
   constexpr bool kImplicitWhitespace = false;
   return DoTrimString<BOTH, kImplicitWhitespace>(ctx, str, chars_to_trim);
 }

 // Returns the numeric value of the first byte of 'str', NULL for a NULL input.
 IntVal StringFunctions::Ascii(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return IntVal::null();
   // Hive returns 0 when given an empty string.
   if (str.len == 0) return IntVal(0);
   return IntVal(static_cast<int32_t>(str.ptr[0]));
 }

 // Returns the 1-based position of the 'occurrence'-th match of 'substr' in 'str', or 0
 // if there is no such match.
 //  - Any NULL argument yields NULL.
 //  - 'occurrence' <= 0 sets an error on 'context' and returns 0.
 //  - 'start_position' == 0 returns 0. A positive value searches left to right starting
 //    at that 1-based position; a negative value counts the starting position from the
 //    end of the string and searches right to left.
 //  - In UTF8_MODE, 'start_position' and the result are counted in UTF-8 characters,
 //    otherwise in bytes.
 IntVal StringFunctions::Instr(FunctionContext* context, const StringVal& str,
     const StringVal& substr, const BigIntVal& start_position,
     const BigIntVal& occurrence) {
   if (str.is_null || substr.is_null || start_position.is_null || occurrence.is_null) {
     return IntVal::null();
   }
   if (occurrence.val <= 0) {
     stringstream ss;
     ss << "Invalid occurrence parameter to instr function: " << occurrence.val;
     context->SetError(ss.str().c_str());
     return IntVal(0);
   }
   if (start_position.val == 0) return IntVal(0);

   // 'start_position' is 64 bits wide but the offsets below are 32-bit ints. A start
   // position whose magnitude exceeds the byte length lies outside the string in byte
   // mode, and also in UTF-8 mode because a string cannot hold more characters than
   // bytes. Answer 0 here so that such a value is never truncated into an unrelated,
   // in-range offset.
   const int64_t str_len = str.len;
   if (start_position.val > str_len || start_position.val < -str_len) return IntVal(0);

   bool utf8_mode = context->impl()->GetConstFnAttr(FunctionContextImpl::UTF8_MODE);
   StringValue haystack = StringValue::FromStringVal(str);
   StringValue::SimpleString haystack_s = haystack.ToSimpleString();
   StringValue needle = StringValue::FromStringVal(substr);
   StringValue::SimpleString needle_s = needle.ToSimpleString();
   StringSearch search(&needle);
   int match_pos = -1;
   if (start_position.val > 0) {
     // A positive starting position indicates regular searching from the left.
     int search_start_pos = static_cast<int>(start_position.val - 1);
     if (utf8_mode) {
       // Translate the character index into a byte offset.
       search_start_pos = FindUtf8PosForward(str.ptr, str.len, search_start_pos);
     }
     if (search_start_pos >= haystack_s.len) return IntVal(0);
     for (int match_num = 0; match_num < occurrence.val; ++match_num) {
       DCHECK_LE(search_start_pos, haystack_s.len);
       StringValue haystack_substring = haystack.Substring(search_start_pos);
       int match_pos_in_substring = search.Search(&haystack_substring);
       if (match_pos_in_substring < 0) return IntVal(0);
       match_pos = search_start_pos + match_pos_in_substring;
       // The next occurrence may overlap the current one, so advance by a single byte.
       search_start_pos = match_pos + 1;
     }
   } else {
     // A negative starting position indicates searching from the right.
     int search_start_pos = utf8_mode ?
         FindUtf8PosBackward(
             str.ptr, str.len, static_cast<int>(-start_position.val - 1)) :
         static_cast<int>(haystack_s.len + start_position.val);
     // The needle must fit between search_start_pos and the end of the string
     if (search_start_pos + needle_s.len > haystack_s.len) {
       search_start_pos = haystack_s.len - needle_s.len;
     }
     if (search_start_pos < 0) return IntVal(0);
     for (int match_num = 0; match_num < occurrence.val; ++match_num) {
       DCHECK_GE(search_start_pos + needle_s.len, 0);
       DCHECK_LE(search_start_pos + needle_s.len, haystack_s.len);
       StringValue haystack_substring =
           haystack.Substring(0, search_start_pos + needle_s.len);
       match_pos = search.RSearch(&haystack_substring);
       if (match_pos < 0) return IntVal(0);
       search_start_pos = match_pos - 1;
     }
   }
   // In UTF8 mode, positions are counted by Unicode characters in UTF8 encoding.
   // If not in UTF8 mode, return positions starting from 1 at the leftmost position.
   return utf8_mode ? IntVal(CountUtf8Chars(str.ptr, match_pos) + 1) :
       IntVal(match_pos + 1);
 }

 // instr(str, substr, start_position): position of the first occurrence found when
 // searching from 'start_position'.
 IntVal StringFunctions::Instr(FunctionContext* context, const StringVal& str,
     const StringVal& substr, const BigIntVal& start_position) {
   const BigIntVal first_occurrence(1);
   return Instr(context, str, substr, start_position, first_occurrence);
 }

 // instr(str, substr): position of the first occurrence, searching from position 1.
 IntVal StringFunctions::Instr(
     FunctionContext* context, const StringVal& str, const StringVal& substr) {
   const BigIntVal from_beginning(1);
   const BigIntVal first_occurrence(1);
   return Instr(context, str, substr, from_beginning, first_occurrence);
 }

 // locate(substr, str): same search as instr(str, substr), with the first two
 // arguments swapped.
 IntVal StringFunctions::Locate(FunctionContext* context, const StringVal& substr,
     const StringVal& str) {
   return Instr(context, str, substr, BigIntVal(1), BigIntVal(1));
 }

 // locate(substr, str, start_pos): like Locate() but starts searching at 'start_pos'.
 // Returns NULL if any argument is NULL and 0 if 'start_pos' is outside [1, str.len].
 IntVal StringFunctions::LocatePos(FunctionContext* context, const StringVal& substr,
     const StringVal& str, const BigIntVal& start_pos) {
   if (str.is_null || substr.is_null || start_pos.is_null) return IntVal::null();
   // Hive returns 0 for *start_pos <= 0,
   // but throws an exception for *start_pos > str->len.
   // Since returning 0 seems to be Hive's error condition, return 0.
   const bool start_in_range = start_pos.val >= 1 && start_pos.val <= str.len;
   if (!start_in_range) return IntVal(0);
   return Instr(context, str, substr, start_pos);
 }

 // Compiles 'pattern' into a heap-allocated RE2 object that the caller owns. A non-NULL
 // 'match_parameter' is applied to the options through SetRE2Options(). Returns NULL
 // and fills '*error_str' if the match parameter is rejected or the pattern could not
 // be compiled.
 re2::RE2* CompileRegex(const StringVal& pattern, string* error_str,
     const StringVal& match_parameter) {
   DCHECK(error_str != NULL);

   re2::RE2::Options options;
   // Disable error logging in case e.g. every row causes an error
   options.set_log_errors(false);
   // Return the leftmost longest match (rather than the first match).
   options.set_longest_match(true);
   // Set the maximum memory used by re2's regex engine for storage
   StringFunctions::SetRE2MemOpt(&options);
   if (!match_parameter.is_null) {
     const bool options_ok =
         StringFunctions::SetRE2Options(match_parameter, error_str, &options);
     if (!options_ok) return nullptr;
   }

   re2::StringPiece pattern_sp(reinterpret_cast<char*>(pattern.ptr), pattern.len);
   re2::RE2* compiled = new re2::RE2(pattern_sp, options);
   if (compiled->ok()) return compiled;

   // Compilation failed: report why and release the unusable object.
   stringstream ss;
   ss << "Could not compile regexp pattern: " << AnyValUtil::ToString(pattern) << endl
      << "Error: " << compiled->error();
   *error_str = ss.str();
   delete compiled;
   return nullptr;
 }

 // This function sets options in the RE2 library before pattern matching.
 bool StringFunctions::SetRE2Options(const StringVal& match_parameter,
     string* error_str, re2::RE2::Options* opts) {
   for (int i = 0; i < match_parameter.len; i++) {
     char match = match_parameter.ptr[i];
     switch (match) {
       case 'i':
         opts->set_case_sensitive(false);
         break;
       case 'c':
         opts->set_case_sensitive(true);
         break;
       case 'm':
         opts->set_posix_syntax(true);
         opts->set_one_line(false);
         break;
       case 'n':
         opts->set_never_nl(false);
         opts->set_dot_nl(true);
         break;
       default:
         stringstream error;
         error << "Illegal match parameter " << match;
         *error_str = error.str();
         return false;
     }
   }
   return true;
 }

 void StringFunctions::SetRE2MemLimit(int64_t re2_mem_limit) {
   // TODO: include the memory requirement for re2 in the memory planner estimates
   DCHECK(re2_mem_limit > 0);
   StringFunctions::re2_mem_limit_ = re2_mem_limit;
 }

 // Copies the configured limit ('re2_mem_limit_', 8 MiB unless changed through
 // SetRE2MemLimit()) into 'opts' as the maximum memory RE2 may use for a compiled
 // regex's storage. A larger limit can be used to avoid DFA state cache flushes, which
 // result in slower execution.
 void StringFunctions::SetRE2MemOpt(re2::RE2::Options* opts) {
   opts->set_max_mem(re2_mem_limit_);
 }

 // Pre-compiles the pattern (argument 1) when it is a non-NULL constant and stores the
 // compiled RE2 as THREAD_LOCAL function state. Does nothing for other scopes, for a
 // non-constant pattern or for a NULL pattern. A pattern that fails to compile sets an
 // error on 'context'.
 void StringFunctions::RegexpPrepare(
     FunctionContext* context, FunctionContext::FunctionStateScope scope) {
   if (scope != FunctionContext::THREAD_LOCAL || !context->IsArgConstant(1)) return;
   DCHECK_EQ(context->GetArgType(1)->type, FunctionContext::TYPE_STRING);
   const StringVal* const_pattern =
       reinterpret_cast<StringVal*>(context->GetConstantArg(1));
   if (const_pattern->is_null) return;

   string compile_error;
   re2::RE2* compiled = CompileRegex(*const_pattern, &compile_error, StringVal::null());
   if (compiled != NULL) {
     context->SetFunctionState(scope, compiled);
   } else {
     context->SetError(compile_error.c_str());
   }
 }

 // Deletes the compiled RE2 stored as THREAD_LOCAL function state, if any, and clears
 // the state slot. Calls for any other scope are ignored.
 void StringFunctions::RegexpClose(
     FunctionContext* context, FunctionContext::FunctionStateScope scope) {
   if (scope == FunctionContext::THREAD_LOCAL) {
     delete reinterpret_cast<re2::RE2*>(context->GetFunctionState(scope));
     context->SetFunctionState(scope, nullptr);
   }
 }

 // Returns a copy of 'str' in which every character from REGEX_ESCAPE_CHARACTERS is
 // preceded by a backslash. NULL input yields NULL, an empty input is returned as is.
 StringVal StringFunctions::RegexpEscape(FunctionContext* context, const StringVal& str) {
   if (str.is_null) return StringVal::null();
   if (str.len == 0) return str;

   static const strings::CharSet REGEX_ESCAPE_CHARACTERS(".\\+*?[^]$(){}=!<>|:-");
   // Worst case every input byte gets a backslash, so reserve twice the input length.
   StringVal result(context, str.len * 2);
   if (UNLIKELY(result.is_null)) return StringVal::null();

   int written = 0;
   for (int i = 0; i < str.len; ++i) {
     const uint8_t ch = str.ptr[i];
     if (REGEX_ESCAPE_CHARACTERS.Test(ch)) result.ptr[written++] = '\\';
     result.ptr[written++] = ch;
   }
   // Shrink the reported length to the bytes actually produced.
   result.len = written;
   DCHECK_GE(result.len, str.len);

   return result;
 }

 // Returns the text of capture group 'index' (0 = the whole match) from the match of
 // 'pattern' in 'str'. NULL arguments yield NULL. A negative 'index', an 'index'
 // beyond the pattern's groups, or no match yields an empty string. If the pattern was
 // not pre-compiled by RegexpPrepare() it is compiled here; a compile failure adds a
 // warning and returns NULL.
 StringVal StringFunctions::RegexpExtract(FunctionContext* context, const StringVal& str,
     const StringVal& pattern, const BigIntVal& index) {
   if (str.is_null || pattern.is_null || index.is_null) return StringVal::null();
   if (index.val < 0) return StringVal();

   re2::RE2* regex = reinterpret_cast<re2::RE2*>(
       context->GetFunctionState(FunctionContext::THREAD_LOCAL));
   // Owns the regex only when it had to be compiled for this call.
   scoped_ptr<re2::RE2> local_regex;
   if (regex == NULL) {
     DCHECK(!context->IsArgConstant(1));
     string compile_error;
     regex = CompileRegex(pattern, &compile_error, StringVal::null());
     if (regex == NULL) {
       context->AddWarning(compile_error.c_str());
       return StringVal::null();
     }
     local_regex.reset(regex);
   }

   // Slot 0 is the whole match, slot N the N-th capturing group.
   const int num_slots = 1 + regex->NumberOfCapturingGroups();
   if (index.val >= num_slots) return StringVal();

   re2::StringPiece input(reinterpret_cast<char*>(str.ptr), str.len);
   // Use a vector because clang complains about non-POD varlen arrays
   // TODO: fix this
   vector<re2::StringPiece> groups(num_slots);
   const bool matched = regex->Match(
       input, 0, str.len, re2::RE2::UNANCHORED, groups.data(), num_slots);
   if (!matched) return StringVal();

   const re2::StringPiece& wanted = groups[index.val];
   return AnyValUtil::FromBuffer(context, wanted.data(), wanted.size());
 }

 // Returns 'str' with every match of 'pattern' replaced by 'replace' (via
 // RE2::GlobalReplace). NULL arguments yield NULL. If the pattern was not pre-compiled
 // by RegexpPrepare() it is compiled here; a compile failure adds a warning and returns
 // NULL.
 StringVal StringFunctions::RegexpReplace(FunctionContext* context, const StringVal& str,
     const StringVal& pattern, const StringVal& replace) {
   if (str.is_null || pattern.is_null || replace.is_null) return StringVal::null();

   re2::RE2* regex = reinterpret_cast<re2::RE2*>(
       context->GetFunctionState(FunctionContext::THREAD_LOCAL));
   // Owns the regex only when it had to be compiled for this call.
   scoped_ptr<re2::RE2> local_regex;
   if (regex == NULL) {
     DCHECK(!context->IsArgConstant(1));
     string compile_error;
     regex = CompileRegex(pattern, &compile_error, StringVal::null());
     if (regex == NULL) {
       context->AddWarning(compile_error.c_str());
       return StringVal::null();
     }
     local_regex.reset(regex);
   }

   // GlobalReplace() edits a std::string in place, so work on a copy of the input.
   string replaced = AnyValUtil::ToString(str);
   re2::StringPiece replacement(reinterpret_cast<char*>(replace.ptr), replace.len);
   re2::RE2::GlobalReplace(&replaced, *regex, replacement);
   return AnyValUtil::FromString(context, replaced);
 }

 // Pre-compiles the regex for regexp_count-style functions when the pattern (argument
 // 1) and, in the four-argument form, the match parameter (argument 3) are constant.
 // The compiled RE2 is stored as THREAD_LOCAL function state. Does nothing for other
 // scopes or a NULL pattern. A compile failure sets an error on 'context'.
 void StringFunctions::RegexpMatchCountPrepare(FunctionContext* context,
     FunctionContext::FunctionStateScope scope) {
   if (scope != FunctionContext::THREAD_LOCAL) return;
   const int num_args = context->GetNumArgs();
   DCHECK(num_args == 2 || num_args == 4);
   const bool has_match_parameter = (num_args == 4);
   if (!context->IsArgConstant(1)) return;
   if (has_match_parameter && !context->IsArgConstant(3)) return;

   DCHECK_EQ(context->GetArgType(1)->type, FunctionContext::TYPE_STRING);
   const StringVal* const_pattern =
       reinterpret_cast<StringVal*>(context->GetConstantArg(1));
   if (const_pattern->is_null) return;

   const StringVal* const_flags = NULL;
   if (has_match_parameter) {
     DCHECK_EQ(context->GetArgType(3)->type, FunctionContext::TYPE_STRING);
     const_flags = reinterpret_cast<StringVal*>(context->GetConstantArg(3));
   }
   // Without a match parameter, compile with a NULL one.
   const StringVal flags = (const_flags == NULL) ? StringVal::null() : *const_flags;

   string compile_error;
   re2::RE2* compiled = CompileRegex(*const_pattern, &compile_error, flags);
   if (compiled != NULL) {
     context->SetFunctionState(scope, compiled);
   } else {
     context->SetError(compile_error.c_str());
   }
 }

 // Two-argument form: counts matches of 'pattern' in 'str' with a NULL starting
 // position and a NULL match parameter.
 IntVal StringFunctions::RegexpMatchCount2Args(FunctionContext* context,
     const StringVal& str, const StringVal& pattern) {
   const IntVal no_start_pos = IntVal::null();
   const StringVal no_match_parameter = StringVal::null();
   return RegexpMatchCount4Args(context, str, pattern, no_start_pos, no_match_parameter);
 }

 // Counts the matches of 'pattern' in 'str', starting at the 1-based position
 // 'start_pos' (a NULL 'start_pos' starts at the beginning).
 //  - NULL 'str' or 'pattern' yields NULL.
 //  - A 'start_pos' below 1 sets an error on 'context' and returns NULL.
 //  - If no pre-compiled regex is available in THREAD_LOCAL state, the pattern is
 //    compiled here with 'match_parameter'; a compile failure sets an error and returns
 //    NULL.
 IntVal StringFunctions::RegexpMatchCount4Args(FunctionContext* context,
     const StringVal& str, const StringVal& pattern, const IntVal& start_pos,
     const StringVal& match_parameter) {
   if (str.is_null || pattern.is_null) return IntVal::null();

   int offset = 0;
   DCHECK_GE(str.len, 0);
   // The parameter "start_pos" starts counting at 1 instead of 0. If "start_pos" is
   // beyond the end of the string, "str" will be considered an empty string.
   if (!start_pos.is_null) {
     // Subtract in 64 bits: 'start_pos.val - 1' overflows a 32-bit int for INT_MIN,
     // which would let that value slip past the range check below.
     const int64_t zero_based_start = static_cast<int64_t>(start_pos.val) - 1;
     if (zero_based_start < 0) {
       stringstream error;
       error << "Illegal starting position " << start_pos.val << endl;
       context->SetError(error.str().c_str());
       return IntVal::null();
     }
     offset = static_cast<int>(std::min<int64_t>(zero_based_start, str.len));
   }

   re2::RE2* re = reinterpret_cast<re2::RE2*>(
       context->GetFunctionState(FunctionContext::THREAD_LOCAL));
   // Destroys re if we have to locally compile it.
   scoped_ptr<re2::RE2> scoped_re;
   if (re == NULL) {
     DCHECK(!context->IsArgConstant(1) || (context->GetNumArgs() == 4 &&
         !context->IsArgConstant(3)));
     string error_str;
     re = CompileRegex(pattern, &error_str, match_parameter);
     if (re == NULL) {
       context->SetError(error_str.c_str());
       return IntVal::null();
     }
     scoped_re.reset(re);
   }

   DCHECK_GE(str.len, offset);
   re2::StringPiece str_sp(reinterpret_cast<char*>(str.ptr), str.len);
   int count = 0;
   re2::StringPiece match;
   while (offset <= str.len &&
       re->Match(str_sp, offset, str.len, re2::RE2::UNANCHORED, &match, 1)) {
     // Empty string is a valid match for pattern with '*'. Start matching at the next
     // character until we reach the end of the string.
     count++;
     if (match.size() == 0) {
       if (offset == str.len) {
         break;
       }
       offset++;
     } else {
       // Make sure forward progress is being made or we will be in an infinite loop.
       DCHECK_GT(match.data() - str_sp.data() + match.size(), offset);
       // Resume right after the end of this match.
       offset = match.data() - str_sp.data() + match.size();
     }
   }
   return IntVal(count);
 }

 // Concatenates 'num_children' strings. NULL handling differs from ConcatWs(): here a
 // single NULL argument makes the whole result NULL, which keeps the original concat()
 // behavior. A lone argument is passed through unchanged. If the combined length
 // exceeds StringVal::MAX_LENGTH an error is set on 'context' and NULL is returned.
 StringVal StringFunctions::Concat(
     FunctionContext* context, int num_children, const StringVal* strs) {
   DCHECK_GE(num_children, 1);
   DCHECK(strs != nullptr);
   if (num_children == 1) return strs[0];

   // First pass: bail out on NULL and add up the number of bytes needed.
   const StringVal* const strs_end = strs + num_children;
   int64_t total_size = 0;
   for (const StringVal* cur = strs; cur != strs_end; ++cur) {
     if (cur->is_null) return StringVal::null();
     total_size += cur->len;
   }

   if (total_size > StringVal::MAX_LENGTH) {
     context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
          "Concatenated string length",
          PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
     return StringVal::null();
   }

   // Nothing to copy: return an empty string without allocating.
   if (total_size <= 0) return StringVal();

   StringVal result(context, total_size);
   if (UNLIKELY(result.is_null)) return StringVal::null();

   // Second pass: copy the pieces back to back into the result buffer.
   uint8_t* write_pos = result.ptr;
   for (const StringVal* cur = strs; cur != strs_end; ++cur) {
     Ubsan::MemCpy(write_pos, cur->ptr, cur->len);
     write_pos += cur->len;
   }
   return result;
 }

 // Joins the non-NULL entries of 'strs' with 'sep' between them.
 //  - A NULL 'sep' yields NULL.
 //  - NULL entries of 'strs' are skipped.
 //  - If every entry is NULL, or the result would be empty, an empty string is
 //    returned.
 //  - If exactly one entry is non-NULL it is passed through unchanged.
 //  - If the combined length exceeds StringVal::MAX_LENGTH an error is set on 'context'
 //    and NULL is returned.
 StringVal StringFunctions::ConcatWs(FunctionContext* context, const StringVal& sep,
     int num_children, const StringVal* strs) {
   DCHECK_GE(num_children, 1);
   DCHECK(strs != nullptr);
   if (sep.is_null) return StringVal::null();

   // Loop once to compute valid start index, final string size and valid string object
   // count.
   int32_t valid_num_children = 0;
   int32_t valid_start_index = -1;
   int64_t total_size = 0;
   for (int32_t i = 0; i < num_children; ++i) {
     if (strs[i].is_null) continue;

     if (valid_start_index == -1) {
       valid_start_index = i;
     } else {
       // Every valid string after the first one is preceded by a separator.
       total_size += sep.len;
     }
     // Add the two lengths to the 64-bit total separately: summing them first as
     // 32-bit ints could overflow before the MAX_LENGTH check below.
     total_size += strs[i].len;
     // Record the count of valid string object.
     valid_num_children++;
   }

   if (total_size > StringVal::MAX_LENGTH) {
     context->SetError(Substitute(ERROR_CHARACTER_LIMIT_EXCEEDED,
          "Concatenated string length",
          PrettyPrinter::Print(StringVal::MAX_LENGTH, TUnit::BYTES)).c_str());
     return StringVal::null();
   }

   // If all data are invalid, or data size is zero, return empty string.
   if (valid_start_index < 0 || total_size <= 0) {
     return StringVal();
   }
   DCHECK_GT(valid_num_children, 0);

   // Pass through if there's only one argument.
   if (valid_num_children == 1) return strs[valid_start_index];

   // Reserve space needed by final result.
   StringVal result(context, total_size);
   if (UNLIKELY(result.is_null)) return StringVal::null();

   // Copy the first valid string, then "<sep><string>" for each remaining valid one.
   uint8_t* ptr = result.ptr;
   Ubsan::MemCpy(ptr, strs[valid_start_index].ptr, strs[valid_start_index].len);
   ptr += strs[valid_start_index].len;
   for (int32_t i = valid_start_index + 1; i < num_children; ++i) {
     if (strs[i].is_null) continue;
     Ubsan::MemCpy(ptr, sep.ptr, sep.len);
     ptr += sep.len;
     Ubsan::MemCpy(ptr, strs[i].ptr, strs[i].len);
     ptr += strs[i].len;
   }
   return result;
 }

 // Returns the 1-based index of the first comma-separated token of 'str_set' that
 // equals 'str'. Returns 0 if there is no such token or if 'str' itself contains a
 // comma, and NULL if either argument is NULL.
 IntVal StringFunctions::FindInSet(FunctionContext* context, const StringVal& str,
     const StringVal& str_set) {
   if (str.is_null || str_set.is_null) return IntVal::null();
   // A search string containing a comma can never equal a single token.
   for (int pos = 0; pos < str.len; ++pos) {
     if (str.ptr[pos] == ',') return IntVal(0);
   }

   const StringValue wanted = StringValue::FromStringVal(str);
   char* const set_chars = reinterpret_cast<char*>(str_set.ptr);
   int32_t token_begin = 0;
   // The result index starts from 1 since 0 is an error condition.
   for (int32_t token_index = 1; ; ++token_index) {
     // Extend the token up to the next ',' or the end of the set.
     int32_t token_end = token_begin;
     while (token_end < str_set.len && str_set.ptr[token_end] != ',') ++token_end;
     StringValue token(set_chars + token_begin, token_end - token_begin);
     if (wanted.Eq(token)) return IntVal(token_index);

     // Continue behind the ','; stop once that position is past the end of the set.
     token_begin = token_end + 1;
     if (token_begin >= str_set.len) break;
   }
   return IntVal(0);
 }

 // Resolves a constant, non-NULL URL part name (argument 1) once and stores the
 // resulting UrlParser::UrlPart as FRAGMENT_LOCAL function state. Does nothing for
 // other scopes, for a non-constant part or for a NULL part. An unknown part name sets
 // an error on 'ctx' and stores no state.
 void StringFunctions::ParseUrlPrepare(
     FunctionContext* ctx, FunctionContext::FunctionStateScope scope) {
   if (scope != FunctionContext::FRAGMENT_LOCAL || !ctx->IsArgConstant(1)) return;
   DCHECK_EQ(ctx->GetArgType(1)->type, FunctionContext::TYPE_STRING);
   StringVal* part = reinterpret_cast<StringVal*>(ctx->GetConstantArg(1));
   if (part->is_null) return;

   const UrlParser::UrlPart parsed_part =
       UrlParser::GetUrlPart(StringValue::FromStringVal(*part));
   if (parsed_part == UrlParser::INVALID) {
     stringstream ss;
     ss << "Invalid URL part: " << AnyValUtil::ToString(*part) << endl
        << "(Valid URL parts are 'PROTOCOL', 'HOST', 'PATH', 'REF', 'AUTHORITY', 'FILE', "
        << "'USERINFO', and 'QUERY')";
     ctx->SetError(ss.str().c_str());
     return;
   }
   // Released again by ParseUrlClose().
   ctx->SetFunctionState(scope, new UrlParser::UrlPart(parsed_part));
 }

 // Extracts the component named by 'part' from 'url'. NULL arguments yield NULL. Uses
 // the UrlPart cached by ParseUrlPrepare() when present, otherwise resolves 'part' on
 // this call. When parsing fails a warning is added (invalid part vs. unparsable URL)
 // and NULL is returned.
 StringVal StringFunctions::ParseUrl(
     FunctionContext* ctx, const StringVal& url, const StringVal& part) {
   if (url.is_null || part.is_null) return StringVal::null();

   UrlParser::UrlPart url_part;
   void* cached_part = ctx->GetFunctionState(FunctionContext::FRAGMENT_LOCAL);
   if (cached_part == NULL) {
     DCHECK(!ctx->IsArgConstant(1));
     url_part = UrlParser::GetUrlPart(StringValue::FromStringVal(part));
   } else {
     url_part = *reinterpret_cast<UrlParser::UrlPart*>(cached_part);
   }

   StringValue parsed;
   if (UrlParser::ParseUrl(StringValue::FromStringVal(url), url_part, &parsed)) {
     StringVal parsed_sv;
     parsed.ToStringVal(&parsed_sv);
     return parsed_sv;
   }

   // url is malformed, or url_part is invalid.
   stringstream ss;
   if (url_part == UrlParser::INVALID) {
     ss << "Invalid URL part: " << AnyValUtil::ToString(part);
   } else {
     ss << "Could not parse URL: " << AnyValUtil::ToString(url);
   }
   ctx->AddWarning(ss.str().c_str());
   return StringVal::null();
 }

 // Deletes the UrlPart stored as FRAGMENT_LOCAL function state, if any, and clears the
 // state slot. Calls for any other scope are ignored.
 void StringFunctions::ParseUrlClose(
     FunctionContext* ctx, FunctionContext::FunctionStateScope scope) {
   if (scope == FunctionContext::FRAGMENT_LOCAL) {
     delete reinterpret_cast<UrlParser::UrlPart*>(ctx->GetFunctionState(scope));
     ctx->SetFunctionState(scope, nullptr);
   }
 }

 // Like ParseUrl(), but additionally passes 'key' to UrlParser::ParseUrlKey(). NULL
 // arguments yield NULL. Uses the UrlPart cached by ParseUrlPrepare() when present,
 // otherwise resolves 'part' on this call. When parsing fails a warning is added
 // (invalid part vs. unparsable URL) and NULL is returned.
 StringVal StringFunctions::ParseUrlKey(FunctionContext* ctx, const StringVal& url,
     const StringVal& part, const StringVal& key) {
   if (url.is_null || part.is_null || key.is_null) return StringVal::null();

   UrlParser::UrlPart url_part;
   void* cached_part = ctx->GetFunctionState(FunctionContext::FRAGMENT_LOCAL);
   if (cached_part == NULL) {
     DCHECK(!ctx->IsArgConstant(1));
     url_part = UrlParser::GetUrlPart(StringValue::FromStringVal(part));
   } else {
     url_part = *reinterpret_cast<UrlParser::UrlPart*>(cached_part);
   }

   StringValue parsed;
   if (UrlParser::ParseUrlKey(StringValue::FromStringVal(url), url_part,
                              StringValue::FromStringVal(key), &parsed)) {
     StringVal parsed_sv;
     parsed.ToStringVal(&parsed_sv);
     return parsed_sv;
   }

   // url is malformed, or url_part is invalid.
   stringstream ss;
   if (url_part == UrlParser::INVALID) {
     ss << "Invalid URL part: " << AnyValUtil::ToString(part);
   } else {
     ss << "Could not parse URL: " << AnyValUtil::ToString(url);
   }
   ctx->AddWarning(ss.str().c_str());
   return StringVal::null();
 }

 // Returns a one-byte string holding the byte value 'val'. NULL yields NULL; values
 // outside [0, 255] yield an empty string.
 StringVal StringFunctions::Chr(FunctionContext* ctx, const IntVal& val) {
   if (val.is_null) return StringVal::null();
   const bool fits_in_byte = val.val >= 0 && val.val <= 255;
   if (!fits_in_byte) return "";
   const char byte = static_cast<char>(val.val);
   return AnyValUtil::FromBuffer(ctx, &byte, 1);
 }

 // Similar to strstr() except that the strings are not null-terminated and may contain
 // embedded '\0' bytes.
 // Parameter 'direction' controls the direction of searching, can be either 1 or -1:
 // 1 returns the first occurrence of 'needle' from the left, -1 the first from the
 // right. Returns a pointer to the start of the occurrence inside 'haystack', or
 // nullptr if 'needle' does not occur. 'needle_len' must be positive.
 static char* LocateSubstring(char* haystack, const int hay_len, const char* needle,
     const int needle_len, const int direction = 1) {
   DCHECK_GT(needle_len, 0);
   DCHECK(needle != NULL);
   DCHECK(hay_len == 0 || haystack != NULL);
   DCHECK(direction == 1 || direction == -1);
   if (hay_len < needle_len) return nullptr;
   char* start = haystack;
   if (direction == -1) start += hay_len - needle_len;
   const int num_candidates = hay_len - needle_len + 1;
   for (int i = 0; i < num_candidates; ++i) {
     char* possible_needle = start + direction * i;
     // memcmp() rather than strncmp(): strncmp() stops comparing at the first '\0' it
     // meets in both inputs, so binary data that agrees only up to an embedded NUL
     // byte would be reported as a match.
     if (memcmp(possible_needle, needle, needle_len) == 0) return possible_needle;
   }
   return nullptr;
 }

 // Splits 'str' on 'delim' and returns the 'field'-th part. Positive positions count
 // parts from the left starting at 1, negative positions from the right starting at
 // -1.
 //  - NULL arguments yield NULL.
 //  - Position 0 sets an error on 'context' and returns NULL.
 //  - An empty 'delim' returns 'str' unchanged.
 //  - A position beyond the number of parts returns an empty string.
 // The result points into the buffer of 'str'.
 StringVal StringFunctions::SplitPart(FunctionContext* context,
     const StringVal& str, const StringVal& delim, const BigIntVal& field) {
   if (str.is_null || delim.is_null || field.is_null) return StringVal::null();
   // Keep the full 64-bit value: narrowing to int would turn e.g. 2^32 into 0 (a bogus
   // "invalid position" error) and 2^32 + 1 into 1 (the first part instead of "").
   const int64_t field_pos = field.val;
   if (field_pos == 0) {
     stringstream ss;
     ss << "Invalid field position: " << field.val;
     context->SetError(ss.str().c_str());
     return StringVal::null();
   }
   if (delim.len == 0) return str;
   char* str_start = reinterpret_cast<char*>(str.ptr);
   char* delimiter = reinterpret_cast<char*>(delim.ptr);
   const int DIRECTION = field_pos > 0 ? 1 : -1;
   // [window_start, window_end) is the part of 'str' that has not been consumed yet.
   char* window_start = str_start;
   char* window_end = str_start + str.len;
   for (int64_t cur_pos = DIRECTION; ; cur_pos += DIRECTION) {
     int remaining_len = window_end - window_start;
     char* delim_ref = LocateSubstring(window_start, remaining_len, delimiter, delim.len,
         DIRECTION);
     if (delim_ref == nullptr) {
       // No delimiter left: the remaining window is the last part in this direction.
       if (cur_pos == field_pos) {
         return StringVal(reinterpret_cast<uint8_t*>(window_start), remaining_len);
       }
       // Return empty string if required field position is not found.
       return StringVal();
     }
     if (cur_pos == field_pos) {
       // The requested part lies between the delimiter and the edge of the window.
       if (DIRECTION < 0) {
         window_start = delim_ref + delim.len;
       }
       else {
         window_end = delim_ref;
       }
       return StringVal(reinterpret_cast<uint8_t*>(window_start),
           window_end - window_start);
     }
     // Not there yet: drop the part just passed together with its delimiter.
     if (DIRECTION < 0) {
       window_end = delim_ref;
     } else {
       window_start = delim_ref + delim.len;
     }
   }
   return StringVal();
 }

 // Returns the base64 encoding of 'str'. NULL yields NULL and an empty input yields an
 // empty string. If the output buffer size cannot be computed or the encoding fails, a
 // warning is added to 'ctx' and NULL is returned.
 StringVal StringFunctions::Base64Encode(FunctionContext* ctx, const StringVal& str) {
   if (str.is_null) return StringVal::null();
   if (str.len == 0) return StringVal(ctx, 0);

   int64_t buf_size = 0;
   if (UNLIKELY(!Base64EncodeBufLen(str.len, &buf_size))) {
     stringstream ss;
     ss << "Could not base64 encode a string of length " << str.len;
     ctx->AddWarning(ss.str().c_str());
     return StringVal::null();
   }

   StringVal encoded(ctx, buf_size);
   if (UNLIKELY(encoded.is_null)) return encoded;

   const char* input = reinterpret_cast<const char*>(str.ptr);
   char* output = reinterpret_cast<char*>(encoded.ptr);
   unsigned encoded_len = 0;
   if (UNLIKELY(!impala::Base64Encode(input, str.len, buf_size, output, &encoded_len))) {
     stringstream ss;
     ss << "Could not base64 encode input in space " << buf_size
        << "; actual output length " << encoded_len;
     ctx->AddWarning(ss.str().c_str());
     return StringVal::null();
   }
   // Report the length written by the encoder rather than the buffer size.
   encoded.len = encoded_len;
   return encoded;
 }

 // Returns the bytes obtained by base64-decoding 'str'. NULL yields NULL and an empty
 // input yields an empty string. If the output buffer size cannot be computed or the
 // decoding fails, a warning is added to 'ctx' and NULL is returned.
 StringVal StringFunctions::Base64Decode(FunctionContext* ctx, const StringVal& str) {
   if (str.is_null) return StringVal::null();
   if (str.len == 0) return StringVal(ctx, 0);

   const char* input = reinterpret_cast<const char*>(str.ptr);
   const int64_t input_len = static_cast<int64_t>(str.len);
   int64_t buf_size = 0;
   if (UNLIKELY(!Base64DecodeBufLen(input, input_len, &buf_size))) {
     stringstream ss;
     ss << "Invalid base64 string; input length is " << str.len
        << ", which is not a multiple of 4.";
     ctx->AddWarning(ss.str().c_str());
     return StringVal::null();
   }

   StringVal decoded(ctx, buf_size);
   if (UNLIKELY(decoded.is_null)) return decoded;

   char* output = reinterpret_cast<char*>(decoded.ptr);
   unsigned decoded_len = 0;
   if (UNLIKELY(!impala::Base64Decode(input, input_len, buf_size, output, &decoded_len))) {
     stringstream ss;
     ss << "Could not base64 decode input in space " << buf_size
        << "; actual output length " << decoded_len;
     ctx->AddWarning(ss.str().c_str());
     return StringVal::null();
   }
   // Report the length written by the decoder rather than the buffer size.
   decoded.len = decoded_len;
   return decoded;
 }

 // Thin entry point that forwards its arguments unchanged to GetJsonObjectImpl() and
 // returns its result.
 StringVal StringFunctions::GetJsonObject(FunctionContext *ctx, const StringVal &json_str,
     const StringVal &path_str) {
   StringVal extracted = GetJsonObjectImpl(ctx, json_str, path_str);
   return extracted;
 }

 // Computes the Levenshtein (edit) distance between 's1' and 's2', comparing byte by
 // byte and keeping a single column of the distance matrix in memory.
 //  - An input longer than 255 bytes sets an error on 'ctx' and returns -1.
 //  - A NULL input yields NULL.
 // Adapted from https://bit.ly/2SbDgN4
 // under the Creative Commons Attribution-ShareAlike License
 IntVal StringFunctions::Levenshtein(
     FunctionContext* ctx, const StringVal& s1, const StringVal& s2) {
   const int len1 = s1.len;
   const int len2 = s2.len;

   // error if either input exceeds 255 characters
   if (len1 > 255 || len2 > 255) {
     ctx->SetError("levenshtein argument exceeds maximum length of 255 characters");
     return IntVal(-1);
   }

   // Shortcuts: NULL inputs, an empty input, and byte-identical inputs.
   if (s1.is_null || s2.is_null) return IntVal::null();
   if (len1 == 0) return IntVal(len2);
   if (len2 == 0) return IntVal(len1);
   if (len1 == len2 && memcmp(s1.ptr, s2.ptr, len1) == 0) return IntVal(0);

   // dist[y] holds the distance between the first y bytes of 's1' and the prefix of
   // 's2' processed so far.
   int* dist = reinterpret_cast<int*>(ctx->Allocate(sizeof(int) * (len1 + 1)));
   if (UNLIKELY(dist == nullptr)) {
     DCHECK(!ctx->impl()->state()->GetQueryStatus().ok());
     return IntVal::null();
   }

   // Against an empty prefix of 's2' the distance is the number of bytes to delete.
   for (int y = 0; y <= len1; ++y) dist[y] = y;

   for (int x = 1; x <= len2; ++x) {
     // 'diagonal' is the value dist[y - 1] had before this column was started.
     int diagonal = x - 1;
     dist[0] = x;
     for (int y = 1; y <= len1; ++y) {
       const int previous = dist[y];
       const int substitution = diagonal + (s1.ptr[y - 1] == s2.ptr[x - 1] ? 0 : 1);
       dist[y] = std::min(std::min(dist[y] + 1, dist[y - 1] + 1), substitution);
       diagonal = previous;
     }
   }
   const int distance = dist[len1];
   ctx->Free(reinterpret_cast<uint8_t*>(dist));

   return IntVal(distance);
 }

 // Based on https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
 // Computes the Jaro similarity of 's1' and 's2':
 //   1/3 * (m / len(s1) + m / len(s2) + (m - t) / m)
 // where m is the number of matched characters and t is half the number of matched
 // characters that line up with a different character in the other string.
 // Sets an error on 'ctx' and returns -1.0 if either input is longer than 255. Returns
 // NULL if either input is NULL or if a scratch allocation fails.
 DoubleVal StringFunctions::JaroSimilarity(
     FunctionContext* ctx, const StringVal& s1, const StringVal& s2) {
   constexpr int UNMATCHED = -1;
   const int len1 = s1.len;
   const int len2 = s2.len;

   // Over-long inputs are reported as an error; this check runs before the NULL check.
   if (len1 > 255 || len2 > 255) {
     ctx->SetError("jaro argument exceeds maximum length of 255 characters");
     return DoubleVal(-1.0);
   }

   // Results that need no scratch memory: NULL input, at least one empty string (two
   // empty strings count as identical) and byte-wise identical strings.
   if (s1.is_null || s2.is_null) return DoubleVal::null();
   if (len1 == 0 || len2 == 0) return DoubleVal(len1 == len2 ? 1.0 : 0.0);
   if (len1 == len2 && memcmp(s1.ptr, s2.ptr, len1) == 0) return DoubleVal(1.0);

   // Two characters can only be matched if their positions differ by at most 'window'.
   const int window = std::max(0, std::max(len1, len2) / 2 - 1);

   // matched1[i] / matched2[j] stay UNMATCHED until the character at that position has
   // been paired with a character of the other string.
   int* matched1 = reinterpret_cast<int*>(ctx->Allocate(sizeof(int) * len1));
   if (UNLIKELY(matched1 == nullptr)) {
     DCHECK(!ctx->impl()->state()->GetQueryStatus().ok());
     return DoubleVal::null();
   }
   int* matched2 = reinterpret_cast<int*>(ctx->Allocate(sizeof(int) * len2));
   if (UNLIKELY(matched2 == nullptr)) {
     ctx->Free(reinterpret_cast<uint8_t*>(matched1));
     DCHECK(!ctx->impl()->state()->GetQueryStatus().ok());
     return DoubleVal::null();
   }
   std::fill_n(matched1, len1, UNMATCHED);
   std::fill_n(matched2, len2, UNMATCHED);

   // Pair every character of 's1' with the first equal, still unpaired character of 's2'
   // inside its window.
   int num_matches = 0;
   for (int pos1 = 0; pos1 < len1; ++pos1) {
     const int window_begin = std::max(pos1 - window, 0);
     const int window_end = std::min(pos1 + window + 1, len2);
     // The window has moved past the end of 's2'; no later position can match either.
     if (window_begin >= window_end) break;

     int pos2 = window_begin;
     while (pos2 < window_end
         && (matched2[pos2] != UNMATCHED || s1.ptr[pos1] != s2.ptr[pos2])) {
       ++pos2;
     }
     if (pos2 < window_end) {
       matched1[pos1] = pos1;
       matched2[pos2] = pos2;
       ++num_matches;
     }
   }

   // Without any match the similarity is 0.
   double similarity = 0.0;
   if (num_matches > 0) {
     // Walk the matched characters of both strings in order; the k-th matched character
     // of 's1' is compared with the k-th matched character of 's2'.
     int out_of_order = 0;
     int pos1 = 0;
     int pos2 = 0;
     for (int k = 0; k < num_matches; ++k, ++pos1, ++pos2) {
       while (matched1[pos1] == UNMATCHED) ++pos1;
       while (matched2[pos2] == UNMATCHED) ++pos2;
       if (s1.ptr[pos1] != s2.ptr[pos2]) ++out_of_order;
     }
     // Each mismatching pair counts as half a transposition.
     const double transpositions = 0.5 * out_of_order;
     const double m = static_cast<double>(num_matches);
     const double share_of_s1 = m / static_cast<double>(len1);
     const double share_of_s2 = m / static_cast<double>(len2);
     const double share_in_order = (m - transpositions) / m;
     similarity = 1.0 / 3.0 * (share_of_s1 + share_of_s2 + share_in_order);
   }

   ctx->Free(reinterpret_cast<uint8_t*>(matched1));
   ctx->Free(reinterpret_cast<uint8_t*>(matched2));
   return DoubleVal(similarity);
 }

 // Jaro distance of 's1' and 's2', i.e. 1 - JaroSimilarity(). A NULL similarity and
 // the -1.0 value that JaroSimilarity() returns after setting an error are passed
 // through unchanged.
 DoubleVal StringFunctions::JaroDistance(
     FunctionContext* ctx, const StringVal& s1, const StringVal& s2) {
   const DoubleVal similarity = JaroSimilarity(ctx, s1, s2);
   if (similarity.is_null) return DoubleVal::null();

   const bool failed = similarity.val == -1.0;
   return DoubleVal(failed ? -1.0 : 1.0 - similarity.val);
 }

 // Jaro-Winkler distance with the default scaling factor (0.1) and the default boost
 // threshold (0.7).
 DoubleVal StringFunctions::JaroWinklerDistance(FunctionContext* ctx,
       const StringVal& s1, const StringVal& s2) {
   const DoubleVal default_scaling_factor(0.1);
   const DoubleVal default_boost_threshold(0.7);
   return JaroWinklerDistance(
       ctx, s1, s2, default_scaling_factor, default_boost_threshold);
 }

 // Jaro-Winkler distance with a caller-supplied scaling factor and the default boost
 // threshold (0.7).
 DoubleVal StringFunctions::JaroWinklerDistance(FunctionContext* ctx,
       const StringVal& s1, const StringVal& s2,
       const DoubleVal& scaling_factor) {
   const DoubleVal default_boost_threshold(0.7);
   return JaroWinklerDistance(ctx, s1, s2, scaling_factor, default_boost_threshold);
 }

 // Based on https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
 // Jaro-Winkler distance, computed as 1 - JaroWinklerSimilarity().
 // Extended with boost_threshold: Winkler's modification only applies if the Jaro
 // similarity exceeds it.
 // A NULL similarity and the -1.0 value that JaroWinklerSimilarity() returns after
 // setting an error are passed through unchanged.
 DoubleVal StringFunctions::JaroWinklerDistance(FunctionContext* ctx,
       const StringVal& s1, const StringVal& s2,
       const DoubleVal& scaling_factor, const DoubleVal& boost_threshold) {
   const DoubleVal similarity =
       JaroWinklerSimilarity(ctx, s1, s2, scaling_factor, boost_threshold);
   if (similarity.is_null) return DoubleVal::null();

   const bool failed = similarity.val == -1.0;
   return DoubleVal(failed ? -1.0 : 1.0 - similarity.val);
 }

 // Jaro-Winkler similarity with the default scaling factor (0.1) and the default boost
 // threshold (0.7).
 DoubleVal StringFunctions::JaroWinklerSimilarity(FunctionContext* ctx,
       const StringVal& s1, const StringVal& s2) {
   const DoubleVal default_scaling_factor(0.1);
   const DoubleVal default_boost_threshold(0.7);
   return JaroWinklerSimilarity(
       ctx, s1, s2, default_scaling_factor, default_boost_threshold);
 }

 // Jaro-Winkler similarity with a caller-supplied scaling factor and the default boost
 // threshold (0.7).
 DoubleVal StringFunctions::JaroWinklerSimilarity(FunctionContext* ctx,
       const StringVal& s1, const StringVal& s2,
       const DoubleVal& scaling_factor) {
   const DoubleVal default_boost_threshold(0.7);
   return JaroWinklerSimilarity(ctx, s1, s2, scaling_factor, default_boost_threshold);
 }

 // Based on https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
 // Implements Jaro-Winkler similarity: the Jaro similarity of 's1' and 's2', increased
 // by common_prefix * scaling_factor * (1 - jaro), where common_prefix is the length of
 // the prefix shared by both strings, capped at MAX_PREFIX_LENGTH.
 // Extended with boost_threshold: Winkler's modification only applies if the Jaro
 // similarity exceeds it.
 // Sets an error on 'ctx' and returns -1.0 if either string is longer than 255, if
 // 'scaling_factor' is not within [0.0, 0.25] or if 'boost_threshold' is not within
 // [0.0, 1.0]; NaN counts as out of range for both. Returns NULL if 's1' or 's2' is
 // NULL or if JaroSimilarity() returns NULL.
 // NOTE(review): 'scaling_factor' and 'boost_threshold' are not checked for NULL, only
 // their 'val' is read - confirm that this is intended.
 DoubleVal StringFunctions::JaroWinklerSimilarity(FunctionContext* ctx,
       const StringVal& s1, const StringVal& s2,
       const DoubleVal& scaling_factor, const DoubleVal& boost_threshold) {

   constexpr int MAX_PREFIX_LENGTH = 4;
   const int s1len = s1.len;
   const int s2len = s2.len;

   // error if either input exceeds 255 characters
   if (s1len > 255 || s2len > 255) {
     ctx->SetError("jaro-winkler argument exceeds maximum length of 255 characters");
     return DoubleVal(-1.0);
   }
   // The two range checks below are written as negated "in range" tests: every ordered
   // comparison with NaN is false, so 'x < lo || x > hi' would let NaN through.
   // scaling factor has to be between 0.0 and 0.25
   if (!(scaling_factor.val >= 0.0 && scaling_factor.val <= 0.25)) {
     ctx->SetError("jaro-winkler scaling factor values can range between 0.0 and 0.25");
     return DoubleVal(-1.0);
   }
   // error if boost threshold is out of range 0.0..1.0
   if (!(boost_threshold.val >= 0.0 && boost_threshold.val <= 1.0)) {
     ctx->SetError("jaro-winkler boost threshold values can range between 0.0 and 1.0");
     return DoubleVal(-1.0);
   }

   if (s1.is_null || s2.is_null) return DoubleVal::null();

   const DoubleVal jaro_similarity = StringFunctions::JaroSimilarity(ctx, s1, s2);
   if (jaro_similarity.is_null) return DoubleVal::null();
   // -1.0 is what JaroSimilarity() returns after setting an error.
   if (jaro_similarity.val == -1.0) return DoubleVal(-1.0);

   double jaro_winkler_similarity = jaro_similarity.val;

   // Winkler's prefix bonus is only granted above the boost threshold.
   if (jaro_similarity.val > boost_threshold.val) {
     const int common_length = std::min(MAX_PREFIX_LENGTH, std::min(s1len, s2len));
     int common_prefix = 0;
     while (common_prefix < common_length &&
            s1.ptr[common_prefix] == s2.ptr[common_prefix]) {
       common_prefix++;
     }

     jaro_winkler_similarity += common_prefix * scaling_factor.val *
       (1.0 - jaro_similarity.val);
   }
   return DoubleVal(jaro_winkler_similarity);
 }

 // Restricted Damerau-Levenshtein distance (optimal string alignment) of 's1' and 's2',
 // based on https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
 // Sets an error on 'ctx' and returns -1 if either input is longer than 255. Returns
 // NULL if either input is NULL or if a scratch allocation fails.
 IntVal StringFunctions::DamerauLevenshtein(
     FunctionContext* ctx, const StringVal& s1, const StringVal& s2) {
   const int len1 = s1.len;
   const int len2 = s2.len;

   // Over-long inputs are reported as an error; this check runs before the NULL check.
   if (len1 > 255 || len2 > 255) {
     ctx->SetError("damerau-levenshtein argument exceeds maximum length of 255 "
                   "characters");
     return IntVal(-1);
   }

   // Results that need no scratch memory: NULL input, an empty string (the distance is
   // the length of the other string) and byte-wise identical strings.
   if (s1.is_null || s2.is_null) return IntVal::null();
   if (len1 == 0) return IntVal(len2);
   if (len2 == 0) return IntVal(len1);
   if (len1 == len2 && memcmp(s1.ptr, s2.ptr, len1) == 0) return IntVal(0);

   // The distance table has (len1 + 1) rows of (len2 + 1) cells. 'cells' holds all
   // rows back to back and 'dist' points at the start of each row, so that dist[i][j]
   // is the distance between the first i characters of 's1' and the first j characters
   // of 's2'.
   const int num_rows = len1 + 1;
   const int num_cols = len2 + 1;
   const int row_index_bytes = sizeof(int*) * num_rows;
   const int cell_bytes = sizeof(int) * num_cols * num_rows;

   int** dist = reinterpret_cast<int**>(ctx->Allocate(row_index_bytes));
   if (UNLIKELY(dist == nullptr)) {
     DCHECK(!ctx->impl()->state()->GetQueryStatus().ok());
     return IntVal::null();
   }
   int* cells = reinterpret_cast<int*>(ctx->Allocate(cell_bytes));
   if (UNLIKELY(cells == nullptr)) {
     ctx->Free(reinterpret_cast<uint8_t*>(dist));
     DCHECK(!ctx->impl()->state()->GetQueryStatus().ok());
     return IntVal::null();
   }

   // First column: turning a prefix of 's1' into the empty string takes i deletions.
   for (int i = 0; i < num_rows; ++i) {
     dist[i] = cells + num_cols * i;
     dist[i][0] = i;
   }
   // First row: turning the empty string into a prefix of 's2' takes j insertions.
   for (int j = 0; j < num_cols; ++j) dist[0][j] = j;

   for (int i = 1; i < num_rows; ++i) {
     for (int j = 1; j < num_cols; ++j) {
       const int cost = (s1.ptr[i - 1] == s2.ptr[j - 1]) ? 0 : 1;

       int best = dist[i - 1][j - 1] + cost;        // substitution
       best = std::min(best, dist[i][j - 1] + 1);   // insertion
       best = std::min(best, dist[i - 1][j] + 1);   // deletion

       // The last two characters of both prefixes are swapped versions of each other.
       const bool swapped = i > 1 && j > 1 && s1.ptr[i - 1] == s2.ptr[j - 2]
           && s1.ptr[i - 2] == s2.ptr[j - 1];
       if (swapped) best = std::min(best, dist[i - 2][j - 2] + cost);  // transposition

       dist[i][j] = best;
     }
   }
   const int distance = dist[len1][len2];

   ctx->Free(reinterpret_cast<uint8_t*>(dist));
   ctx->Free(reinterpret_cast<uint8_t*>(cells));
   return IntVal(distance);
 }

 // Formats 'int_val' with PrettyPrinter::Print() for the given 'unit' and copies the
 // text into a StringVal constructed with 'context'. Returns NULL if 'int_val' is NULL
 // or if the constructed StringVal is NULL.
 template <typename T>
 static StringVal prettyPrint(FunctionContext* context, const T& int_val,
     const TUnit::type& unit) {
   if (int_val.is_null) return StringVal::null();

   const string formatted = PrettyPrinter::Print(int_val.val, unit);
   const size_t num_bytes = formatted.size();

   StringVal output(context, num_bytes);
   if (UNLIKELY(output.is_null)) return StringVal::null();

   memcpy(output.ptr, formatted.data(), num_bytes);
   return output;
 }

 // Formats the BigIntVal 'bytes' through prettyPrint() using the BYTES unit.
 StringVal StringFunctions::PrettyPrintMemory(FunctionContext* context,
     const BigIntVal& bytes) {
   return prettyPrint<BigIntVal>(context, bytes, TUnit::BYTES);
 }

 // Formats the IntVal 'bytes' through prettyPrint() using the BYTES unit.
 StringVal StringFunctions::PrettyPrintMemory(FunctionContext* context,
     const IntVal& bytes) {
   return prettyPrint<IntVal>(context, bytes, TUnit::BYTES);
 }

 // Formats the SmallIntVal 'bytes' through prettyPrint() using the BYTES unit.
 StringVal StringFunctions::PrettyPrintMemory(FunctionContext* context,
     const SmallIntVal& bytes) {
   return prettyPrint<SmallIntVal>(context, bytes, TUnit::BYTES);
 }

 // Formats the TinyIntVal 'bytes' through prettyPrint() using the BYTES unit.
 StringVal StringFunctions::PrettyPrintMemory(FunctionContext* context,
     const TinyIntVal& bytes) {
   return prettyPrint<TinyIntVal>(context, bytes, TUnit::BYTES);
 }

 }