cpp/src/gandiva/gdv_string_function_stubs.cc - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "gandiva/gdv_function_stubs.h"

 #include <utf8proc.h>
 #include <string>
 #include <string_view>
 #include <unordered_map>
 #include <vector>

 #include "arrow/util/double_conversion.h"
 #include "arrow/util/utf8_internal.h"
 #include "arrow/util/value_parsing.h"

 #include "gandiva/engine.h"
 #include "gandiva/exported_funcs.h"
 #include "gandiva/formatting_utils.h"
 #include "gandiva/precompiled/types.h"
 #include "gandiva/regex_functions_holder.h"

 extern "C" {

 bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
                            const char* pattern, int pattern_len) {
   gandiva::LikeHolder* holder = reinterpret_cast<gandiva::LikeHolder*>(ptr);
   return (*holder)(std::string(data, data_len));
 }

 bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len,
                                 const char* pattern, int pattern_len,
                                 const char* escape_char, int escape_char_len) {
   gandiva::LikeHolder* holder = reinterpret_cast<gandiva::LikeHolder*>(ptr);
   return (*holder)(std::string(data, data_len));
 }

 bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len,
                             const char* pattern, int pattern_len) {
   gandiva::LikeHolder* holder = reinterpret_cast<gandiva::LikeHolder*>(ptr);
   return (*holder)(std::string(data, data_len));
 }

 const char* gdv_fn_regexp_replace_utf8_utf8(
     int64_t ptr, int64_t holder_ptr, const char* data, int32_t data_len,
     const char* /*pattern*/, int32_t /*pattern_len*/, const char* replace_string,
     int32_t replace_string_len, int32_t* out_length) {
   gandiva::ExecutionContext* context = reinterpret_cast<gandiva::ExecutionContext*>(ptr);

   gandiva::ReplaceHolder* holder = reinterpret_cast<gandiva::ReplaceHolder*>(holder_ptr);

   return (*holder)(context, data, data_len, replace_string, replace_string_len,
                    out_length);
 }

 const char* gdv_fn_regexp_extract_utf8_utf8_int32(int64_t ptr, int64_t holder_ptr,
                                                   const char* data, int32_t data_len,
                                                   const char* /*pattern*/,
                                                   int32_t /*pattern_len*/,
                                                   int32_t extract_index,
                                                   int32_t* out_length) {
   gandiva::ExecutionContext* context = reinterpret_cast<gandiva::ExecutionContext*>(ptr);

   gandiva::ExtractHolder* holder = reinterpret_cast<gandiva::ExtractHolder*>(holder_ptr);

   return (*holder)(context, data, data_len, extract_index, out_length);
 }

 #define GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(IN_TYPE, CAST_NAME, ARROW_TYPE)         \
   GANDIVA_EXPORT                                                                  \
   const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(                         \
       int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) {     \
     if (len < 0) {                                                                \
       gdv_fn_context_set_error_msg(context, "Buffer length cannot be negative");  \
       *out_len = 0;                                                               \
       return "";                                                                  \
     }                                                                             \
     if (len == 0) {                                                               \
       *out_len = 0;                                                               \
       return "";                                                                  \
     }                                                                             \
     arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter;                \
     char* ret = reinterpret_cast<char*>(                                          \
         gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len)));         \
     if (ret == nullptr) {                                                         \
       gdv_fn_context_set_error_msg(context, "Could not allocate memory");         \
       *out_len = 0;                                                               \
       return "";                                                                  \
     }                                                                             \
     arrow::Status status = formatter(value, [&](std::string_view v) {             \
       int64_t size = static_cast<int64_t>(v.size());                              \
       *out_len = static_cast<int32_t>(len < size ? len : size);                   \
       memcpy(ret, v.data(), *out_len);                                            \
       return arrow::Status::OK();                                                 \
     });                                                                           \
     if (!status.ok()) {                                                           \
       std::string err = "Could not cast " + std::to_string(value) + " to string"; \
       gdv_fn_context_set_error_msg(context, err.c_str());                         \
       *out_len = 0;                                                               \
       return "";                                                                  \
     }                                                                             \
     return ret;                                                                   \
   }

 #define GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(IN_TYPE, CAST_NAME, ARROW_TYPE)         \
   GANDIVA_EXPORT                                                                  \
   const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64(                         \
       int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) {     \
     if (len < 0) {                                                                \
       gdv_fn_context_set_error_msg(context, "Buffer length cannot be negative");  \
       *out_len = 0;                                                               \
       return "";                                                                  \
     }                                                                             \
     if (len == 0) {                                                               \
       *out_len = 0;                                                               \
       return "";                                                                  \
     }                                                                             \
     gandiva::GdvStringFormatter<arrow::ARROW_TYPE> formatter;                     \
     char* ret = reinterpret_cast<char*>(                                          \
         gdv_fn_context_arena_malloc(context, static_cast<int32_t>(len)));         \
     if (ret == nullptr) {                                                         \
       gdv_fn_context_set_error_msg(context, "Could not allocate memory");         \
       *out_len = 0;                                                               \
       return "";                                                                  \
     }                                                                             \
     arrow::Status status = formatter(value, [&](std::string_view v) {             \
       int64_t size = static_cast<int64_t>(v.size());                              \
       *out_len = static_cast<int32_t>(len < size ? len : size);                   \
       memcpy(ret, v.data(), *out_len);                                            \
       return arrow::Status::OK();                                                 \
     });                                                                           \
     if (!status.ok()) {                                                           \
       std::string err = "Could not cast " + std::to_string(value) + " to string"; \
       gdv_fn_context_set_error_msg(context, err.c_str());                         \
       *out_len = 0;                                                               \
       return "";                                                                  \
     }                                                                             \
     return ret;                                                                   \
   }

 #define CAST_VARLEN_TYPE_FROM_NUMERIC(VARLEN_TYPE)                   \
   GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(int32, VARLEN_TYPE, Int32Type)   \
   GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(int64, VARLEN_TYPE, Int64Type)   \
   GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(date64, VARLEN_TYPE, Date64Type) \
   GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float32, VARLEN_TYPE, FloatType) \
   GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float64, VARLEN_TYPE, DoubleType)

 CAST_VARLEN_TYPE_FROM_NUMERIC(VARCHAR)
 CAST_VARLEN_TYPE_FROM_NUMERIC(VARBINARY)

 #undef CAST_VARLEN_TYPE_FROM_NUMERIC
 #undef GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE
 #undef GDV_FN_CAST_VARLEN_TYPE_FROM_REAL

 GDV_FORCE_INLINE
 void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val) {
   char const* fmt = "unexpected byte \\%02hhx encountered while decoding utf8 string";
   int size = static_cast<int>(strlen(fmt)) + 64;
   char* error = reinterpret_cast<char*>(malloc(size));
   snprintf(error, size, fmt, (unsigned char)val);
   gdv_fn_context_set_error_msg(execution_context, error);
   free(error);
 }

 GDV_FORCE_INLINE
 int32_t gdv_fn_utf8_char_length(char c) {
   if ((signed char)c >= 0) {  // 1-byte char (0x00 ~ 0x7F)
     return 1;
   } else if ((c & 0xE0) == 0xC0) {  // 2-byte char
     return 2;
   } else if ((c & 0xF0) == 0xE0) {  // 3-byte char
     return 3;
   } else if ((c & 0xF8) == 0xF0) {  // 4-byte char
     return 4;
   }
   // invalid char
   return 0;
 }

 // Convert an utf8 string to its corresponding lowercase string
 GANDIVA_EXPORT
 const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len,
                               int32_t* out_len) {
   if (data_len == 0) {
     *out_len = 0;
     return "";
   }

   // If it is a single-byte character (ASCII), corresponding lowercase is always 1-byte
   // long; if it is >= 2 bytes long, lowercase can be at most 4 bytes long, so length of
   // the output can be at most twice the length of the input
   char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
   if (out == nullptr) {
     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
     *out_len = 0;
     return "";
   }

   int32_t char_len, out_char_len, out_idx = 0;
   uint32_t char_codepoint;

   for (int32_t i = 0; i < data_len; i += char_len) {
     char_len = gdv_fn_utf8_char_length(data[i]);
     // For single byte characters:
     // If it is an uppercase ASCII character, set the output to its corresponding
     // lowercase character; else, set the output to the read character
     if (char_len == 1) {
       char cur = data[i];
       // 'A' - 'Z' : 0x41 - 0x5a
       // 'a' - 'z' : 0x61 - 0x7a
       if (cur >= 0x41 && cur <= 0x5a) {
         out[out_idx++] = static_cast<char>(cur + 0x20);
       } else {
         out[out_idx++] = cur;
       }
       continue;
     }

     // Control reaches here when we encounter a multibyte character
     const auto* in_char = (const uint8_t*)(data + i);

     // Decode the multibyte character
     bool is_valid_utf8_char =
         arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);

     // If it is an invalid utf8 character, UTF8Decode evaluates to false
     if (!is_valid_utf8_char) {
       gdv_fn_set_error_for_invalid_utf8(context, data[i]);
       *out_len = 0;
       return "";
     }

     // Convert the encoded codepoint to its lowercase codepoint
     int32_t lower_codepoint = utf8proc_tolower(char_codepoint);

     // UTF8Encode advances the pointer by the number of bytes present in the lowercase
     // character
     auto* out_char = (uint8_t*)(out + out_idx);
     uint8_t* out_char_start = out_char;

     // Encode the lowercase character
     out_char = arrow::util::UTF8Encode(out_char, lower_codepoint);

     out_char_len = static_cast<int32_t>(out_char - out_char_start);
     out_idx += out_char_len;
   }

   *out_len = out_idx;
   return out;
 }

 // Convert an utf8 string to its corresponding uppercase string
 GANDIVA_EXPORT
 const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_len,
                               int32_t* out_len) {
   if (data_len == 0) {
     *out_len = 0;
     return "";
   }

   // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte
   // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of
   // the output can be at most twice the length of the input
   char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
   if (out == nullptr) {
     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
     *out_len = 0;
     return "";
   }

   int32_t char_len, out_char_len, out_idx = 0;
   uint32_t char_codepoint;

   for (int32_t i = 0; i < data_len; i += char_len) {
     char_len = gdv_fn_utf8_char_length(data[i]);
     // For single byte characters:
     // If it is a lowercase ASCII character, set the output to its corresponding uppercase
     // character; else, set the output to the read character
     if (char_len == 1) {
       char cur = data[i];
       // 'A' - 'Z' : 0x41 - 0x5a
       // 'a' - 'z' : 0x61 - 0x7a
       if (cur >= 0x61 && cur <= 0x7a) {
         out[out_idx++] = static_cast<char>(cur - 0x20);
       } else {
         out[out_idx++] = cur;
       }
       continue;
     }

     // Control reaches here when we encounter a multibyte character
     const auto* in_char = (const uint8_t*)(data + i);

     // Decode the multibyte character
     bool is_valid_utf8_char =
         arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);

     // If it is an invalid utf8 character, UTF8Decode evaluates to false
     if (!is_valid_utf8_char) {
       gdv_fn_set_error_for_invalid_utf8(context, data[i]);
       *out_len = 0;
       return "";
     }

     // Convert the encoded codepoint to its uppercase codepoint
     int32_t upper_codepoint = utf8proc_toupper(char_codepoint);

     // UTF8Encode advances the pointer by the number of bytes present in the uppercase
     // character
     auto* out_char = (uint8_t*)(out + out_idx);
     uint8_t* out_char_start = out_char;

     // Encode the uppercase character
     out_char = arrow::util::UTF8Encode(out_char, upper_codepoint);

     out_char_len = static_cast<int32_t>(out_char - out_char_start);
     out_idx += out_char_len;
   }

   *out_len = out_idx;
   return out;
 }

 // Substring_index
 GDV_FORCE_INLINE
 const char* gdv_fn_substring_index(int64_t context, const char* txt, int32_t txt_len,
                                    const char* pat, int32_t pat_len, int32_t cnt,
                                    int32_t* out_len) {
   if (txt_len == 0 || pat_len == 0 || cnt == 0) {
     *out_len = 0;
     return "";
   }

   char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, txt_len));
   if (out == nullptr) {
     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
     *out_len = 0;
     return "";
   }

   std::vector<int> lps(pat_len);
   int len = 0;

   lps[0] = 0;  // lps[0] is always 0

   // the loop calculates lps[i] for i = 1 to M-1
   int i = 1;
   while (i < pat_len) {
     if (pat[i] == pat[len]) {
       len++;
       lps[i] = len;
       i++;
     } else {
       // (pat[i] != pat[len])
       // This is tricky. Consider the example.
       // AAACAAAA and i = 7. The idea is similar
       // to search step.
       if (len != 0) {
         len = lps[len - 1];

         // Also, note that we do not increment
         // i here
       } else {
         // if (len == 0)
         lps[i] = 0;
         i++;
       }
     }
   }

   std::vector<int> occ;

   i = 0;      // index for txt[]
   int j = 0;  // index for pat[]
   while (i < txt_len) {
     if (pat[j] == txt[i]) {
       j++;
       i++;
     }

     if (j == pat_len) {
       occ.push_back(i - j);
       j = lps[j - 1];
     } else if (i < txt_len && pat[j] != txt[i]) {
       // mismatch after j matches
       // Do not match lps[0..lps[j-1]] characters,
       // they will match anyway
       if (j != 0)
         j = lps[j - 1];
       else
         i = i + 1;
     }
   }

   if (static_cast<int32_t>(abs(cnt)) <= static_cast<int32_t>(occ.size()) && cnt > 0) {
     memcpy(out, txt, occ[cnt - 1]);
     *out_len = occ[cnt - 1];
     return out;
   } else if (static_cast<int32_t>(abs(cnt)) <= static_cast<int32_t>(occ.size()) &&
              cnt < 0) {
     int32_t sz = static_cast<int32_t>(occ.size());
     int32_t temp = static_cast<int32_t>(abs(cnt));

     memcpy(out, txt + occ[sz - temp] + pat_len, txt_len - occ[sz - temp] - pat_len);
     *out_len = txt_len - occ[sz - temp] - pat_len;
     return out;

   } else {
     *out_len = txt_len;
     memcpy(out, txt, txt_len);
     return out;
   }
 }

 // Any codepoint, except the ones for lowercase letters, uppercase letters,
 // titlecase letters, decimal digits and letter numbers categories will be
 // considered as word separators.
 //
 // The Unicode characters also are divided between categories. This link
 // https://www.compart.com/en/unicode/category shows
 // more information about characters categories.
 GDV_FORCE_INLINE
 bool gdv_fn_is_codepoint_for_space(uint32_t val) {
   auto category = utf8proc_category(val);

   return category != utf8proc_category_t::UTF8PROC_CATEGORY_LU &&
          category != utf8proc_category_t::UTF8PROC_CATEGORY_LL &&
          category != utf8proc_category_t::UTF8PROC_CATEGORY_LT &&
          category != utf8proc_category_t::UTF8PROC_CATEGORY_NL &&
          category != utf8proc_category_t ::UTF8PROC_CATEGORY_ND;
 }

 // For a given text, initialize the first letter after a word-separator and lowercase
 // the others e.g:
 //     - "IT is a tEXt str" -> "It Is A Text Str"
 GANDIVA_EXPORT
 const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
                                 int32_t* out_len) {
   if (data_len == 0) {
     *out_len = data_len;
     return "";
   }

   // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte
   // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of
   // the output can be at most twice the length of the input
   char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
   if (out == nullptr) {
     gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
     *out_len = 0;
     return "";
   }

   int32_t char_len = 0;
   int32_t out_char_len = 0;
   int32_t out_idx = 0;
   uint32_t char_codepoint;

   // Any character is considered as space, except if it is alphanumeric
   bool last_char_was_space = true;

   for (int32_t i = 0; i < data_len; i += char_len) {
     // An optimization for single byte characters:
     if (static_cast<signed char>(data[i]) >= 0) {  // 1-byte char (0x00 ~ 0x7F)
       char_len = 1;
       char cur = data[i];

       if (cur >= 0x61 && cur <= 0x7a && last_char_was_space) {
         // Check if the character is the first one of the word and it is
         // lowercase -> 'a' - 'z' : 0x61 - 0x7a.
         // Then turn it into uppercase -> 'A' - 'Z' : 0x41 - 0x5a
         out[out_idx++] = static_cast<char>(cur - 0x20);
         last_char_was_space = false;
       } else if (cur >= 0x41 && cur <= 0x5a && !last_char_was_space) {
         out[out_idx++] = static_cast<char>(cur + 0x20);
       } else {
         // Check if the ASCII character is not an alphanumeric character:
         // '0' - '9': 0x30 - 0x39
         // 'a' - 'z' : 0x61 - 0x7a
         // 'A' - 'Z' : 0x41 - 0x5a
         last_char_was_space = (cur < 0x30) || (cur > 0x39 && cur < 0x41) ||
                               (cur > 0x5a && cur < 0x61) || (cur > 0x7a);
         out[out_idx++] = cur;
       }
       continue;
     }

     char_len = gdv_fn_utf8_char_length(data[i]);

     // Control reaches here when we encounter a multibyte character
     const auto* in_char = (const uint8_t*)(data + i);

     // Decode the multibyte character
     bool is_valid_utf8_char =
         arrow::util::UTF8Decode((const uint8_t**)&in_char, &char_codepoint);

     // If it is an invalid utf8 character, UTF8Decode evaluates to false
     if (!is_valid_utf8_char) {
       gdv_fn_set_error_for_invalid_utf8(context, data[i]);
       *out_len = 0;
       return "";
     }

     bool is_char_space = gdv_fn_is_codepoint_for_space(char_codepoint);

     int32_t formatted_codepoint;
     if (last_char_was_space && !is_char_space) {
       formatted_codepoint = utf8proc_toupper(char_codepoint);
     } else {
       formatted_codepoint = utf8proc_tolower(char_codepoint);
     }

     // UTF8Encode advances the pointer by the number of bytes present in the character
     auto* out_char = (uint8_t*)(out + out_idx);
     uint8_t* out_char_start = out_char;

     // Encode the character
     out_char = arrow::util::UTF8Encode(out_char, formatted_codepoint);

     out_char_len = static_cast<int32_t>(out_char - out_char_start);
     out_idx += out_char_len;

     last_char_was_space = is_char_space;
   }

   *out_len = out_idx;
   return out;
 }
 GANDIVA_EXPORT
 const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in_len,
                                      const char* from, int32_t from_len, const char* to,
                                      int32_t to_len, int32_t* out_len) {
   if (in_len <= 0) {
     *out_len = 0;
     return "";
   }

   if (from_len <= 0) {
     *out_len = in_len;
     return in;
   }

   // This variable is to control if there are multi-byte utf8 entries
   bool has_multi_byte = false;

   // This variable is to store the final result
   char* result;
   int result_len;

   // Searching multi-bytes in In
   for (int i = 0; i < in_len; i++) {
     unsigned char char_single_byte = in[i];
     if (char_single_byte > 127) {
       // found a multi-byte utf-8 char
       has_multi_byte = true;
       break;
     }
   }

   // Searching multi-bytes in From
   if (!has_multi_byte) {
     for (int i = 0; i < from_len; i++) {
       unsigned char char_single_byte = from[i];
       if (char_single_byte > 127) {
         // found a multi-byte utf-8 char
         has_multi_byte = true;
         break;
       }
     }
   }

   // Searching multi-bytes in To
   if (!has_multi_byte) {
     for (int i = 0; i < to_len; i++) {
       unsigned char char_single_byte = to[i];
       if (char_single_byte > 127) {
         // found a multi-byte utf-8 char
         has_multi_byte = true;
         break;
       }
     }
   }

   // If there are no multibytes in the input, work only with char
   if (!has_multi_byte) {
     // This variable is for receive the substitutions
     result = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len));

     if (result == nullptr) {
       gdv_fn_context_set_error_msg(context,
                                    "Could not allocate memory for output string");
       *out_len = 0;
       return "";
     }
     result_len = 0;

     // Creating a Map to mark substitutions to make
     std::unordered_map<char, char> subs_list;

     // This variable is for controlling the position in entry TO, for never repeat the
     // changes
     int start_compare;

     if (to_len > 0) {
       start_compare = 0;
     } else {
       start_compare = -1;
     }

     // If the position in TO is out of range, this variable will be associated to map
     // list, to mark deletion positions
     const char empty = '\0';

     for (int in_for = 0; in_for < in_len; in_for++) {
       if (subs_list.find(in[in_for]) != subs_list.end()) {
         if (subs_list[in[in_for]] != empty) {
           // If exist in map, only add the correspondent value in result
           result[result_len] = subs_list[in[in_for]];
           result_len++;
         }
       } else {
         for (int from_for = 0; from_for <= from_len; from_for++) {
           if (from_for == from_len) {
             // If it's not in the FROM list, just add it to the map and the result.
             subs_list.insert(std::pair<char, char>(in[in_for], in[in_for]));
             result[result_len] = in[in_for];
             result_len++;
             break;
           }
           if (in[in_for] != from[from_for]) {
             // If this character does not exist in FROM list, don't need treatment
             continue;
           } else if (start_compare == -1 || start_compare == to_len) {
             // If exist but the start_compare is out of range, add to map as empty, to
             // deletion later
             subs_list.insert(std::pair<char, char>(in[in_for], empty));
             break;
           } else {
             // If exist and the start_compare is in range, add to map with the
             // corresponding TO in position start_compare
             subs_list.insert(std::pair<char, char>(in[in_for], to[start_compare]));
             result[result_len] = subs_list[in[in_for]];
             result_len++;
             start_compare++;
             break;  // for ignore duplicates entries in FROM, ex: ("adad")
           }
         }
       }
     }
   } else {  // If there are no multibytes in the input, work with std::strings
     // This variable is for receive the substitutions, malloc is in_len * 4 to receive
     // possible inputs with 4 bytes
     result = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len * 4));

     if (result == nullptr) {
       gdv_fn_context_set_error_msg(context,
                                    "Could not allocate memory for output string");
       *out_len = 0;
       return "";
     }
     result_len = 0;

     // This map is std::string to store multi-bytes entries
     std::unordered_map<std::string, std::string> subs_list;

     // This variable is for controlling the position in entry TO, for never repeat the
     // changes
     int start_compare;

     if (to_len > 0) {
       start_compare = 0;
     } else {
       start_compare = -1;
     }

     // If the position in TO is out of range, this variable will be associated to map
     // list, to mark deletion positions
     const std::string empty = "";

     // This variables is to control len of multi-bytes entries
     int len_char_in = 0;
     int len_char_from = 0;
     int len_char_to = 0;

     for (int in_for = 0; in_for < in_len; in_for += len_char_in) {
       // Updating len to char in this position
       len_char_in = gdv_fn_utf8_char_length(in[in_for]);
       // Making copy to std::string with length for this char position
       std::string insert_copy_key(in + in_for, len_char_in);
       if (subs_list.find(insert_copy_key) != subs_list.end()) {
         if (subs_list[insert_copy_key] != empty) {
           // If exist in map, only add the correspondent value in result
           memcpy(result + result_len, subs_list[insert_copy_key].c_str(),
                  subs_list[insert_copy_key].length());
           result_len += static_cast<int>(subs_list[insert_copy_key].length());
         }
       } else {
         for (int from_for = 0; from_for <= from_len; from_for += len_char_from) {
           // Updating len to char in this position
           len_char_from = gdv_fn_utf8_char_length(from[from_for]);
           // Making copy to std::string with length for this char position
           std::string copy_from_compare(from + from_for, len_char_from);
           if (from_for == from_len) {
             // If it's not in the FROM list, just add it to the map and the result.
             std::string insert_copy_value(in + in_for, len_char_in);
             // Insert in map to next loops
             subs_list.insert(
                 std::pair<std::string, std::string>(insert_copy_key, insert_copy_value));
             memcpy(result + result_len, subs_list[insert_copy_key].c_str(),
                    subs_list[insert_copy_key].length());
             result_len += static_cast<int>(subs_list[insert_copy_key].length());
             break;
           }

           if (insert_copy_key != copy_from_compare) {
             // If this character does not exist in FROM list, don't need treatment
             continue;
           } else if (start_compare == -1 || start_compare >= to_len) {
             // If exist but the start_compare is out of range, add to map as empty, to
             // deletion later
             subs_list.insert(std::pair<std::string, std::string>(insert_copy_key, empty));
             break;
           } else {
             // If exist and the start_compare is in range, add to map with the
             // corresponding TO in position start_compare
             len_char_to = gdv_fn_utf8_char_length(to[start_compare]);
             std::string insert_copy_value(to + start_compare, len_char_to);
             // Insert in map to next loops
             subs_list.insert(
                 std::pair<std::string, std::string>(insert_copy_key, insert_copy_value));
             memcpy(result + result_len, subs_list[insert_copy_key].c_str(),
                    subs_list[insert_copy_key].length());
             result_len += static_cast<int>(subs_list[insert_copy_key].length());
             start_compare += len_char_to;
             break;  // for ignore duplicates entries in FROM, ex: ("adad")
           }
         }
       }
     }
   }

   *out_len = result_len;
   return result;
 }
 }

 namespace gandiva {

 arrow::Status ExportedStringFunctions::AddMappings(Engine* engine) const {
   std::vector<llvm::Type*> args;
   auto types = engine->types();

   // gdv_fn_like_utf8_utf8
   args = {types->i64_type(),     // int64_t ptr
           types->i8_ptr_type(),  // const char* data
           types->i32_type(),     // int data_len
           types->i8_ptr_type(),  // const char* pattern
           types->i32_type()};    // int pattern_len

   engine->AddGlobalMappingForFunc("gdv_fn_like_utf8_utf8",
                                   types->i1_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(gdv_fn_like_utf8_utf8));

   // gdv_fn_like_utf8_utf8_utf8
   args = {types->i64_type(),     // int64_t ptr
           types->i8_ptr_type(),  // const char* data
           types->i32_type(),     // int data_len
           types->i8_ptr_type(),  // const char* pattern
           types->i32_type(),     // int pattern_len
           types->i8_ptr_type(),  // const char* escape_char
           types->i32_type()};    // int escape_char_len

   engine->AddGlobalMappingForFunc("gdv_fn_like_utf8_utf8_utf8",
                                   types->i1_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(gdv_fn_like_utf8_utf8_utf8));

   // gdv_fn_ilike_utf8_utf8
   args = {types->i64_type(),     // int64_t ptr
           types->i8_ptr_type(),  // const char* data
           types->i32_type(),     // int data_len
           types->i8_ptr_type(),  // const char* pattern
           types->i32_type()};    // int pattern_len

   engine->AddGlobalMappingForFunc("gdv_fn_ilike_utf8_utf8",
                                   types->i1_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(gdv_fn_ilike_utf8_utf8));

   // gdv_fn_regexp_replace_utf8_utf8
   args = {types->i64_type(),       // int64_t ptr
           types->i64_type(),       // int64_t holder_ptr
           types->i8_ptr_type(),    // const char* data
           types->i32_type(),       // int data_len
           types->i8_ptr_type(),    // const char* pattern
           types->i32_type(),       // int pattern_len
           types->i8_ptr_type(),    // const char* replace_string
           types->i32_type(),       // int32_t replace_string_len
           types->i32_ptr_type()};  // int32_t* out_length

   engine->AddGlobalMappingForFunc(
       "gdv_fn_regexp_replace_utf8_utf8", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_regexp_replace_utf8_utf8));

   // gdv_fn_regexp_extract_utf8_utf8_int32
   args = {types->i64_type(),       // int64_t ptr
           types->i64_type(),       // int64_t holder_ptr
           types->i8_ptr_type(),    // const char* data
           types->i32_type(),       // int data_len
           types->i8_ptr_type(),    // const char* pattern
           types->i32_type(),       // int pattern_len
           types->i32_type(),       // int32_t extract_index
           types->i32_ptr_type()};  // int32_t* out_length

   engine->AddGlobalMappingForFunc(
       "gdv_fn_regexp_extract_utf8_utf8_int32", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_regexp_extract_utf8_utf8_int32));

   // gdv_fn_castVARCHAR_int32_int64
   args = {types->i64_type(),       // int64_t execution_context
           types->i32_type(),       // int32_t value
           types->i64_type(),       // int64_t len
           types->i32_ptr_type()};  // int32_t* out_len
   engine->AddGlobalMappingForFunc(
       "gdv_fn_castVARCHAR_int32_int64", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_castVARCHAR_int32_int64));

   // gdv_fn_castVARCHAR_int64_int64
   args = {types->i64_type(),       // int64_t execution_context
           types->i64_type(),       // int64_t value
           types->i64_type(),       // int64_t len
           types->i32_ptr_type()};  // int32_t* out_len
   engine->AddGlobalMappingForFunc(
       "gdv_fn_castVARCHAR_int64_int64", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_castVARCHAR_int64_int64));

   // gdv_fn_castVARCHAR_milliseconds
   args = {types->i64_type(),       // int64_t execution_context
           types->i64_type(),       // gdv_date64 value
           types->i64_type(),       // int64_t len
           types->i32_ptr_type()};  // int32_t* out_len
   engine->AddGlobalMappingForFunc(
       "gdv_fn_castVARCHAR_date64_int64", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_castVARCHAR_date64_int64));

   // gdv_fn_castVARCHAR_float32_int64
   args = {types->i64_type(),       // int64_t execution_context
           types->float_type(),     // float value
           types->i64_type(),       // int64_t len
           types->i32_ptr_type()};  // int32_t* out_len
   engine->AddGlobalMappingForFunc(
       "gdv_fn_castVARCHAR_float32_int64", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_castVARCHAR_float32_int64));

   // gdv_fn_castVARCHAR_float64_int64
   args = {types->i64_type(),       // int64_t execution_context
           types->double_type(),    // double value
           types->i64_type(),       // int64_t len
           types->i32_ptr_type()};  // int32_t* out_len
   engine->AddGlobalMappingForFunc(
       "gdv_fn_castVARCHAR_float64_int64", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_castVARCHAR_float64_int64));

   // gdv_fn_castVARBINARY_int32
   args = {
       types->i64_type(),     // context
       types->i32_type(),     // int32_t value
       types->i64_type(),     // int64_t out value length
       types->i32_ptr_type()  // int32_t out_length
   };

   engine->AddGlobalMappingForFunc(
       "gdv_fn_castVARBINARY_int32_int64", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_castVARBINARY_int32_int64));

   // gdv_fn_castVARBINARY_int64
   args = {
       types->i64_type(),     // context
       types->i64_type(),     // int64_t value
       types->i64_type(),     // int64_t out value length
       types->i32_ptr_type()  // int32_t out_length
   };

   engine->AddGlobalMappingForFunc(
       "gdv_fn_castVARBINARY_int64_int64", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_castVARBINARY_int64_int64));

   // gdv_fn_castVARBINARY_float32
   args = {
       types->i64_type(),     // context
       types->float_type(),   // float value
       types->i64_type(),     // int64_t out value length
       types->i64_ptr_type()  // int32_t out_length
   };

   engine->AddGlobalMappingForFunc(
       "gdv_fn_castVARBINARY_float32_int64", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_castVARBINARY_float32_int64));

   // gdv_fn_castVARBINARY_float64
   args = {
       types->i64_type(),     // context
       types->i64_type(),     // double value
       types->i64_type(),     // int64_t out value length
       types->i32_ptr_type()  // int32_t out_length
   };

   engine->AddGlobalMappingForFunc(
       "gdv_fn_castVARBINARY_float64_int64", types->i8_ptr_type() /*return_type*/, args,
       reinterpret_cast<void*>(gdv_fn_castVARBINARY_float64_int64));

   // gdv_fn_lower_utf8
   args = {
       types->i64_type(),      // context
       types->i8_ptr_type(),   // data
       types->i32_type(),      // data_len
       types->i32_ptr_type(),  // out_len
   };

   engine->AddGlobalMappingForFunc("gdv_fn_lower_utf8",
                                   types->i8_ptr_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(gdv_fn_lower_utf8));

   // gdv_fn_upper_utf8
   args = {
       types->i64_type(),      // context
       types->i8_ptr_type(),   // data
       types->i32_type(),      // data_len
       types->i32_ptr_type(),  // out_len
   };

   engine->AddGlobalMappingForFunc("gdv_fn_upper_utf8",
                                   types->i8_ptr_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(gdv_fn_upper_utf8));

   // gdv_fn_substring_index
   args = {
       types->i64_type(),      // context
       types->i8_ptr_type(),   // txt
       types->i32_type(),      // txt_len
       types->i8_ptr_type(),   // pat
       types->i32_type(),      // pat_len
       types->i32_type(),      // cnt
       types->i32_ptr_type(),  // out_len
   };

   engine->AddGlobalMappingForFunc("gdv_fn_substring_index",
                                   types->i8_ptr_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(gdv_fn_substring_index));

   // gdv_fn_initcap_utf8
   args = {
       types->i64_type(),     // context
       types->i8_ptr_type(),  // const char*
       types->i32_type(),     // value_length
       types->i32_ptr_type()  // out_length
   };

   engine->AddGlobalMappingForFunc("gdv_fn_initcap_utf8",
                                   types->i8_ptr_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(gdv_fn_initcap_utf8));

   // translate_utf8_utf8_utf8
   args = {
       types->i64_type(),     // context
       types->i8_ptr_type(),  // const char*
       types->i32_type(),     // value_length
       types->i8_ptr_type(),  // const char*
       types->i32_type(),     // value_length
       types->i8_ptr_type(),  // const char*
       types->i32_type(),     // value_length
       types->i32_ptr_type()  // out_length
   };

   engine->AddGlobalMappingForFunc("translate_utf8_utf8_utf8",
                                   types->i8_ptr_type() /*return_type*/, args,
                                   reinterpret_cast<void*>(translate_utf8_utf8_utf8));
   return arrow::Status::OK();
 }
 }  // namespace gandiva