blob: 17eefbe22e31bbc1980a29e64bffbd0f99eb5da2 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "gandiva/gdv_function_stubs.h"

#include <utf8proc.h>

#include <cstdint>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>

#include "arrow/util/double_conversion.h"
#include "arrow/util/utf8_internal.h"
#include "arrow/util/value_parsing.h"
#include "gandiva/engine.h"
#include "gandiva/exported_funcs.h"
#include "gandiva/formatting_utils.h"
#include "gandiva/precompiled/types.h"
#include "gandiva/regex_functions_holder.h"
extern "C" {
// Two-argument LIKE stub. `ptr` is the address of a gandiva::LikeHolder; the holder
// is invoked on a copy of the `data_len` bytes at `data` and its verdict is returned.
// `pattern` / `pattern_len` are part of the signature but are not read here.
bool gdv_fn_like_utf8_utf8(int64_t ptr, const char* data, int data_len,
                           const char* pattern, int pattern_len) {
  auto& matcher = *reinterpret_cast<gandiva::LikeHolder*>(ptr);
  return matcher(std::string(data, data_len));
}
// Three-argument LIKE stub (with an escape character in the signature). `ptr` is the
// address of a gandiva::LikeHolder, which is invoked on a copy of the `data_len` bytes
// at `data`. The pattern and escape-character arguments are not read here.
bool gdv_fn_like_utf8_utf8_utf8(int64_t ptr, const char* data, int data_len,
                                const char* pattern, int pattern_len,
                                const char* escape_char, int escape_char_len) {
  auto& matcher = *reinterpret_cast<gandiva::LikeHolder*>(ptr);
  return matcher(std::string(data, data_len));
}
// ILIKE stub. `ptr` is the address of a gandiva::LikeHolder, which is invoked on a
// copy of the `data_len` bytes at `data`; its result is returned unchanged.
// `pattern` / `pattern_len` are not read here.
bool gdv_fn_ilike_utf8_utf8(int64_t ptr, const char* data, int data_len,
                            const char* pattern, int pattern_len) {
  auto& matcher = *reinterpret_cast<gandiva::LikeHolder*>(ptr);
  return matcher(std::string(data, data_len));
}
// regexp_replace stub. `ptr` is the address of the gandiva::ExecutionContext and
// `holder_ptr` the address of a gandiva::ReplaceHolder. The holder is called with the
// context, the input bytes and the replacement bytes; it fills `out_length` and its
// return value is forwarded. The pattern arguments are unused here.
const char* gdv_fn_regexp_replace_utf8_utf8(
    int64_t ptr, int64_t holder_ptr, const char* data, int32_t data_len,
    const char* /*pattern*/, int32_t /*pattern_len*/, const char* replace_string,
    int32_t replace_string_len, int32_t* out_length) {
  auto* exec_ctx = reinterpret_cast<gandiva::ExecutionContext*>(ptr);
  auto& replacer = *reinterpret_cast<gandiva::ReplaceHolder*>(holder_ptr);
  return replacer(exec_ctx, data, data_len, replace_string, replace_string_len,
                  out_length);
}
const char* gdv_fn_regexp_extract_utf8_utf8_int32(int64_t ptr, int64_t holder_ptr,
const char* data, int32_t data_len,
const char* /*pattern*/,
int32_t /*pattern_len*/,
int32_t extract_index,
int32_t* out_length) {
gandiva::ExecutionContext* context = reinterpret_cast<gandiva::ExecutionContext*>(ptr);
gandiva::ExtractHolder* holder = reinterpret_cast<gandiva::ExtractHolder*>(holder_ptr);
return (*holder)(context, data, data_len, extract_index, out_length);
}
// Generates gdv_fn_cast<CAST_NAME>_<IN_TYPE>_int64(context, value, len, out_len).
//
// The generated function formats `value` with
// arrow::internal::StringFormatter<arrow::ARROW_TYPE> and returns at most `len` bytes
// of the text in a buffer obtained from gdv_fn_context_arena_malloc(context, ...).
//   - len < 0  : an error is set on the context and "" is returned.
//   - len == 0 : "" is returned without allocating.
//   - allocation or formatting failure: an error is set and "" is returned.
// `*out_len` receives the number of bytes returned (0 on every "" path). No NUL
// terminator is written.
//
// `len` is clamped to INT32_MAX before being narrowed to the allocator's int32 size.
// Without the clamp a value above INT32_MAX wraps (e.g. 2^32 + 1 becomes 1), so the
// buffer would be smaller than the number of bytes the copy is allowed to write.
//
// Notes inside the body use /* */ because a // comment on a continued line would
// absorb the lines that follow it.
#define GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(IN_TYPE, CAST_NAME, ARROW_TYPE) \
  GANDIVA_EXPORT \
  const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64( \
      int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) { \
    if (len < 0) { \
      gdv_fn_context_set_error_msg(context, "Buffer length cannot be negative"); \
      *out_len = 0; \
      return ""; \
    } \
    if (len == 0) { \
      *out_len = 0; \
      return ""; \
    } \
    /* Clamp first, narrow second: the copy below is bounded by `capacity`. */ \
    const int32_t capacity = len > INT32_MAX ? INT32_MAX : static_cast<int32_t>(len); \
    arrow::internal::StringFormatter<arrow::ARROW_TYPE> formatter; \
    char* ret = \
        reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, capacity)); \
    if (ret == nullptr) { \
      gdv_fn_context_set_error_msg(context, "Could not allocate memory"); \
      *out_len = 0; \
      return ""; \
    } \
    arrow::Status status = formatter(value, [&](std::string_view v) { \
      int64_t size = static_cast<int64_t>(v.size()); \
      /* Truncate the formatted text to the space that was actually allocated. */ \
      *out_len = static_cast<int32_t>(capacity < size ? capacity : size); \
      memcpy(ret, v.data(), *out_len); \
      return arrow::Status::OK(); \
    }); \
    if (!status.ok()) { \
      std::string err = "Could not cast " + std::to_string(value) + " to string"; \
      gdv_fn_context_set_error_msg(context, err.c_str()); \
      *out_len = 0; \
      return ""; \
    } \
    return ret; \
  }
// Generates gdv_fn_cast<CAST_NAME>_<IN_TYPE>_int64(context, value, len, out_len) for
// floating-point inputs.
//
// The generated function formats `value` with
// gandiva::GdvStringFormatter<arrow::ARROW_TYPE> and returns at most `len` bytes of
// the text in a buffer obtained from gdv_fn_context_arena_malloc(context, ...).
//   - len < 0  : an error is set on the context and "" is returned.
//   - len == 0 : "" is returned without allocating.
//   - allocation or formatting failure: an error is set and "" is returned.
// `*out_len` receives the number of bytes returned (0 on every "" path). No NUL
// terminator is written.
//
// `len` is clamped to INT32_MAX before being narrowed to the allocator's int32 size.
// Without the clamp a value above INT32_MAX wraps (e.g. 2^32 + 1 becomes 1), so the
// buffer would be smaller than the number of bytes the copy is allowed to write.
//
// Notes inside the body use /* */ because a // comment on a continued line would
// absorb the lines that follow it.
#define GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(IN_TYPE, CAST_NAME, ARROW_TYPE) \
  GANDIVA_EXPORT \
  const char* gdv_fn_cast##CAST_NAME##_##IN_TYPE##_int64( \
      int64_t context, gdv_##IN_TYPE value, int64_t len, int32_t * out_len) { \
    if (len < 0) { \
      gdv_fn_context_set_error_msg(context, "Buffer length cannot be negative"); \
      *out_len = 0; \
      return ""; \
    } \
    if (len == 0) { \
      *out_len = 0; \
      return ""; \
    } \
    /* Clamp first, narrow second: the copy below is bounded by `capacity`. */ \
    const int32_t capacity = len > INT32_MAX ? INT32_MAX : static_cast<int32_t>(len); \
    gandiva::GdvStringFormatter<arrow::ARROW_TYPE> formatter; \
    char* ret = \
        reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, capacity)); \
    if (ret == nullptr) { \
      gdv_fn_context_set_error_msg(context, "Could not allocate memory"); \
      *out_len = 0; \
      return ""; \
    } \
    arrow::Status status = formatter(value, [&](std::string_view v) { \
      int64_t size = static_cast<int64_t>(v.size()); \
      /* Truncate the formatted text to the space that was actually allocated. */ \
      *out_len = static_cast<int32_t>(capacity < size ? capacity : size); \
      memcpy(ret, v.data(), *out_len); \
      return arrow::Status::OK(); \
    }); \
    if (!status.ok()) { \
      std::string err = "Could not cast " + std::to_string(value) + " to string"; \
      gdv_fn_context_set_error_msg(context, err.c_str()); \
      *out_len = 0; \
      return ""; \
    } \
    return ret; \
  }
// Expands to one cast stub per numeric input type (int32, int64, date64, float32,
// float64) for the given variable-length target name. The integer and date inputs go
// through the _FROM_TYPE generator, the floating-point inputs through _FROM_REAL.
#define CAST_VARLEN_TYPE_FROM_NUMERIC(VARLEN_TYPE) \
GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(int32, VARLEN_TYPE, Int32Type) \
GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(int64, VARLEN_TYPE, Int64Type) \
GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE(date64, VARLEN_TYPE, Date64Type) \
GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float32, VARLEN_TYPE, FloatType) \
GDV_FN_CAST_VARLEN_TYPE_FROM_REAL(float64, VARLEN_TYPE, DoubleType)
// Defines gdv_fn_castVARCHAR_<type>_int64 for each of the five input types.
CAST_VARLEN_TYPE_FROM_NUMERIC(VARCHAR)
// Defines gdv_fn_castVARBINARY_<type>_int64 for each of the five input types.
CAST_VARLEN_TYPE_FROM_NUMERIC(VARBINARY)
// The generator macros are not needed past this point.
#undef CAST_VARLEN_TYPE_FROM_NUMERIC
#undef GDV_FN_CAST_VARLEN_TYPE_FROM_TYPE
#undef GDV_FN_CAST_VARLEN_TYPE_FROM_REAL
// Sets an "unexpected byte" error message on the execution context, embedding the
// offending byte `val` as two hex digits.
//
// The message is formatted into a stack buffer: the previous heap allocation was
// never checked for nullptr before being written to, and a fixed-size message does
// not need the heap at all.
GDV_FORCE_INLINE
void gdv_fn_set_error_for_invalid_utf8(int64_t execution_context, char val) {
  const char fmt[] = "unexpected byte \\%02hhx encountered while decoding utf8 string";
  // The expanded text is shorter than the format itself (the conversion spec becomes
  // two hex digits), so sizeof(fmt) plus slack is always enough.
  char error[sizeof(fmt) + 64];
  snprintf(error, sizeof(error), fmt, static_cast<unsigned char>(val));
  gdv_fn_context_set_error_msg(execution_context, error);
}
// Returns how many bytes a UTF-8 encoded character occupies, judged solely from its
// first byte `c`: 1 for 0x00-0x7F, 2/3/4 for the 110xxxxx / 1110xxxx / 11110xxx lead
// patterns, and 0 when `c` matches none of them.
GDV_FORCE_INLINE
int32_t gdv_fn_utf8_char_length(char c) {
  const auto lead = static_cast<unsigned char>(c);
  if (lead < 0x80) return 1;            // 0xxxxxxx
  if ((lead & 0xE0) == 0xC0) return 2;  // 110xxxxx
  if ((lead & 0xF0) == 0xE0) return 3;  // 1110xxxx
  if ((lead & 0xF8) == 0xF0) return 4;  // 11110xxx
  return 0;                             // not a valid lead byte
}
// Convert an utf8 string to its corresponding lowercase string.
//
// context  : execution context handle, used for arena allocation and error reporting
// data     : input bytes (need not be NUL-terminated)
// data_len : number of bytes at `data`
// out_len  : receives the number of bytes in the returned string
//
// Returns a buffer obtained from gdv_fn_context_arena_malloc. On invalid UTF-8, or when
// the output buffer cannot be allocated, an error is set on the context, `*out_len` is
// set to 0 and "" is returned.
GANDIVA_EXPORT
const char* gdv_fn_lower_utf8(int64_t context, const char* data, int32_t data_len,
                              int32_t* out_len) {
  if (data_len == 0) {
    *out_len = 0;
    return "";
  }

  // The buffer is sized as twice the input (see below); refuse inputs for which that
  // product does not fit in int32 instead of overflowing.
  if (data_len > INT32_MAX / 2) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // If it is a single-byte character (ASCII), corresponding lowercase is always 1-byte
  // long; if it is >= 2 bytes long, lowercase can be at most 4 bytes long, so length of
  // the output can be at most twice the length of the input
  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
  if (out == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  int32_t out_idx = 0;
  int32_t char_len = 0;
  for (int32_t i = 0; i < data_len; i += char_len) {
    char_len = gdv_fn_utf8_char_length(data[i]);

    // Single-byte characters: map 'A'-'Z' (0x41-0x5a) to 'a'-'z' (0x61-0x7a) and copy
    // everything else unchanged.
    if (char_len == 1) {
      const char cur = data[i];
      if (cur >= 0x41 && cur <= 0x5a) {
        out[out_idx++] = static_cast<char>(cur + 0x20);
      } else {
        out[out_idx++] = cur;
      }
      continue;
    }

    // Reject invalid lead bytes (length 0) and sequences that claim more bytes than
    // the input still holds. The decoder below is not told where the input ends, so a
    // truncated trailing sequence would otherwise be read past `data_len`.
    if (char_len == 0 || char_len > data_len - i) {
      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
      *out_len = 0;
      return "";
    }

    // Decode the multibyte character; UTF8Decode evaluates to false on invalid input.
    const auto* in_char = reinterpret_cast<const uint8_t*>(data + i);
    uint32_t char_codepoint = 0;
    if (!arrow::util::UTF8Decode(&in_char, &char_codepoint)) {
      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
      *out_len = 0;
      return "";
    }

    // Convert the decoded codepoint to its lowercase codepoint
    int32_t lower_codepoint = utf8proc_tolower(char_codepoint);

    // UTF8Encode returns the position just past the bytes it wrote, which gives the
    // encoded length of the lowercase character.
    auto* out_char_start = reinterpret_cast<uint8_t*>(out + out_idx);
    uint8_t* out_char_end = arrow::util::UTF8Encode(out_char_start, lower_codepoint);
    out_idx += static_cast<int32_t>(out_char_end - out_char_start);
  }

  *out_len = out_idx;
  return out;
}
// Convert an utf8 string to its corresponding uppercase string.
//
// context  : execution context handle, used for arena allocation and error reporting
// data     : input bytes (need not be NUL-terminated)
// data_len : number of bytes at `data`
// out_len  : receives the number of bytes in the returned string
//
// Returns a buffer obtained from gdv_fn_context_arena_malloc. On invalid UTF-8, or when
// the output buffer cannot be allocated, an error is set on the context, `*out_len` is
// set to 0 and "" is returned.
GANDIVA_EXPORT
const char* gdv_fn_upper_utf8(int64_t context, const char* data, int32_t data_len,
                              int32_t* out_len) {
  if (data_len == 0) {
    *out_len = 0;
    return "";
  }

  // The buffer is sized as twice the input (see below); refuse inputs for which that
  // product does not fit in int32 instead of overflowing.
  if (data_len > INT32_MAX / 2) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte
  // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of
  // the output can be at most twice the length of the input
  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
  if (out == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  int32_t out_idx = 0;
  int32_t char_len = 0;
  for (int32_t i = 0; i < data_len; i += char_len) {
    char_len = gdv_fn_utf8_char_length(data[i]);

    // Single-byte characters: map 'a'-'z' (0x61-0x7a) to 'A'-'Z' (0x41-0x5a) and copy
    // everything else unchanged.
    if (char_len == 1) {
      const char cur = data[i];
      if (cur >= 0x61 && cur <= 0x7a) {
        out[out_idx++] = static_cast<char>(cur - 0x20);
      } else {
        out[out_idx++] = cur;
      }
      continue;
    }

    // Reject invalid lead bytes (length 0) and sequences that claim more bytes than
    // the input still holds. The decoder below is not told where the input ends, so a
    // truncated trailing sequence would otherwise be read past `data_len`.
    if (char_len == 0 || char_len > data_len - i) {
      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
      *out_len = 0;
      return "";
    }

    // Decode the multibyte character; UTF8Decode evaluates to false on invalid input.
    const auto* in_char = reinterpret_cast<const uint8_t*>(data + i);
    uint32_t char_codepoint = 0;
    if (!arrow::util::UTF8Decode(&in_char, &char_codepoint)) {
      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
      *out_len = 0;
      return "";
    }

    // Convert the decoded codepoint to its uppercase codepoint
    int32_t upper_codepoint = utf8proc_toupper(char_codepoint);

    // UTF8Encode returns the position just past the bytes it wrote, which gives the
    // encoded length of the uppercase character.
    auto* out_char_start = reinterpret_cast<uint8_t*>(out + out_idx);
    uint8_t* out_char_end = arrow::util::UTF8Encode(out_char_start, upper_codepoint);
    out_idx += static_cast<int32_t>(out_char_end - out_char_start);
  }

  *out_len = out_idx;
  return out;
}
// Substring_index
//
// Returns the part of `txt` that lies before the cnt-th occurrence of `pat` (cnt > 0),
// or the part that lies after the |cnt|-th occurrence counted from the end (cnt < 0).
// When `txt` holds fewer than |cnt| occurrences the whole of `txt` is returned.
// Occurrences are located with a Knuth-Morris-Pratt scan that resumes from the failure
// table after each hit, so overlapping matches are all counted.
//
// An empty (or negative-length) `txt` or `pat`, or cnt == 0, yields "". The result is
// a copy placed in a buffer from gdv_fn_context_arena_malloc; if that allocation fails
// an error is set on the context and "" is returned. `*out_len` always receives the
// length of the returned string.
GDV_FORCE_INLINE
const char* gdv_fn_substring_index(int64_t context, const char* txt, int32_t txt_len,
                                   const char* pat, int32_t pat_len, int32_t cnt,
                                   int32_t* out_len) {
  // Non-positive lengths are treated as empty: a negative pat_len would otherwise
  // size the failure table with a huge unsigned value, and a negative txt_len would
  // reach memcpy.
  if (txt_len <= 0 || pat_len <= 0 || cnt == 0) {
    *out_len = 0;
    return "";
  }

  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, txt_len));
  if (out == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // Failure table: lps[k] is the length of the longest proper prefix of pat[0..k]
  // that is also a suffix of it.
  std::vector<int> lps(pat_len);
  int len = 0;
  lps[0] = 0;  // lps[0] is always 0

  int i = 1;
  while (i < pat_len) {
    if (pat[i] == pat[len]) {
      len++;
      lps[i] = len;
      i++;
    } else if (len != 0) {
      // Fall back to the next shorter border; i is deliberately not advanced.
      len = lps[len - 1];
    } else {
      lps[i] = 0;
      i++;
    }
  }

  // Start offsets of every occurrence of pat in txt, in increasing order.
  std::vector<int> occ;
  i = 0;      // index for txt[]
  int j = 0;  // index for pat[]
  while (i < txt_len) {
    if (pat[j] == txt[i]) {
      j++;
      i++;
    }

    if (j == pat_len) {
      occ.push_back(i - j);
      j = lps[j - 1];
    } else if (i < txt_len && pat[j] != txt[i]) {
      // Mismatch after j matches: the first lps[j - 1] characters are known to match
      // already, so resume from there.
      if (j != 0) {
        j = lps[j - 1];
      } else {
        i = i + 1;
      }
    }
  }

  // Work in 64 bits so that negating cnt == INT32_MIN is well defined.
  const int64_t wanted = cnt;
  const int64_t found = static_cast<int64_t>(occ.size());

  if (wanted > 0 && wanted <= found) {
    // Everything before the cnt-th occurrence.
    *out_len = occ[wanted - 1];
    memcpy(out, txt, *out_len);
    return out;
  }

  if (wanted < 0 && -wanted <= found) {
    // Everything after the |cnt|-th occurrence counted from the end.
    const int32_t tail_start = occ[found + wanted] + pat_len;
    *out_len = txt_len - tail_start;
    memcpy(out, txt + tail_start, *out_len);
    return out;
  }

  // Fewer than |cnt| occurrences: the whole text is the answer.
  *out_len = txt_len;
  memcpy(out, txt, txt_len);
  return out;
}
// Decides whether a codepoint acts as a word separator.
//
// A codepoint is a separator unless utf8proc places it in one of these general
// categories: uppercase letter (Lu), lowercase letter (Ll), titlecase letter (Lt),
// letter number (Nl) or decimal digit (Nd).
//
// See https://www.compart.com/en/unicode/category for an overview of the Unicode
// general categories.
GDV_FORCE_INLINE
bool gdv_fn_is_codepoint_for_space(uint32_t val) {
  switch (utf8proc_category(val)) {
    case utf8proc_category_t::UTF8PROC_CATEGORY_LU:
    case utf8proc_category_t::UTF8PROC_CATEGORY_LL:
    case utf8proc_category_t::UTF8PROC_CATEGORY_LT:
    case utf8proc_category_t::UTF8PROC_CATEGORY_NL:
    case utf8proc_category_t::UTF8PROC_CATEGORY_ND:
      return false;
    default:
      return true;
  }
}
// For a given text, initialize the first letter after a word-separator and lowercase
// the others e.g:
//     - "IT is a tEXt str" -> "It Is A Text Str"
//
// context  : execution context handle, used for arena allocation and error reporting
// data     : input bytes (need not be NUL-terminated)
// data_len : number of bytes at `data`
// out_len  : receives the number of bytes in the returned string
//
// Returns a buffer obtained from gdv_fn_context_arena_malloc. On invalid UTF-8, or when
// the output buffer cannot be allocated, an error is set on the context, `*out_len` is
// set to 0 and "" is returned.
GANDIVA_EXPORT
const char* gdv_fn_initcap_utf8(int64_t context, const char* data, int32_t data_len,
                                int32_t* out_len) {
  if (data_len == 0) {
    *out_len = data_len;
    return "";
  }

  // The buffer is sized as twice the input (see below); refuse inputs for which that
  // product does not fit in int32 instead of overflowing.
  if (data_len > INT32_MAX / 2) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  // If it is a single-byte character (ASCII), corresponding uppercase is always 1-byte
  // long; if it is >= 2 bytes long, uppercase can be at most 4 bytes long, so length of
  // the output can be at most twice the length of the input
  char* out = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, 2 * data_len));
  if (out == nullptr) {
    gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string");
    *out_len = 0;
    return "";
  }

  int32_t char_len = 0;
  int32_t out_idx = 0;

  // Any character is considered as space, except if it is alphanumeric. Starting in
  // the "after a separator" state makes the very first letter get capitalized.
  bool last_char_was_space = true;

  for (int32_t i = 0; i < data_len; i += char_len) {
    // An optimization for single byte characters:
    if (static_cast<signed char>(data[i]) >= 0) {  // 1-byte char (0x00 ~ 0x7F)
      char_len = 1;
      const char cur = data[i];

      if (cur >= 0x61 && cur <= 0x7a && last_char_was_space) {
        // First character of a word and lowercase ('a'-'z': 0x61-0x7a):
        // turn it into uppercase ('A'-'Z': 0x41-0x5a).
        out[out_idx++] = static_cast<char>(cur - 0x20);
        last_char_was_space = false;
      } else if (cur >= 0x41 && cur <= 0x5a && !last_char_was_space) {
        // Uppercase letter inside a word: lowercase it.
        out[out_idx++] = static_cast<char>(cur + 0x20);
      } else {
        // Copied as is. It counts as a separator unless it is alphanumeric:
        // '0' - '9': 0x30 - 0x39
        // 'a' - 'z' : 0x61 - 0x7a
        // 'A' - 'Z' : 0x41 - 0x5a
        last_char_was_space = (cur < 0x30) || (cur > 0x39 && cur < 0x41) ||
                              (cur > 0x5a && cur < 0x61) || (cur > 0x7a);
        out[out_idx++] = cur;
      }
      continue;
    }

    // Control reaches here when we encounter a multibyte character
    char_len = gdv_fn_utf8_char_length(data[i]);

    // Reject invalid lead bytes (length 0) and sequences that claim more bytes than
    // the input still holds. The decoder below is not told where the input ends, so a
    // truncated trailing sequence would otherwise be read past `data_len`.
    if (char_len == 0 || char_len > data_len - i) {
      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
      *out_len = 0;
      return "";
    }

    // Decode the multibyte character; UTF8Decode evaluates to false on invalid input.
    const auto* in_char = reinterpret_cast<const uint8_t*>(data + i);
    uint32_t char_codepoint = 0;
    if (!arrow::util::UTF8Decode(&in_char, &char_codepoint)) {
      gdv_fn_set_error_for_invalid_utf8(context, data[i]);
      *out_len = 0;
      return "";
    }

    // Uppercase the first non-separator after a separator, lowercase everything else.
    const bool is_char_space = gdv_fn_is_codepoint_for_space(char_codepoint);
    int32_t formatted_codepoint;
    if (last_char_was_space && !is_char_space) {
      formatted_codepoint = utf8proc_toupper(char_codepoint);
    } else {
      formatted_codepoint = utf8proc_tolower(char_codepoint);
    }

    // UTF8Encode returns the position just past the bytes it wrote, which gives the
    // encoded length of the character.
    auto* out_char_start = reinterpret_cast<uint8_t*>(out + out_idx);
    uint8_t* out_char_end = arrow::util::UTF8Encode(out_char_start, formatted_codepoint);
    out_idx += static_cast<int32_t>(out_char_end - out_char_start);

    last_char_was_space = is_char_space;
  }

  *out_len = out_idx;
  return out;
}
// Character-wise translation of `in`: characters that occur in `from` are replaced by
// characters taken from `to`, or removed once `to` has been used up; all other
// characters are copied unchanged. Duplicate entries in `from` are ignored.
//
// An empty `in` yields "". An empty `from` returns `in` itself (no copy). Otherwise
// the result is written to a buffer from gdv_fn_context_arena_malloc. On allocation
// failure or malformed UTF-8 an error is set on the context, `*out_len` is set to 0
// and "" is returned. `*out_len` otherwise receives the result length in bytes.
//
// NOTE(review): positions in `to` are handed out in the order in which `from`
// characters are first met in `in`, not by their index in `from`. E.g. in="ba",
// from="ab", to="xy" gives "xy". That matches the previous implementation and is kept
// on purpose; confirm whether positional mapping (from[i] -> to[i]) was intended.
GANDIVA_EXPORT
const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in_len,
                                     const char* from, int32_t from_len, const char* to,
                                     int32_t to_len, int32_t* out_len) {
  if (in_len <= 0) {
    *out_len = 0;
    return "";
  }

  if (from_len <= 0) {
    *out_len = in_len;
    return in;
  }

  // Common exit for every failure below.
  const auto fail = [&](const char* message) -> const char* {
    gdv_fn_context_set_error_msg(context, message);
    *out_len = 0;
    return "";
  };

  const auto has_non_ascii = [](const char* bytes, int32_t len) {
    for (int32_t i = 0; i < len; i++) {
      if (static_cast<unsigned char>(bytes[i]) > 127) {
        return true;
      }
    }
    return false;
  };

  // The cheap byte-table path is only valid when all three arguments are pure ASCII.
  const bool has_multi_byte =
      has_non_ascii(in, in_len) || has_non_ascii(from, from_len) || has_non_ascii(to, to_len);

  char* result;
  int result_len = 0;
  // Offset in `to` of the next replacement that has not been handed out yet.
  int start_compare = 0;

  if (!has_multi_byte) {
    // Every character is one byte, so the output cannot be longer than the input.
    result = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len));
    if (result == nullptr) {
      return fail("Could not allocate memory for output string");
    }

    // Per-byte decision cache. Deletion is an explicit state rather than a '\0'
    // replacement value, so NUL bytes in `in` or `to` are treated like any other byte.
    enum SubstitutionState : unsigned char { kUnresolved = 0, kReplace, kDelete };
    SubstitutionState state[128] = {};
    char replacement[128] = {};

    for (int in_for = 0; in_for < in_len; in_for++) {
      const char cur = in[in_for];
      const auto slot = static_cast<unsigned char>(cur);  // < 128 on this path

      if (state[slot] == kUnresolved) {
        bool listed = false;
        for (int from_for = 0; from_for < from_len; from_for++) {
          if (from[from_for] == cur) {
            listed = true;
            break;
          }
        }

        if (!listed) {
          // Not in FROM: the character maps to itself.
          state[slot] = kReplace;
          replacement[slot] = cur;
        } else if (start_compare >= to_len) {
          // In FROM but TO is exhausted (or empty): the character is removed.
          state[slot] = kDelete;
        } else {
          state[slot] = kReplace;
          replacement[slot] = to[start_compare++];
        }
      }

      if (state[slot] == kReplace) {
        result[result_len++] = replacement[slot];
      }
    }
  } else {
    // At least one argument holds multi-byte characters: work on whole UTF-8
    // characters. A one-byte input character may be replaced by a four-byte one, hence
    // the 4x buffer; refuse inputs for which that size does not fit in int32.
    if (in_len > INT32_MAX / 4) {
      return fail("Could not allocate memory for output string");
    }
    result = reinterpret_cast<char*>(gdv_fn_context_arena_malloc(context, in_len * 4));
    if (result == nullptr) {
      return fail("Could not allocate memory for output string");
    }

    // Input character -> replacement bytes; an empty replacement means "delete".
    std::unordered_map<std::string, std::string> subs_list;

    int len_char_in = 0;
    for (int in_for = 0; in_for < in_len; in_for += len_char_in) {
      len_char_in = gdv_fn_utf8_char_length(in[in_for]);
      // A zero length would stall the loop and an over-long one would read past the
      // end of `in`, so both are rejected.
      if (len_char_in == 0 || len_char_in > in_len - in_for) {
        return fail("translate: invalid UTF-8 sequence in the input string");
      }

      const std::string key(in + in_for, len_char_in);
      auto known = subs_list.find(key);

      if (known == subs_list.end()) {
        // First time this character is seen: look for it in FROM.
        bool listed = false;
        for (int from_for = 0; from_for < from_len;) {
          const int len_char_from = gdv_fn_utf8_char_length(from[from_for]);
          if (len_char_from == 0 || len_char_from > from_len - from_for) {
            return fail("translate: invalid UTF-8 sequence in the 'from' string");
          }
          if (key.compare(0, std::string::npos, from + from_for,
                          static_cast<size_t>(len_char_from)) == 0) {
            listed = true;  // first hit wins; later duplicates in FROM are ignored
            break;
          }
          from_for += len_char_from;
        }

        std::string value;
        if (!listed) {
          // Not in FROM: the character maps to itself.
          value = key;
        } else if (start_compare < to_len) {
          const int len_char_to = gdv_fn_utf8_char_length(to[start_compare]);
          if (len_char_to == 0 || len_char_to > to_len - start_compare) {
            return fail("translate: invalid UTF-8 sequence in the 'to' string");
          }
          value.assign(to + start_compare, len_char_to);
          start_compare += len_char_to;
        }
        // Otherwise TO is exhausted and `value` stays empty: the character is removed.

        known = subs_list.emplace(key, value).first;
      }

      const std::string& mapped = known->second;
      if (!mapped.empty()) {
        memcpy(result + result_len, mapped.data(), mapped.size());
        result_len += static_cast<int>(mapped.size());
      }
    }
  }

  *out_len = result_len;
  return result;
}
}
namespace gandiva {

// Registers every C stub defined in this file with the engine, pairing each symbol
// name with its LLVM signature and its native address.
//
// The signatures are built from a few shared argument lists so that stubs with the
// same C prototype cannot drift apart. This also corrects two declarations that did
// not match their stubs: gdv_fn_castVARBINARY_float32_int64 declared `out_len` as
// i64* (the stub takes int32_t*), and gdv_fn_castVARBINARY_float64_int64 declared the
// value as i64 (the VARCHAR twin of the same generated stub is declared with double).
//
// Always returns arrow::Status::OK().
arrow::Status ExportedStringFunctions::AddMappings(Engine* engine) const {
  auto types = engine->types();

  llvm::Type* const i1 = types->i1_type();
  llvm::Type* const i32 = types->i32_type();
  llvm::Type* const i64 = types->i64_type();
  llvm::Type* const f32 = types->float_type();
  llvm::Type* const f64 = types->double_type();
  llvm::Type* const i8_ptr = types->i8_ptr_type();
  llvm::Type* const i32_ptr = types->i32_ptr_type();

  const auto add_stub = [engine](const char* name, llvm::Type* return_type,
                                 const std::vector<llvm::Type*>& args, void* stub) {
    engine->AddGlobalMappingForFunc(name, return_type, args, stub);
  };

  // (int64_t ptr, const char* data, int data_len, const char* pattern, int pattern_len)
  const std::vector<llvm::Type*> like_args = {i64, i8_ptr, i32, i8_ptr, i32};

  // like_args + (const char* escape_char, int escape_char_len)
  const std::vector<llvm::Type*> like_escape_args = {i64, i8_ptr, i32,   i8_ptr,
                                                     i32, i8_ptr, i32};

  // (int64_t context, <value>, int64_t len, int32_t* out_len)
  const auto cast_args = [&](llvm::Type* value_type) {
    return std::vector<llvm::Type*>{i64, value_type, i64, i32_ptr};
  };

  // (int64_t context, const char* data, int32_t data_len, int32_t* out_len)
  const std::vector<llvm::Type*> unary_string_args = {i64, i8_ptr, i32, i32_ptr};

  // Pattern matching
  add_stub("gdv_fn_like_utf8_utf8", i1, like_args,
           reinterpret_cast<void*>(gdv_fn_like_utf8_utf8));
  add_stub("gdv_fn_like_utf8_utf8_utf8", i1, like_escape_args,
           reinterpret_cast<void*>(gdv_fn_like_utf8_utf8_utf8));
  add_stub("gdv_fn_ilike_utf8_utf8", i1, like_args,
           reinterpret_cast<void*>(gdv_fn_ilike_utf8_utf8));

  // (int64_t ptr, int64_t holder_ptr, const char* data, int data_len,
  //  const char* pattern, int pattern_len, const char* replace_string,
  //  int32_t replace_string_len, int32_t* out_length)
  add_stub("gdv_fn_regexp_replace_utf8_utf8", i8_ptr,
           {i64, i64, i8_ptr, i32, i8_ptr, i32, i8_ptr, i32, i32_ptr},
           reinterpret_cast<void*>(gdv_fn_regexp_replace_utf8_utf8));

  // (int64_t ptr, int64_t holder_ptr, const char* data, int data_len,
  //  const char* pattern, int pattern_len, int32_t extract_index, int32_t* out_length)
  add_stub("gdv_fn_regexp_extract_utf8_utf8_int32", i8_ptr,
           {i64, i64, i8_ptr, i32, i8_ptr, i32, i32, i32_ptr},
           reinterpret_cast<void*>(gdv_fn_regexp_extract_utf8_utf8_int32));

  // Numeric -> VARCHAR casts (date64 values travel as i64)
  add_stub("gdv_fn_castVARCHAR_int32_int64", i8_ptr, cast_args(i32),
           reinterpret_cast<void*>(gdv_fn_castVARCHAR_int32_int64));
  add_stub("gdv_fn_castVARCHAR_int64_int64", i8_ptr, cast_args(i64),
           reinterpret_cast<void*>(gdv_fn_castVARCHAR_int64_int64));
  add_stub("gdv_fn_castVARCHAR_date64_int64", i8_ptr, cast_args(i64),
           reinterpret_cast<void*>(gdv_fn_castVARCHAR_date64_int64));
  add_stub("gdv_fn_castVARCHAR_float32_int64", i8_ptr, cast_args(f32),
           reinterpret_cast<void*>(gdv_fn_castVARCHAR_float32_int64));
  add_stub("gdv_fn_castVARCHAR_float64_int64", i8_ptr, cast_args(f64),
           reinterpret_cast<void*>(gdv_fn_castVARCHAR_float64_int64));

  // Numeric -> VARBINARY casts
  add_stub("gdv_fn_castVARBINARY_int32_int64", i8_ptr, cast_args(i32),
           reinterpret_cast<void*>(gdv_fn_castVARBINARY_int32_int64));
  add_stub("gdv_fn_castVARBINARY_int64_int64", i8_ptr, cast_args(i64),
           reinterpret_cast<void*>(gdv_fn_castVARBINARY_int64_int64));
  add_stub("gdv_fn_castVARBINARY_float32_int64", i8_ptr, cast_args(f32),
           reinterpret_cast<void*>(gdv_fn_castVARBINARY_float32_int64));
  add_stub("gdv_fn_castVARBINARY_float64_int64", i8_ptr, cast_args(f64),
           reinterpret_cast<void*>(gdv_fn_castVARBINARY_float64_int64));

  // Case conversion
  add_stub("gdv_fn_lower_utf8", i8_ptr, unary_string_args,
           reinterpret_cast<void*>(gdv_fn_lower_utf8));
  add_stub("gdv_fn_upper_utf8", i8_ptr, unary_string_args,
           reinterpret_cast<void*>(gdv_fn_upper_utf8));

  // (int64_t context, const char* txt, int32_t txt_len, const char* pat,
  //  int32_t pat_len, int32_t cnt, int32_t* out_len)
  add_stub("gdv_fn_substring_index", i8_ptr,
           {i64, i8_ptr, i32, i8_ptr, i32, i32, i32_ptr},
           reinterpret_cast<void*>(gdv_fn_substring_index));

  add_stub("gdv_fn_initcap_utf8", i8_ptr, unary_string_args,
           reinterpret_cast<void*>(gdv_fn_initcap_utf8));

  // (int64_t context, const char* in, int32_t in_len, const char* from,
  //  int32_t from_len, const char* to, int32_t to_len, int32_t* out_len)
  add_stub("translate_utf8_utf8_utf8", i8_ptr,
           {i64, i8_ptr, i32, i8_ptr, i32, i8_ptr, i32, i32_ptr},
           reinterpret_cast<void*>(translate_utf8_utf8_utf8));

  return arrow::Status::OK();
}

}  // namespace gandiva