cpp/src/arrow/compute/kernels/scalar_string.cc - arrow - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <algorithm>
 #include <cctype>
 #include <string>

 #ifdef ARROW_WITH_UTF8PROC
 #include <utf8proc.h>
 #endif

 #ifdef ARROW_WITH_RE2
 #include <re2/re2.h>
 #endif

 #include "arrow/array/builder_binary.h"
 #include "arrow/array/builder_nested.h"
 #include "arrow/buffer_builder.h"
 #include "arrow/compute/api_scalar.h"
 #include "arrow/compute/kernels/common.h"
 #include "arrow/util/utf8.h"
 #include "arrow/util/value_parsing.h"

 namespace arrow {
 namespace compute {
 namespace internal {

 namespace {

 // Code units in the range [a-z] can only be an encoding of an ascii
 // character/codepoint, not the 2nd, 3rd or 4th code unit (byte) of an different
 // codepoint. This guaranteed by non-overlap design of the unicode standard. (see
 // section 2.5 of Unicode Standard Core Specification v13.0)

 static inline uint8_t ascii_tolower(uint8_t utf8_code_unit) {
   return ((utf8_code_unit >= 'A') && (utf8_code_unit <= 'Z')) ? (utf8_code_unit + 32)
                                                               : utf8_code_unit;
 }

 static inline uint8_t ascii_toupper(uint8_t utf8_code_unit) {
   return ((utf8_code_unit >= 'a') && (utf8_code_unit <= 'z')) ? (utf8_code_unit - 32)
                                                               : utf8_code_unit;
 }

 template <typename T>
 static inline bool IsAsciiCharacter(T character) {
   return character < 128;
 }

 struct BinaryLength {
   template <typename OutValue, typename Arg0Value = util::string_view>
   static OutValue Call(KernelContext*, Arg0Value val) {
     return static_cast<OutValue>(val.size());
   }
 };

 struct Utf8Length {
   template <typename OutValue, typename Arg0Value = util::string_view>
   static OutValue Call(KernelContext*, Arg0Value val) {
     auto str = reinterpret_cast<const uint8_t*>(val.data());
     auto strlen = val.size();

     OutValue length = 0;
     while (strlen > 0) {
       length += ((*str & 0xc0) != 0x80);
       ++str;
       --strlen;
     }
     return length;
   }
 };

 #ifdef ARROW_WITH_UTF8PROC

 // Direct lookup tables for unicode properties
 constexpr uint32_t kMaxCodepointLookup =
     0xffff;  // up to this codepoint is in a lookup table
 std::vector<uint32_t> lut_upper_codepoint;
 std::vector<uint32_t> lut_lower_codepoint;
 std::vector<utf8proc_category_t> lut_category;
 std::once_flag flag_case_luts;

 void EnsureLookupTablesFilled() {
   std::call_once(flag_case_luts, []() {
     lut_upper_codepoint.reserve(kMaxCodepointLookup + 1);
     lut_lower_codepoint.reserve(kMaxCodepointLookup + 1);
     for (uint32_t i = 0; i <= kMaxCodepointLookup; i++) {
       lut_upper_codepoint.push_back(utf8proc_toupper(i));
       lut_lower_codepoint.push_back(utf8proc_tolower(i));
       lut_category.push_back(utf8proc_category(i));
     }
   });
 }

 #endif  // ARROW_WITH_UTF8PROC

 /// Transform string -> string with a reasonable guess on the maximum number of codepoints
 template <typename Type, typename Derived>
 struct StringTransform {
   using offset_type = typename Type::offset_type;
   using ArrayType = typename TypeTraits<Type>::ArrayType;

   static int64_t MaxCodeunits(offset_type input_ncodeunits) { return input_ncodeunits; }
   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     Derived().Execute(ctx, batch, out);
   }
   void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     if (batch[0].kind() == Datum::ARRAY) {
       const ArrayData& input = *batch[0].array();
       ArrayType input_boxed(batch[0].array());
       ArrayData* output = out->mutable_array();

       offset_type input_ncodeunits = input_boxed.total_values_length();
       offset_type input_nstrings = static_cast<offset_type>(input.length);

       int64_t output_ncodeunits_max = Derived::MaxCodeunits(input_ncodeunits);
       if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
         ctx->SetStatus(Status::CapacityError(
             "Result might not fit in a 32bit utf8 array, convert to large_utf8"));
         return;
       }

       KERNEL_ASSIGN_OR_RAISE(auto values_buffer, ctx,
                              ctx->Allocate(output_ncodeunits_max));
       output->buffers[2] = values_buffer;

       // String offsets are preallocated
       offset_type* output_string_offsets = output->GetMutableValues<offset_type>(1);
       uint8_t* output_str = output->buffers[2]->mutable_data();
       offset_type output_ncodeunits = 0;

       output_string_offsets[0] = 0;
       for (int64_t i = 0; i < input_nstrings; i++) {
         offset_type input_string_ncodeunits;
         const uint8_t* input_string = input_boxed.GetValue(i, &input_string_ncodeunits);
         offset_type encoded_nbytes = 0;
         if (ARROW_PREDICT_FALSE(!static_cast<Derived&>(*this).Transform(
                 input_string, input_string_ncodeunits, output_str + output_ncodeunits,
                 &encoded_nbytes))) {
           ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
           return;
         }
         output_ncodeunits += encoded_nbytes;
         output_string_offsets[i + 1] = output_ncodeunits;
       }

       // Trim the codepoint buffer, since we allocated too much
       KERNEL_RETURN_IF_ERROR(
           ctx, values_buffer->Resize(output_ncodeunits, /*shrink_to_fit=*/true));
     } else {
       const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
       auto result = checked_pointer_cast<BaseBinaryScalar>(MakeNullScalar(out->type()));
       if (input.is_valid) {
         result->is_valid = true;
         offset_type data_nbytes = static_cast<offset_type>(input.value->size());

         int64_t output_ncodeunits_max = Derived::MaxCodeunits(data_nbytes);
         if (output_ncodeunits_max > std::numeric_limits<offset_type>::max()) {
           ctx->SetStatus(Status::CapacityError(
               "Result might not fit in a 32bit utf8 array, convert to large_utf8"));
           return;
         }
         KERNEL_ASSIGN_OR_RAISE(auto value_buffer, ctx,
                                ctx->Allocate(output_ncodeunits_max));
         result->value = value_buffer;
         offset_type encoded_nbytes = 0;
         if (ARROW_PREDICT_FALSE(!static_cast<Derived&>(*this).Transform(
                 input.value->data(), data_nbytes, value_buffer->mutable_data(),
                 &encoded_nbytes))) {
           ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
           return;
         }
         KERNEL_RETURN_IF_ERROR(
             ctx, value_buffer->Resize(encoded_nbytes, /*shrink_to_fit=*/true));
       }
       out->value = result;
     }
   }
 };

 #ifdef ARROW_WITH_UTF8PROC

 // transforms per codepoint
 template <typename Type, typename Derived>
 struct StringTransformCodepoint : StringTransform<Type, Derived> {
   using Base = StringTransform<Type, Derived>;
   using offset_type = typename Base::offset_type;

   bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
                  uint8_t* output, offset_type* output_written) {
     uint8_t* output_start = output;
     if (ARROW_PREDICT_FALSE(
             !arrow::util::UTF8Transform(input, input + input_string_ncodeunits, &output,
                                         Derived::TransformCodepoint))) {
       return false;
     }
     *output_written = static_cast<offset_type>(output - output_start);
     return true;
   }
   static int64_t MaxCodeunits(offset_type input_ncodeunits) {
     // Section 5.18 of the Unicode spec claim that the number of codepoints for case
     // mapping can grow by a factor of 3. This means grow by a factor of 3 in bytes
     // However, since we don't support all casings (SpecialCasing.txt) the growth
     // in bytes iss actually only at max 3/2 (as covered by the unittest).
     // Note that rounding down the 3/2 is ok, since only codepoints encoded by
     // two code units (even) can grow to 3 code units.
     return static_cast<int64_t>(input_ncodeunits) * 3 / 2;
   }
   void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     EnsureLookupTablesFilled();
     Base::Execute(ctx, batch, out);
   }
 };

 template <typename Type>
 struct UTF8Upper : StringTransformCodepoint<Type, UTF8Upper<Type>> {
   inline static uint32_t TransformCodepoint(uint32_t codepoint) {
     return codepoint <= kMaxCodepointLookup ? lut_upper_codepoint[codepoint]
                                             : utf8proc_toupper(codepoint);
   }
 };

 template <typename Type>
 struct UTF8Lower : StringTransformCodepoint<Type, UTF8Lower<Type>> {
   inline static uint32_t TransformCodepoint(uint32_t codepoint) {
     return codepoint <= kMaxCodepointLookup ? lut_lower_codepoint[codepoint]
                                             : utf8proc_tolower(codepoint);
   }
 };

 #else

 void EnsureLookupTablesFilled() {}

 #endif  // ARROW_WITH_UTF8PROC

 using TransformFunc = std::function<void(const uint8_t*, int64_t, uint8_t*)>;

 // Transform a buffer of offsets to one which begins with 0 and has same
 // value lengths.
 template <typename T>
 Status GetShiftedOffsets(KernelContext* ctx, const Buffer& input_buffer, int64_t offset,
                          int64_t length, std::shared_ptr<Buffer>* out) {
   ARROW_ASSIGN_OR_RAISE(*out, ctx->Allocate((length + 1) * sizeof(T)));
   const T* input_offsets = reinterpret_cast<const T*>(input_buffer.data()) + offset;
   T* out_offsets = reinterpret_cast<T*>((*out)->mutable_data());
   T first_offset = *input_offsets;
   for (int64_t i = 0; i < length; ++i) {
     *out_offsets++ = input_offsets[i] - first_offset;
   }
   *out_offsets = input_offsets[length] - first_offset;
   return Status::OK();
 }

 // Apply `transform` to input character data- this function cannot change the
 // length
 template <typename Type>
 void StringDataTransform(KernelContext* ctx, const ExecBatch& batch,
                          TransformFunc transform, Datum* out) {
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   using offset_type = typename Type::offset_type;

   if (batch[0].kind() == Datum::ARRAY) {
     const ArrayData& input = *batch[0].array();
     ArrayType input_boxed(batch[0].array());

     ArrayData* out_arr = out->mutable_array();

     if (input.offset == 0) {
       // We can reuse offsets from input
       out_arr->buffers[1] = input.buffers[1];
     } else {
       DCHECK(input.buffers[1]);
       // We must allocate new space for the offsets and shift the existing offsets
       KERNEL_RETURN_IF_ERROR(
           ctx, GetShiftedOffsets<offset_type>(ctx, *input.buffers[1], input.offset,
                                               input.length, &out_arr->buffers[1]));
     }

     // Allocate space for output data
     int64_t data_nbytes = input_boxed.total_values_length();
     KERNEL_RETURN_IF_ERROR(ctx, ctx->Allocate(data_nbytes).Value(&out_arr->buffers[2]));
     if (input.length > 0) {
       transform(input.buffers[2]->data() + input_boxed.value_offset(0), data_nbytes,
                 out_arr->buffers[2]->mutable_data());
     }
   } else {
     const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
     auto result = checked_pointer_cast<BaseBinaryScalar>(MakeNullScalar(out->type()));
     if (input.is_valid) {
       result->is_valid = true;
       int64_t data_nbytes = input.value->size();
       KERNEL_RETURN_IF_ERROR(ctx, ctx->Allocate(data_nbytes).Value(&result->value));
       transform(input.value->data(), data_nbytes, result->value->mutable_data());
     }
     out->value = result;
   }
 }

 void TransformAsciiUpper(const uint8_t* input, int64_t length, uint8_t* output) {
   std::transform(input, input + length, output, ascii_toupper);
 }

 template <typename Type>
 struct AsciiUpper {
   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     StringDataTransform<Type>(ctx, batch, TransformAsciiUpper, out);
   }
 };

 void TransformAsciiLower(const uint8_t* input, int64_t length, uint8_t* output) {
   std::transform(input, input + length, output, ascii_tolower);
 }

 template <typename Type>
 struct AsciiLower {
   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     StringDataTransform<Type>(ctx, batch, TransformAsciiLower, out);
   }
 };

 // ----------------------------------------------------------------------
 // exact pattern detection

 using StrToBoolTransformFunc =
     std::function<void(const void*, const uint8_t*, int64_t, int64_t, uint8_t*)>;

 // Apply `transform` to input character data- this function cannot change the
 // length
 template <typename Type>
 void StringBoolTransform(KernelContext* ctx, const ExecBatch& batch,
                          StrToBoolTransformFunc transform, Datum* out) {
   using offset_type = typename Type::offset_type;

   if (batch[0].kind() == Datum::ARRAY) {
     const ArrayData& input = *batch[0].array();
     ArrayData* out_arr = out->mutable_array();
     if (input.length > 0) {
       transform(
           reinterpret_cast<const offset_type*>(input.buffers[1]->data()) + input.offset,
           input.buffers[2]->data(), input.length, out_arr->offset,
           out_arr->buffers[1]->mutable_data());
     }
   } else {
     const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
     if (input.is_valid) {
       uint8_t result_value = 0;
       std::array<offset_type, 2> offsets{0,
                                          static_cast<offset_type>(input.value->size())};
       transform(offsets.data(), input.value->data(), 1, /*output_offset=*/0,
                 &result_value);
       out->value = std::make_shared<BooleanScalar>(result_value > 0);
     }
   }
 }

 using MatchSubstringState = OptionsWrapper<MatchSubstringOptions>;

 template <typename Type, typename Matcher>
 struct MatchSubstring {
   using offset_type = typename Type::offset_type;
   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     // TODO Cache matcher across invocations (for regex compilation)
     Matcher matcher(ctx, MatchSubstringState::Get(ctx));
     if (ctx->HasError()) return;
     StringBoolTransform<Type>(
         ctx, batch,
         [&matcher](const void* raw_offsets, const uint8_t* data, int64_t length,
                    int64_t output_offset, uint8_t* output) {
           const offset_type* offsets = reinterpret_cast<const offset_type*>(raw_offsets);
           FirstTimeBitmapWriter bitmap_writer(output, output_offset, length);
           for (int64_t i = 0; i < length; ++i) {
             const char* current_data = reinterpret_cast<const char*>(data + offsets[i]);
             int64_t current_length = offsets[i + 1] - offsets[i];
             if (matcher.Match(util::string_view(current_data, current_length))) {
               bitmap_writer.Set();
             }
             bitmap_writer.Next();
           }
           bitmap_writer.Finish();
         },
         out);
   }
 };

 // This is an implementation of the Knuth-Morris-Pratt algorithm
 struct PlainSubstringMatcher {
   const MatchSubstringOptions& options_;
   std::vector<int64_t> prefix_table;

   PlainSubstringMatcher(KernelContext* ctx, const MatchSubstringOptions& options)
       : options_(options) {
     // Phase 1: Build the prefix table
     const auto pattern_length = options_.pattern.size();
     prefix_table.resize(pattern_length + 1, /*value=*/0);
     int64_t prefix_length = -1;
     prefix_table[0] = -1;
     for (size_t pos = 0; pos < pattern_length; ++pos) {
       // The prefix cannot be expanded, reset.
       while (prefix_length >= 0 &&
              options_.pattern[pos] != options_.pattern[prefix_length]) {
         prefix_length = prefix_table[prefix_length];
       }
       prefix_length++;
       prefix_table[pos + 1] = prefix_length;
     }
   }

   bool Match(util::string_view current) {
     // Phase 2: Find the prefix in the data
     const auto pattern_length = options_.pattern.size();
     int64_t pattern_pos = 0;
     for (const auto c : current) {
       while ((pattern_pos >= 0) && (options_.pattern[pattern_pos] != c)) {
         pattern_pos = prefix_table[pattern_pos];
       }
       pattern_pos++;
       if (static_cast<size_t>(pattern_pos) == pattern_length) {
         return true;
       }
     }
     return false;
   }
 };

 const FunctionDoc match_substring_doc(
     "Match strings against literal pattern",
     ("For each string in `strings`, emit true iff it contains a given pattern.\n"
      "Null inputs emit null.  The pattern must be given in MatchSubstringOptions."),
     {"strings"}, "MatchSubstringOptions");

 #ifdef ARROW_WITH_RE2
 struct RegexSubstringMatcher {
   const MatchSubstringOptions& options_;
   const RE2 regex_match_;

   RegexSubstringMatcher(KernelContext* ctx, const MatchSubstringOptions& options)
       : options_(options), regex_match_(options_.pattern) {
     if (!regex_match_.ok()) {
       ctx->SetStatus(Status::Invalid("Regular expression error"));
     }
   }

   bool Match(util::string_view current) {
     auto piece = re2::StringPiece(current.data(), current.length());
     return re2::RE2::PartialMatch(piece, regex_match_);
   }
 };

 const FunctionDoc match_substring_regex_doc(
     "Match strings against regex pattern",
     ("For each string in `strings`, emit true iff it matches a given pattern at any "
      "position.\n"
      "Null inputs emit null.  The pattern must be given in MatchSubstringOptions."),
     {"strings"}, "MatchSubstringOptions");
 #endif

 void AddMatchSubstring(FunctionRegistry* registry) {
   {
     auto func = std::make_shared<ScalarFunction>("match_substring", Arity::Unary(),
                                                  &match_substring_doc);
     auto exec_32 = MatchSubstring<StringType, PlainSubstringMatcher>::Exec;
     auto exec_64 = MatchSubstring<LargeStringType, PlainSubstringMatcher>::Exec;
     DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
     DCHECK_OK(
         func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
     DCHECK_OK(registry->AddFunction(std::move(func)));
   }
 #ifdef ARROW_WITH_RE2
   {
     auto func = std::make_shared<ScalarFunction>("match_substring_regex", Arity::Unary(),
                                                  &match_substring_regex_doc);
     auto exec_32 = MatchSubstring<StringType, RegexSubstringMatcher>::Exec;
     auto exec_64 = MatchSubstring<LargeStringType, RegexSubstringMatcher>::Exec;
     DCHECK_OK(func->AddKernel({utf8()}, boolean(), exec_32, MatchSubstringState::Init));
     DCHECK_OK(
         func->AddKernel({large_utf8()}, boolean(), exec_64, MatchSubstringState::Init));
     DCHECK_OK(registry->AddFunction(std::move(func)));
   }
 #endif
 }

 // IsAlpha/Digit etc

 #ifdef ARROW_WITH_UTF8PROC

 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask) {
   utf8proc_category_t general_category = codepoint <= kMaxCodepointLookup
                                              ? lut_category[codepoint]
                                              : utf8proc_category(codepoint);
   uint32_t general_category_bit = 1 << general_category;
   // for e.g. undefined (but valid) codepoints, general_category == 0 ==
   // UTF8PROC_CATEGORY_CN
   return (general_category != UTF8PROC_CATEGORY_CN) &&
          ((general_category_bit & mask) != 0);
 }

 template <typename... Categories>
 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint, uint32_t mask,
                                                 utf8proc_category_t category,
                                                 Categories... categories) {
   return HasAnyUnicodeGeneralCategory(codepoint, mask | (1 << category), categories...);
 }

 template <typename... Categories>
 static inline bool HasAnyUnicodeGeneralCategory(uint32_t codepoint,
                                                 utf8proc_category_t category,
                                                 Categories... categories) {
   return HasAnyUnicodeGeneralCategory(codepoint, static_cast<uint32_t>(1u << category),
                                       categories...);
 }

 static inline bool IsCasedCharacterUnicode(uint32_t codepoint) {
   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
                                       UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT) ||
          ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) != codepoint) ||
           (static_cast<uint32_t>(utf8proc_tolower(codepoint)) != codepoint));
 }

 static inline bool IsLowerCaseCharacterUnicode(uint32_t codepoint) {
   // although this trick seems to work for upper case, this is not enough for lower case
   // testing, see https://github.com/JuliaStrings/utf8proc/issues/195 . But currently the
   // best we can do
   return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LL) ||
           ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) != codepoint) &&
            (static_cast<uint32_t>(utf8proc_tolower(codepoint)) == codepoint))) &&
          !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
 }

 static inline bool IsUpperCaseCharacterUnicode(uint32_t codepoint) {
   // this seems to be a good workaround for utf8proc not having case information
   // https://github.com/JuliaStrings/utf8proc/issues/195
   return (HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU) ||
           ((static_cast<uint32_t>(utf8proc_toupper(codepoint)) == codepoint) &&
            (static_cast<uint32_t>(utf8proc_tolower(codepoint)) != codepoint))) &&
          !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LT);
 }

 static inline bool IsAlphaNumericCharacterUnicode(uint32_t codepoint) {
   return HasAnyUnicodeGeneralCategory(
       codepoint, UTF8PROC_CATEGORY_LU, UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
       UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO, UTF8PROC_CATEGORY_ND,
       UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO);
 }

 static inline bool IsAlphaCharacterUnicode(uint32_t codepoint) {
   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_LU,
                                       UTF8PROC_CATEGORY_LL, UTF8PROC_CATEGORY_LT,
                                       UTF8PROC_CATEGORY_LM, UTF8PROC_CATEGORY_LO);
 }

 static inline bool IsDecimalCharacterUnicode(uint32_t codepoint) {
   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
 }

 static inline bool IsDigitCharacterUnicode(uint32_t codepoint) {
   // Python defines this as Numeric_Type=Digit or Numeric_Type=Decimal.
   // utf8proc has no support for this, this is the best we can do:
   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND);
 }

 static inline bool IsNumericCharacterUnicode(uint32_t codepoint) {
   // Formally this is not correct, but utf8proc does not allow us to query for Numerical
   // properties, e.g. Numeric_Value and Numeric_Type
   // Python defines Numeric as Numeric_Type=Digit, Numeric_Type=Decimal or
   // Numeric_Type=Numeric.
   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ND,
                                       UTF8PROC_CATEGORY_NL, UTF8PROC_CATEGORY_NO);
 }

 static inline bool IsSpaceCharacterUnicode(uint32_t codepoint) {
   auto property = utf8proc_get_property(codepoint);
   return HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_ZS) ||
          property->bidi_class == UTF8PROC_BIDI_CLASS_WS ||
          property->bidi_class == UTF8PROC_BIDI_CLASS_B ||
          property->bidi_class == UTF8PROC_BIDI_CLASS_S;
 }

 static inline bool IsPrintableCharacterUnicode(uint32_t codepoint) {
   uint32_t general_category = utf8proc_category(codepoint);
   return (general_category != UTF8PROC_CATEGORY_CN) &&
          !HasAnyUnicodeGeneralCategory(codepoint, UTF8PROC_CATEGORY_CC,
                                        UTF8PROC_CATEGORY_CF, UTF8PROC_CATEGORY_CS,
                                        UTF8PROC_CATEGORY_CO, UTF8PROC_CATEGORY_ZS,
                                        UTF8PROC_CATEGORY_ZL, UTF8PROC_CATEGORY_ZP);
 }

 #endif

 static inline bool IsLowerCaseCharacterAscii(uint8_t ascii_character) {
   return (ascii_character >= 'a') && (ascii_character <= 'z');
 }

 static inline bool IsUpperCaseCharacterAscii(uint8_t ascii_character) {
   return (ascii_character >= 'A') && (ascii_character <= 'Z');
 }

 static inline bool IsCasedCharacterAscii(uint8_t ascii_character) {
   return IsLowerCaseCharacterAscii(ascii_character) ||
          IsUpperCaseCharacterAscii(ascii_character);
 }

 static inline bool IsAlphaCharacterAscii(uint8_t ascii_character) {
   return IsCasedCharacterAscii(ascii_character);  // same
 }

 static inline bool IsAlphaNumericCharacterAscii(uint8_t ascii_character) {
   return ((ascii_character >= '0') && (ascii_character <= '9')) ||
          ((ascii_character >= 'a') && (ascii_character <= 'z')) ||
          ((ascii_character >= 'A') && (ascii_character <= 'Z'));
 }

 static inline bool IsDecimalCharacterAscii(uint8_t ascii_character) {
   return ((ascii_character >= '0') && (ascii_character <= '9'));
 }

 static inline bool IsSpaceCharacterAscii(uint8_t ascii_character) {
   return ((ascii_character >= 0x09) && (ascii_character <= 0x0D)) ||
          (ascii_character == ' ');
 }

 static inline bool IsPrintableCharacterAscii(uint8_t ascii_character) {
   return ((ascii_character >= ' ') && (ascii_character <= '~'));
 }

 template <typename Derived, bool allow_empty = false>
 struct CharacterPredicateUnicode {
   static bool Call(KernelContext* ctx, const uint8_t* input,
                    size_t input_string_ncodeunits) {
     if (allow_empty && input_string_ncodeunits == 0) {
       return true;
     }
     bool all;
     bool any = false;
     if (!ARROW_PREDICT_TRUE(arrow::util::UTF8AllOf(
             input, input + input_string_ncodeunits, &all, [&any](uint32_t codepoint) {
               any |= Derived::PredicateCharacterAny(codepoint);
               return Derived::PredicateCharacterAll(codepoint);
             }))) {
       ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
       return false;
     }
     return all & any;
   }

   static inline bool PredicateCharacterAny(uint32_t) {
     return true;  // default condition make sure there is at least 1 charachter
   }
 };

 template <typename Derived, bool allow_empty = false>
 struct CharacterPredicateAscii {
   static bool Call(KernelContext* ctx, const uint8_t* input,
                    size_t input_string_ncodeunits) {
     if (allow_empty && input_string_ncodeunits == 0) {
       return true;
     }
     bool any = false;
     // MB: A simple for loops seems 8% faster on gcc 9.3, running the IsAlphaNumericAscii
     // benchmark. I don't consider that worth it.
     bool all = std::all_of(input, input + input_string_ncodeunits,
                            [&any](uint8_t ascii_character) {
                              any |= Derived::PredicateCharacterAny(ascii_character);
                              return Derived::PredicateCharacterAll(ascii_character);
                            });
     return all & any;
   }

   static inline bool PredicateCharacterAny(uint8_t) {
     return true;  // default condition make sure there is at least 1 charachter
   }
 };

 #ifdef ARROW_WITH_UTF8PROC
 struct IsAlphaNumericUnicode : CharacterPredicateUnicode<IsAlphaNumericUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsAlphaNumericCharacterUnicode(codepoint);
   }
 };
 #endif

 struct IsAlphaNumericAscii : CharacterPredicateAscii<IsAlphaNumericAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsAlphaNumericCharacterAscii(ascii_character);
   }
 };

 #ifdef ARROW_WITH_UTF8PROC
 struct IsAlphaUnicode : CharacterPredicateUnicode<IsAlphaUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsAlphaCharacterUnicode(codepoint);
   }
 };
 #endif

 struct IsAlphaAscii : CharacterPredicateAscii<IsAlphaAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsAlphaCharacterAscii(ascii_character);
   }
 };

 #ifdef ARROW_WITH_UTF8PROC
 struct IsDecimalUnicode : CharacterPredicateUnicode<IsDecimalUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsDecimalCharacterUnicode(codepoint);
   }
 };
 #endif

 struct IsDecimalAscii : CharacterPredicateAscii<IsDecimalAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsDecimalCharacterAscii(ascii_character);
   }
 };

 #ifdef ARROW_WITH_UTF8PROC
 struct IsDigitUnicode : CharacterPredicateUnicode<IsDigitUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsDigitCharacterUnicode(codepoint);
   }
 };

 struct IsNumericUnicode : CharacterPredicateUnicode<IsNumericUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsNumericCharacterUnicode(codepoint);
   }
 };
 #endif

 struct IsAscii {
   static bool Call(KernelContext* ctx, const uint8_t* input,
                    size_t input_string_nascii_characters) {
     return std::all_of(input, input + input_string_nascii_characters,
                        IsAsciiCharacter<uint8_t>);
   }
 };

 #ifdef ARROW_WITH_UTF8PROC
 struct IsLowerUnicode : CharacterPredicateUnicode<IsLowerUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterUnicode(codepoint) || IsLowerCaseCharacterUnicode(codepoint);
   }
   static inline bool PredicateCharacterAny(uint32_t codepoint) {
     return IsCasedCharacterUnicode(codepoint);  // at least 1 cased character
   }
 };
 #endif

 struct IsLowerAscii : CharacterPredicateAscii<IsLowerAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterAscii(ascii_character) ||
            IsLowerCaseCharacterAscii(ascii_character);
   }
   static inline bool PredicateCharacterAny(uint8_t ascii_character) {
     return IsCasedCharacterAscii(ascii_character);  // at least 1 cased character
   }
 };

 #ifdef ARROW_WITH_UTF8PROC
 struct IsPrintableUnicode
     : CharacterPredicateUnicode<IsPrintableUnicode, /*allow_empty=*/true> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return codepoint == ' ' || IsPrintableCharacterUnicode(codepoint);
   }
 };
 #endif

 struct IsPrintableAscii
     : CharacterPredicateAscii<IsPrintableAscii, /*allow_empty=*/true> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsPrintableCharacterAscii(ascii_character);
   }
 };

 #ifdef ARROW_WITH_UTF8PROC
 struct IsSpaceUnicode : CharacterPredicateUnicode<IsSpaceUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     return IsSpaceCharacterUnicode(codepoint);
   }
 };
 #endif

 struct IsSpaceAscii : CharacterPredicateAscii<IsSpaceAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     return IsSpaceCharacterAscii(ascii_character);
   }
 };

 #ifdef ARROW_WITH_UTF8PROC
 struct IsTitleUnicode {
   static bool Call(KernelContext* ctx, const uint8_t* input,
                    size_t input_string_ncodeunits) {
     // rules:
     // * 1: lower case follows cased
     // * 2: upper case follows uncased
     // * 3: at least 1 cased character (which logically should be upper/title)
     bool rules_1_and_2;
     bool previous_cased = false;  // in LL, LU or LT
     bool rule_3 = false;
     bool status =
         arrow::util::UTF8AllOf(input, input + input_string_ncodeunits, &rules_1_and_2,
                                [&previous_cased, &rule_3](uint32_t codepoint) {
                                  if (IsLowerCaseCharacterUnicode(codepoint)) {
                                    if (!previous_cased) return false;  // rule 1 broken
                                    previous_cased = true;
                                  } else if (IsCasedCharacterUnicode(codepoint)) {
                                    if (previous_cased) return false;  // rule 2 broken
                                    // next should be a lower case or uncased
                                    previous_cased = true;
                                    rule_3 = true;  // rule 3 obeyed
                                  } else {
                                    // a non-cased char, like _ or 1
                                    // next should be upper case or more uncased
                                    previous_cased = false;
                                  }
                                  return true;
                                });
     if (!ARROW_PREDICT_TRUE(status)) {
       ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
       return false;
     }
     return rules_1_and_2 & rule_3;
   }
 };
 #endif

 struct IsTitleAscii {
   static bool Call(KernelContext* ctx, const uint8_t* input,
                    size_t input_string_ncodeunits) {
     // rules:
     // * 1: lower case follows cased
     // * 2: upper case follows uncased
     // * 3: at least 1 cased character (which logically should be upper/title)
     bool rules_1_and_2 = true;
     bool previous_cased = false;  // in LL, LU or LT
     bool rule_3 = false;
     // we cannot rely on std::all_of because we need guaranteed order
     for (const uint8_t* c = input; c < input + input_string_ncodeunits; ++c) {
       if (IsLowerCaseCharacterAscii(*c)) {
         if (!previous_cased) {
           // rule 1 broken
           rules_1_and_2 = false;
           break;
         }
         previous_cased = true;
       } else if (IsCasedCharacterAscii(*c)) {
         if (previous_cased) {
           // rule 2 broken
           rules_1_and_2 = false;
           break;
         }
         // next should be a lower case or uncased
         previous_cased = true;
         rule_3 = true;  // rule 3 obeyed
       } else {
         // a non-cased char, like _ or 1
         // next should be upper case or more uncased
         previous_cased = false;
       }
     }
     return rules_1_and_2 & rule_3;
   }
 };

 #ifdef ARROW_WITH_UTF8PROC
 struct IsUpperUnicode : CharacterPredicateUnicode<IsUpperUnicode> {
   static inline bool PredicateCharacterAll(uint32_t codepoint) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterUnicode(codepoint) || IsUpperCaseCharacterUnicode(codepoint);
   }
   static inline bool PredicateCharacterAny(uint32_t codepoint) {
     return IsCasedCharacterUnicode(codepoint);  // at least 1 cased character
   }
 };
 #endif

 struct IsUpperAscii : CharacterPredicateAscii<IsUpperAscii> {
   static inline bool PredicateCharacterAll(uint8_t ascii_character) {
     // Only for cased character it needs to be lower case
     return !IsCasedCharacterAscii(ascii_character) ||
            IsUpperCaseCharacterAscii(ascii_character);
   }
   static inline bool PredicateCharacterAny(uint8_t ascii_character) {
     return IsCasedCharacterAscii(ascii_character);  // at least 1 cased character
   }
 };

 // splitting

 template <typename Type, typename ListType, typename Options, typename Derived>
 struct SplitBaseTransform {
   using string_offset_type = typename Type::offset_type;
   using list_offset_type = typename ListType::offset_type;
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   using ArrayListType = typename TypeTraits<ListType>::ArrayType;
   using ListScalarType = typename TypeTraits<ListType>::ScalarType;
   using ScalarType = typename TypeTraits<Type>::ScalarType;
   using BuilderType = typename TypeTraits<Type>::BuilderType;
   using ListOffsetsBuilderType = TypedBufferBuilder<list_offset_type>;
   using State = OptionsWrapper<Options>;

   std::vector<util::string_view> parts;
   Options options;

   explicit SplitBaseTransform(Options options) : options(options) {}

   Status Split(const util::string_view& s, BuilderType* builder) {
     const uint8_t* begin = reinterpret_cast<const uint8_t*>(s.data());
     const uint8_t* end = begin + s.length();

     int64_t max_splits = options.max_splits;
     // if there is no max splits, reversing does not make sense (and is probably less
     // efficient), but is useful for testing
     if (options.reverse) {
       // note that i points 1 further than the 'current'
       const uint8_t* i = end;
       // we will record the parts in reverse order
       parts.clear();
       if (max_splits > -1) {
         parts.reserve(max_splits + 1);
       }
       while (max_splits != 0) {
         const uint8_t *separator_begin, *separator_end;
         // find with whatever algo the part we will 'cut out'
         if (static_cast<Derived&>(*this).FindReverse(begin, i, &separator_begin,
                                                      &separator_end, options)) {
           parts.emplace_back(reinterpret_cast<const char*>(separator_end),
                              i - separator_end);
           i = separator_begin;
           max_splits--;
         } else {
           // if we cannot find a separator, we're done
           break;
         }
       }
       parts.emplace_back(reinterpret_cast<const char*>(begin), i - begin);
       // now we do the copying
       for (auto it = parts.rbegin(); it != parts.rend(); ++it) {
         RETURN_NOT_OK(builder->Append(*it));
       }
     } else {
       const uint8_t* i = begin;
       while (max_splits != 0) {
         const uint8_t *separator_begin, *separator_end;
         // find with whatever algo the part we will 'cut out'
         if (static_cast<Derived&>(*this).Find(i, end, &separator_begin, &separator_end,
                                               options)) {
           // the part till the beginning of the 'cut'
           RETURN_NOT_OK(
               builder->Append(i, static_cast<string_offset_type>(separator_begin - i)));
           i = separator_end;
           max_splits--;
         } else {
           // if we cannot find a separator, we're done
           break;
         }
       }
       // trailing part
       RETURN_NOT_OK(builder->Append(i, static_cast<string_offset_type>(end - i)));
     }
     return Status::OK();
   }

   static Status CheckOptions(const Options& options) { return Status::OK(); }

   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     Options options = State::Get(ctx);
     Derived splitter(options);  // we make an instance to reuse the parts vectors
     splitter.Split(ctx, batch, out);
   }

   void Split(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     EnsureLookupTablesFilled();  // only needed for unicode
     KERNEL_RETURN_IF_ERROR(ctx, Derived::CheckOptions(options));

     if (batch[0].kind() == Datum::ARRAY) {
       const ArrayData& input = *batch[0].array();
       ArrayType input_boxed(batch[0].array());

       BuilderType builder(input.type, ctx->memory_pool());
       // a slight overestimate of the data needed
       KERNEL_RETURN_IF_ERROR(ctx, builder.ReserveData(input_boxed.total_values_length()));
       // the minimum amount of strings needed
       KERNEL_RETURN_IF_ERROR(ctx, builder.Resize(input.length));

       ArrayData* output_list = out->mutable_array();
       // list offsets were preallocated
       auto* list_offsets = output_list->GetMutableValues<list_offset_type>(1);
       DCHECK_NE(list_offsets, nullptr);
       // initial value
       *list_offsets++ = 0;
       KERNEL_RETURN_IF_ERROR(
           ctx,
           VisitArrayDataInline<Type>(
               input,
               [&](util::string_view s) {
                 RETURN_NOT_OK(Split(s, &builder));
                 if (ARROW_PREDICT_FALSE(builder.length() >
                                         std::numeric_limits<list_offset_type>::max())) {
                   return Status::CapacityError("List offset does not fit into 32 bit");
                 }
                 *list_offsets++ = static_cast<list_offset_type>(builder.length());
                 return Status::OK();
               },
               [&]() {
                 // null value is already taken from input
                 *list_offsets++ = static_cast<list_offset_type>(builder.length());
                 return Status::OK();
               }));
       // assign list child data
       std::shared_ptr<Array> string_array;
       KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&string_array));
       output_list->child_data.push_back(string_array->data());

     } else {
       const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
       auto result = checked_pointer_cast<ListScalarType>(MakeNullScalar(out->type()));
       if (input.is_valid) {
         result->is_valid = true;
         BuilderType builder(input.type, ctx->memory_pool());
         util::string_view s(*input.value);
         KERNEL_RETURN_IF_ERROR(ctx, Split(s, &builder));
         KERNEL_RETURN_IF_ERROR(ctx, builder.Finish(&result->value));
       }
       out->value = result;
     }
   }
 };

 template <typename Type, typename ListType>
 struct SplitPatternTransform : SplitBaseTransform<Type, ListType, SplitPatternOptions,
                                                   SplitPatternTransform<Type, ListType>> {
   using Base = SplitBaseTransform<Type, ListType, SplitPatternOptions,
                                   SplitPatternTransform<Type, ListType>>;
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   using ScalarType = typename TypeTraits<Type>::ScalarType;
   using string_offset_type = typename Type::offset_type;
   using Base::Base;

   static Status CheckOptions(const SplitPatternOptions& options) {
     if (options.pattern.length() == 0) {
       return Status::Invalid("Empty separator");
     }
     return Status::OK();
   }

   static bool Find(const uint8_t* begin, const uint8_t* end,
                    const uint8_t** separator_begin, const uint8_t** separator_end,
                    const SplitPatternOptions& options) {
     const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
     const int64_t pattern_length = options.pattern.length();
     const uint8_t* i = begin;
     // this is O(n*m) complexity, we could use the Knuth-Morris-Pratt algorithm used in
     // the match kernel
     while ((i + pattern_length <= end)) {
       i = std::search(i, end, pattern, pattern + pattern_length);
       if (i != end) {
         *separator_begin = i;
         *separator_end = i + pattern_length;
         return true;
       }
     }
     return false;
   }

   static bool FindReverse(const uint8_t* begin, const uint8_t* end,
                           const uint8_t** separator_begin, const uint8_t** separator_end,
                           const SplitPatternOptions& options) {
     const uint8_t* pattern = reinterpret_cast<const uint8_t*>(options.pattern.c_str());
     const int64_t pattern_length = options.pattern.length();
     // this is O(n*m) complexity, we could use the Knuth-Morris-Pratt algorithm used in
     // the match kernel
     std::reverse_iterator<const uint8_t*> ri(end);
     std::reverse_iterator<const uint8_t*> rend(begin);
     std::reverse_iterator<const uint8_t*> pattern_rbegin(pattern + pattern_length);
     std::reverse_iterator<const uint8_t*> pattern_rend(pattern);
     while (begin <= ri.base() - pattern_length) {
       ri = std::search(ri, rend, pattern_rbegin, pattern_rend);
       if (ri != rend) {
         *separator_begin = ri.base() - pattern_length;
         *separator_end = ri.base();
         return true;
       }
     }
     return false;
   }
 };

 const FunctionDoc split_pattern_doc(
     "Split string according to separator",
     ("Split each string according to the exact `pattern` defined in\n"
      "SplitPatternOptions.  The output for each string input is a list\n"
      "of strings.\n"
      "\n"
      "The maximum number of splits and direction of splitting\n"
      "(forward, reverse) can optionally be defined in SplitPatternOptions."),
     {"strings"}, "SplitPatternOptions");

 const FunctionDoc ascii_split_whitespace_doc(
     "Split string according to any ASCII whitespace",
     ("Split each string according any non-zero length sequence of ASCII\n"
      "whitespace characters.  The output for each string input is a list\n"
      "of strings.\n"
      "\n"
      "The maximum number of splits and direction of splitting\n"
      "(forward, reverse) can optionally be defined in SplitOptions."),
     {"strings"}, "SplitOptions");

 const FunctionDoc utf8_split_whitespace_doc(
     "Split string according to any Unicode whitespace",
     ("Split each string according any non-zero length sequence of Unicode\n"
      "whitespace characters.  The output for each string input is a list\n"
      "of strings.\n"
      "\n"
      "The maximum number of splits and direction of splitting\n"
      "(forward, reverse) can optionally be defined in SplitOptions."),
     {"strings"}, "SplitOptions");

 void AddSplitPattern(FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>("split_pattern", Arity::Unary(),
                                                &split_pattern_doc);
   using t32 = SplitPatternTransform<StringType, ListType>;
   using t64 = SplitPatternTransform<LargeStringType, ListType>;
   DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
   DCHECK_OK(
       func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }

 template <typename Type, typename ListType>
 struct SplitWhitespaceAsciiTransform
     : SplitBaseTransform<Type, ListType, SplitOptions,
                          SplitWhitespaceAsciiTransform<Type, ListType>> {
   using Base = SplitBaseTransform<Type, ListType, SplitOptions,
                                   SplitWhitespaceAsciiTransform<Type, ListType>>;
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   using ScalarType = typename TypeTraits<Type>::ScalarType;
   using string_offset_type = typename Type::offset_type;
   using Base::Base;
   static bool Find(const uint8_t* begin, const uint8_t* end,
                    const uint8_t** separator_begin, const uint8_t** separator_end,
                    const SplitOptions& options) {
     const uint8_t* i = begin;
     while ((i < end)) {
       if (IsSpaceCharacterAscii(*i)) {
         *separator_begin = i;
         do {
           i++;
         } while (IsSpaceCharacterAscii(*i) && i < end);
         *separator_end = i;
         return true;
       }
       i++;
     }
     return false;
   }
   static bool FindReverse(const uint8_t* begin, const uint8_t* end,
                           const uint8_t** separator_begin, const uint8_t** separator_end,
                           const SplitOptions& options) {
     const uint8_t* i = end - 1;
     while ((i >= begin)) {
       if (IsSpaceCharacterAscii(*i)) {
         *separator_end = i + 1;
         do {
           i--;
         } while (IsSpaceCharacterAscii(*i) && i >= begin);
         *separator_begin = i + 1;
         return true;
       }
       i--;
     }
     return false;
   }
 };

 void AddSplitWhitespaceAscii(FunctionRegistry* registry) {
   static const SplitOptions default_options{};
   auto func =
       std::make_shared<ScalarFunction>("ascii_split_whitespace", Arity::Unary(),
                                        &ascii_split_whitespace_doc, &default_options);
   using t32 = SplitWhitespaceAsciiTransform<StringType, ListType>;
   using t64 = SplitWhitespaceAsciiTransform<LargeStringType, ListType>;
   DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
   DCHECK_OK(
       func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }

 #ifdef ARROW_WITH_UTF8PROC
 template <typename Type, typename ListType>
 struct SplitWhitespaceUtf8Transform
     : SplitBaseTransform<Type, ListType, SplitOptions,
                          SplitWhitespaceUtf8Transform<Type, ListType>> {
   using Base = SplitBaseTransform<Type, ListType, SplitOptions,
                                   SplitWhitespaceUtf8Transform<Type, ListType>>;
   using ArrayType = typename TypeTraits<Type>::ArrayType;
   using string_offset_type = typename Type::offset_type;
   using ScalarType = typename TypeTraits<Type>::ScalarType;
   using Base::Base;
   static bool Find(const uint8_t* begin, const uint8_t* end,
                    const uint8_t** separator_begin, const uint8_t** separator_end,
                    const SplitOptions& options) {
     const uint8_t* i = begin;
     while ((i < end)) {
       uint32_t codepoint = 0;
       *separator_begin = i;
       if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
         return false;
       }
       if (IsSpaceCharacterUnicode(codepoint)) {
         do {
           *separator_end = i;
           if (ARROW_PREDICT_FALSE(!arrow::util::UTF8Decode(&i, &codepoint))) {
             return false;
           }
         } while (IsSpaceCharacterUnicode(codepoint) && i < end);
         return true;
       }
     }
     return false;
   }
   static bool FindReverse(const uint8_t* begin, const uint8_t* end,
                           const uint8_t** separator_begin, const uint8_t** separator_end,
                           const SplitOptions& options) {
     const uint8_t* i = end - 1;
     while ((i >= begin)) {
       uint32_t codepoint = 0;
       *separator_end = i + 1;
       if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
         return false;
       }
       if (IsSpaceCharacterUnicode(codepoint)) {
         do {
           *separator_begin = i + 1;
           if (ARROW_PREDICT_FALSE(!arrow::util::UTF8DecodeReverse(&i, &codepoint))) {
             return false;
           }
         } while (IsSpaceCharacterUnicode(codepoint) && i >= begin);
         return true;
       }
     }
     return false;
   }
 };

 void AddSplitWhitespaceUTF8(FunctionRegistry* registry) {
   static const SplitOptions default_options{};
   auto func =
       std::make_shared<ScalarFunction>("utf8_split_whitespace", Arity::Unary(),
                                        &utf8_split_whitespace_doc, &default_options);
   using t32 = SplitWhitespaceUtf8Transform<StringType, ListType>;
   using t64 = SplitWhitespaceUtf8Transform<LargeStringType, ListType>;
   DCHECK_OK(func->AddKernel({utf8()}, {list(utf8())}, t32::Exec, t32::State::Init));
   DCHECK_OK(
       func->AddKernel({large_utf8()}, {list(large_utf8())}, t64::Exec, t64::State::Init));
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }
 #endif

 void AddSplit(FunctionRegistry* registry) {
   AddSplitPattern(registry);
   AddSplitWhitespaceAscii(registry);
 #ifdef ARROW_WITH_UTF8PROC
   AddSplitWhitespaceUTF8(registry);
 #endif
 }

 // ----------------------------------------------------------------------
 // Replace substring (plain, regex)

 template <typename Type, typename Replacer>
 struct ReplaceSubString {
   using ScalarType = typename TypeTraits<Type>::ScalarType;
   using offset_type = typename Type::offset_type;
   using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
   using OffsetBuilder = TypedBufferBuilder<offset_type>;
   using State = OptionsWrapper<ReplaceSubstringOptions>;

   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     // TODO Cache replacer across invocations (for regex compilation)
     Replacer replacer{ctx, State::Get(ctx)};
     if (!ctx->HasError()) {
       Replace(ctx, batch, &replacer, out);
     }
   }

   static void Replace(KernelContext* ctx, const ExecBatch& batch, Replacer* replacer,
                       Datum* out) {
     ValueDataBuilder value_data_builder(ctx->memory_pool());
     OffsetBuilder offset_builder(ctx->memory_pool());

     if (batch[0].kind() == Datum::ARRAY) {
       // We already know how many strings we have, so we can use Reserve/UnsafeAppend
       KERNEL_RETURN_IF_ERROR(ctx, offset_builder.Reserve(batch[0].array()->length));
       offset_builder.UnsafeAppend(0);  // offsets start at 0

       const ArrayData& input = *batch[0].array();
       KERNEL_RETURN_IF_ERROR(
           ctx, VisitArrayDataInline<Type>(
                    input,
                    [&](util::string_view s) {
                      RETURN_NOT_OK(replacer->ReplaceString(s, &value_data_builder));
                      offset_builder.UnsafeAppend(
                          static_cast<offset_type>(value_data_builder.length()));
                      return Status::OK();
                    },
                    [&]() {
                      // offset for null value
                      offset_builder.UnsafeAppend(
                          static_cast<offset_type>(value_data_builder.length()));
                      return Status::OK();
                    }));
       ArrayData* output = out->mutable_array();
       KERNEL_RETURN_IF_ERROR(ctx, value_data_builder.Finish(&output->buffers[2]));
       KERNEL_RETURN_IF_ERROR(ctx, offset_builder.Finish(&output->buffers[1]));
     } else {
       const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
       auto result = std::make_shared<ScalarType>();
       if (input.is_valid) {
         util::string_view s = static_cast<util::string_view>(*input.value);
         KERNEL_RETURN_IF_ERROR(ctx, replacer->ReplaceString(s, &value_data_builder));
         KERNEL_RETURN_IF_ERROR(ctx, value_data_builder.Finish(&result->value));
         result->is_valid = true;
       }
       out->value = result;
     }
   }
 };

 struct PlainSubStringReplacer {
   const ReplaceSubstringOptions& options_;

   PlainSubStringReplacer(KernelContext* ctx, const ReplaceSubstringOptions& options)
       : options_(options) {}

   Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) {
     const char* i = s.begin();
     const char* end = s.end();
     int64_t max_replacements = options_.max_replacements;
     while ((i < end) && (max_replacements != 0)) {
       const char* pos =
           std::search(i, end, options_.pattern.begin(), options_.pattern.end());
       if (pos == end) {
         RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
                                       static_cast<int64_t>(end - i)));
         i = end;
       } else {
         // the string before the pattern
         RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
                                       static_cast<int64_t>(pos - i)));
         // the replacement
         RETURN_NOT_OK(
             builder->Append(reinterpret_cast<const uint8_t*>(options_.replacement.data()),
                             options_.replacement.length()));
         // skip pattern
         i = pos + options_.pattern.length();
         max_replacements--;
       }
     }
     // if we exited early due to max_replacements, add the trailing part
     RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
                                   static_cast<int64_t>(end - i)));
     return Status::OK();
   }
 };

 #ifdef ARROW_WITH_RE2
 struct RegexSubStringReplacer {
   const ReplaceSubstringOptions& options_;
   const RE2 regex_find_;
   const RE2 regex_replacement_;

   // Using RE2::FindAndConsume we can only find the pattern if it is a group, therefore
   // we have 2 regexes, one with () around it, one without.
   RegexSubStringReplacer(KernelContext* ctx, const ReplaceSubstringOptions& options)
       : options_(options),
         regex_find_("(" + options_.pattern + ")"),
         regex_replacement_(options_.pattern) {
     if (!(regex_find_.ok() && regex_replacement_.ok())) {
       ctx->SetStatus(Status::Invalid("Regular expression error"));
       return;
     }
   }

   Status ReplaceString(util::string_view s, TypedBufferBuilder<uint8_t>* builder) {
     re2::StringPiece replacement(options_.replacement);
     if (options_.max_replacements == -1) {
       std::string s_copy(s.to_string());
       re2::RE2::GlobalReplace(&s_copy, regex_replacement_, replacement);
       RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(s_copy.data()),
                                     s_copy.length()));
       return Status::OK();
     }

     // Since RE2 does not have the concept of max_replacements, we have to do some work
     // ourselves.
     // We might do this faster similar to RE2::GlobalReplace using Match and Rewrite
     const char* i = s.begin();
     const char* end = s.end();
     re2::StringPiece piece(s.data(), s.length());

     int64_t max_replacements = options_.max_replacements;
     while ((i < end) && (max_replacements != 0)) {
       std::string found;
       if (!re2::RE2::FindAndConsume(&piece, regex_find_, &found)) {
         RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
                                       static_cast<int64_t>(end - i)));
         i = end;
       } else {
         // wind back to the beginning of the match
         const char* pos = piece.begin() - found.length();
         // the string before the pattern
         RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
                                       static_cast<int64_t>(pos - i)));
         // replace the pattern in what we found
         if (!re2::RE2::Replace(&found, regex_replacement_, replacement)) {
           return Status::Invalid("Regex found, but replacement failed");
         }
         RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(found.data()),
                                       static_cast<int64_t>(found.length())));
         // skip pattern
         i = piece.begin();
         max_replacements--;
       }
     }
     // If we exited early due to max_replacements, add the trailing part
     RETURN_NOT_OK(builder->Append(reinterpret_cast<const uint8_t*>(i),
                                   static_cast<int64_t>(end - i)));
     return Status::OK();
   }
 };
 #endif

 template <typename Type>
 using ReplaceSubStringPlain = ReplaceSubString<Type, PlainSubStringReplacer>;

 const FunctionDoc replace_substring_doc(
     "Replace non-overlapping substrings that match pattern by replacement",
     ("For each string in `strings`, replace non-overlapping substrings that match\n"
      "`pattern` by `replacement`. If `max_replacements != -1`, it determines the\n"
      "maximum amount of replacements made, counting from the left. Null values emit\n"
      "null."),
     {"strings"}, "ReplaceSubstringOptions");

 #ifdef ARROW_WITH_RE2
 template <typename Type>
 using ReplaceSubStringRegex = ReplaceSubString<Type, RegexSubStringReplacer>;

 const FunctionDoc replace_substring_regex_doc(
     "Replace non-overlapping substrings that match regex `pattern` by `replacement`",
     ("For each string in `strings`, replace non-overlapping substrings that match the\n"
      "regular expression `pattern` by `replacement` using the Google RE2 library.\n"
      "If `max_replacements != -1`, it determines the maximum amount of replacements\n"
      "made, counting from the left. Note that if the pattern contains groups,\n"
      "backreferencing macan be used. Null values emit null."),
     {"strings"}, "ReplaceSubstringOptions");
 #endif

 // ----------------------------------------------------------------------
 // strptime string parsing

 using StrptimeState = OptionsWrapper<StrptimeOptions>;

 struct ParseStrptime {
   explicit ParseStrptime(const StrptimeOptions& options)
       : parser(TimestampParser::MakeStrptime(options.format)), unit(options.unit) {}

   template <typename... Ignored>
   int64_t Call(KernelContext* ctx, util::string_view val) const {
     int64_t result = 0;
     if (!(*parser)(val.data(), val.size(), unit, &result)) {
       ctx->SetStatus(Status::Invalid("Failed to parse string: '", val,
                                      "' as a scalar of type ",
                                      TimestampType(unit).ToString()));
     }
     return result;
   }

   std::shared_ptr<TimestampParser> parser;
   TimeUnit::type unit;
 };

 template <typename InputType>
 void StrptimeExec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
   applicator::ScalarUnaryNotNullStateful<TimestampType, InputType, ParseStrptime> kernel{
       ParseStrptime(StrptimeState::Get(ctx))};
   return kernel.Exec(ctx, batch, out);
 }

 Result<ValueDescr> StrptimeResolve(KernelContext* ctx, const std::vector<ValueDescr>&) {
   if (ctx->state()) {
     return ::arrow::timestamp(StrptimeState::Get(ctx).unit);
   }

   return Status::Invalid("strptime does not provide default StrptimeOptions");
 }

 #ifdef ARROW_WITH_UTF8PROC

 template <typename Type, bool left, bool right, typename Derived>
 struct UTF8TrimWhitespaceBase : StringTransform<Type, Derived> {
   using Base = StringTransform<Type, Derived>;
   using offset_type = typename Base::offset_type;
   bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
                  uint8_t* output, offset_type* output_written) {
     const uint8_t* begin = input;
     const uint8_t* end = input + input_string_ncodeunits;
     const uint8_t* end_trimmed = end;
     const uint8_t* begin_trimmed = begin;

     auto predicate = [](uint32_t c) { return !IsSpaceCharacterUnicode(c); };
     if (left && !ARROW_PREDICT_TRUE(
                     arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
       return false;
     }
     if (right && (begin_trimmed < end)) {
       if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
                                                              predicate, &end_trimmed))) {
         return false;
       }
     }
     std::copy(begin_trimmed, end_trimmed, output);
     *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
     return true;
   }
   void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     EnsureLookupTablesFilled();
     Base::Execute(ctx, batch, out);
   }
 };

 template <typename Type>
 struct UTF8TrimWhitespace
     : UTF8TrimWhitespaceBase<Type, true, true, UTF8TrimWhitespace<Type>> {};

 template <typename Type>
 struct UTF8LTrimWhitespace
     : UTF8TrimWhitespaceBase<Type, true, false, UTF8LTrimWhitespace<Type>> {};

 template <typename Type>
 struct UTF8RTrimWhitespace
     : UTF8TrimWhitespaceBase<Type, false, true, UTF8RTrimWhitespace<Type>> {};

 struct TrimStateUTF8 {
   TrimOptions options_;
   std::vector<bool> codepoints_;
   explicit TrimStateUTF8(KernelContext* ctx, TrimOptions options)
       : options_(std::move(options)) {
     if (!ARROW_PREDICT_TRUE(
             arrow::util::UTF8ForEach(options_.characters, [&](uint32_t c) {
               codepoints_.resize(
                   std::max(c + 1, static_cast<uint32_t>(codepoints_.size())));
               codepoints_.at(c) = true;
             }))) {
       ctx->SetStatus(Status::Invalid("Invalid UTF8 sequence in input"));
     }
   }
 };

 template <typename Type, bool left, bool right, typename Derived>
 struct UTF8TrimBase : StringTransform<Type, Derived> {
   using Base = StringTransform<Type, Derived>;
   using offset_type = typename Base::offset_type;
   using State = KernelStateFromFunctionOptions<TrimStateUTF8, TrimOptions>;
   TrimStateUTF8 state_;

   explicit UTF8TrimBase(TrimStateUTF8 state) : state_(std::move(state)) {}

   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     TrimStateUTF8 state = State::Get(ctx);
     Derived(state).Execute(ctx, batch, out);
   }

   void Execute(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     EnsureLookupTablesFilled();
     Base::Execute(ctx, batch, out);
   }

   bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
                  uint8_t* output, offset_type* output_written) {
     const uint8_t* begin = input;
     const uint8_t* end = input + input_string_ncodeunits;
     const uint8_t* end_trimmed = end;
     const uint8_t* begin_trimmed = begin;

     auto predicate = [&](uint32_t c) {
       bool contains = state_.codepoints_[c];
       return !contains;
     };
     if (left && !ARROW_PREDICT_TRUE(
                     arrow::util::UTF8FindIf(begin, end, predicate, &begin_trimmed))) {
       return false;
     }
     if (right && (begin_trimmed < end)) {
       if (!ARROW_PREDICT_TRUE(arrow::util::UTF8FindIfReverse(begin_trimmed, end,
                                                              predicate, &end_trimmed))) {
         return false;
       }
     }
     std::copy(begin_trimmed, end_trimmed, output);
     *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
     return true;
   }
 };
 template <typename Type>
 struct UTF8Trim : UTF8TrimBase<Type, true, true, UTF8Trim<Type>> {
   using Base = UTF8TrimBase<Type, true, true, UTF8Trim<Type>>;
   using Base::Base;
 };

 template <typename Type>
 struct UTF8LTrim : UTF8TrimBase<Type, true, false, UTF8LTrim<Type>> {
   using Base = UTF8TrimBase<Type, true, false, UTF8LTrim<Type>>;
   using Base::Base;
 };

 template <typename Type>
 struct UTF8RTrim : UTF8TrimBase<Type, false, true, UTF8RTrim<Type>> {
   using Base = UTF8TrimBase<Type, false, true, UTF8RTrim<Type>>;
   using Base::Base;
 };

 #endif

 template <typename Type, bool left, bool right, typename Derived>
 struct AsciiTrimWhitespaceBase : StringTransform<Type, Derived> {
   using offset_type = typename Type::offset_type;
   bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
                  uint8_t* output, offset_type* output_written) {
     const uint8_t* begin = input;
     const uint8_t* end = input + input_string_ncodeunits;
     const uint8_t* end_trimmed = end;

     auto predicate = [](unsigned char c) { return !IsSpaceCharacterAscii(c); };
     const uint8_t* begin_trimmed = left ? std::find_if(begin, end, predicate) : begin;
     if (right & (begin_trimmed < end)) {
       std::reverse_iterator<const uint8_t*> rbegin(end);
       std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
       end_trimmed = std::find_if(rbegin, rend, predicate).base();
     }
     std::copy(begin_trimmed, end_trimmed, output);
     *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
     return true;
   }
 };

 template <typename Type>
 struct AsciiTrimWhitespace
     : AsciiTrimWhitespaceBase<Type, true, true, AsciiTrimWhitespace<Type>> {};

 template <typename Type>
 struct AsciiLTrimWhitespace
     : AsciiTrimWhitespaceBase<Type, true, false, AsciiLTrimWhitespace<Type>> {};

 template <typename Type>
 struct AsciiRTrimWhitespace
     : AsciiTrimWhitespaceBase<Type, false, true, AsciiRTrimWhitespace<Type>> {};

 template <typename Type, bool left, bool right, typename Derived>
 struct AsciiTrimBase : StringTransform<Type, Derived> {
   using Base = StringTransform<Type, Derived>;
   using offset_type = typename Base::offset_type;
   using State = OptionsWrapper<TrimOptions>;
   TrimOptions options_;
   std::vector<bool> characters_;

   explicit AsciiTrimBase(TrimOptions options)
       : options_(std::move(options)), characters_(256) {
     std::for_each(options_.characters.begin(), options_.characters.end(),
                   [&](char c) { characters_[static_cast<unsigned char>(c)] = true; });
   }

   static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     TrimOptions options = State::Get(ctx);
     Derived(options).Execute(ctx, batch, out);
   }

   bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
                  uint8_t* output, offset_type* output_written) {
     const uint8_t* begin = input;
     const uint8_t* end = input + input_string_ncodeunits;
     const uint8_t* end_trimmed = end;
     const uint8_t* begin_trimmed;

     auto predicate = [&](unsigned char c) {
       bool contains = characters_[c];
       return !contains;
     };

     begin_trimmed = left ? std::find_if(begin, end, predicate) : begin;
     if (right & (begin_trimmed < end)) {
       std::reverse_iterator<const uint8_t*> rbegin(end);
       std::reverse_iterator<const uint8_t*> rend(begin_trimmed);
       end_trimmed = std::find_if(rbegin, rend, predicate).base();
     }
     std::copy(begin_trimmed, end_trimmed, output);
     *output_written = static_cast<offset_type>(end_trimmed - begin_trimmed);
     return true;
   }
 };

 template <typename Type>
 struct AsciiTrim : AsciiTrimBase<Type, true, true, AsciiTrim<Type>> {
   using Base = AsciiTrimBase<Type, true, true, AsciiTrim<Type>>;
   using Base::Base;
 };

 template <typename Type>
 struct AsciiLTrim : AsciiTrimBase<Type, true, false, AsciiLTrim<Type>> {
   using Base = AsciiTrimBase<Type, true, false, AsciiLTrim<Type>>;
   using Base::Base;
 };

 template <typename Type>
 struct AsciiRTrim : AsciiTrimBase<Type, false, true, AsciiRTrim<Type>> {
   using Base = AsciiTrimBase<Type, false, true, AsciiRTrim<Type>>;
   using Base::Base;
 };

 const FunctionDoc utf8_trim_whitespace_doc(
     "Trim leading and trailing whitespace characters",
     ("For each string in `strings`, emit a string with leading and trailing whitespace\n"
      "characters removed, where whitespace characters are defined by the Unicode\n"
      "standard.  Null values emit null."),
     {"strings"});

 const FunctionDoc utf8_ltrim_whitespace_doc(
     "Trim leading whitespace characters",
     ("For each string in `strings`, emit a string with leading whitespace\n"
      "characters removed, where whitespace characters are defined by the Unicode\n"
      "standard.  Null values emit null."),
     {"strings"});

 const FunctionDoc utf8_rtrim_whitespace_doc(
     "Trim trailing whitespace characters",
     ("For each string in `strings`, emit a string with trailing whitespace\n"
      "characters removed, where whitespace characters are defined by the Unicode\n"
      "standard.  Null values emit null."),
     {"strings"});

 const FunctionDoc ascii_trim_whitespace_doc(
     "Trim leading and trailing ASCII whitespace characters",
     ("For each string in `strings`, emit a string with leading and trailing ASCII\n"
      "whitespace characters removed. Use `utf8_trim_whitespace` to trim Unicode\n"
      "whitespace characters. Null values emit null."),
     {"strings"});

 const FunctionDoc ascii_ltrim_whitespace_doc(
     "Trim leading ASCII whitespace characters",
     ("For each string in `strings`, emit a string with leading ASCII whitespace\n"
      "characters removed.  Use `utf8_ltrim_whitespace` to trim leading Unicode\n"
      "whitespace characters. Null values emit null."),
     {"strings"});

 const FunctionDoc ascii_rtrim_whitespace_doc(
     "Trim trailing ASCII whitespace characters",
     ("For each string in `strings`, emit a string with trailing ASCII whitespace\n"
      "characters removed. Use `utf8_rtrim_whitespace` to trim trailing Unicode\n"
      "whitespace characters. Null values emit null."),
     {"strings"});

 const FunctionDoc utf8_trim_doc(
     "Trim leading and trailing characters present in the `characters` arguments",
     ("For each string in `strings`, emit a string with leading and trailing\n"
      "characters removed that are present in the `characters` argument.  Null values\n"
      "emit null."),
     {"strings"}, "TrimOptions");

 const FunctionDoc utf8_ltrim_doc(
     "Trim leading characters present in the `characters` arguments",
     ("For each string in `strings`, emit a string with leading\n"
      "characters removed that are present in the `characters` argument.  Null values\n"
      "emit null."),
     {"strings"}, "TrimOptions");

 const FunctionDoc utf8_rtrim_doc(
     "Trim trailing characters present in the `characters` arguments",
     ("For each string in `strings`, emit a string with leading "
      "characters removed that are present in the `characters` argument.  Null values\n"
      "emit null."),
     {"strings"}, "TrimOptions");

 const FunctionDoc ascii_trim_doc(
     utf8_trim_doc.summary + "",
     utf8_trim_doc.description +
         ("\nBoth the input string as the `characters` argument are interepreted as\n"
          "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
     {"strings"}, "TrimOptions");

 const FunctionDoc ascii_ltrim_doc(
     utf8_ltrim_doc.summary + "",
     utf8_ltrim_doc.description +
         ("\nBoth the input string as the `characters` argument are interepreted as\n"
          "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
     {"strings"}, "TrimOptions");

 const FunctionDoc ascii_rtrim_doc(
     utf8_rtrim_doc.summary + "",
     utf8_rtrim_doc.description +
         ("\nBoth the input string as the `characters` argument are interepreted as\n"
          "ASCII characters, to trim non-ASCII characters, use `utf8_trim`."),
     {"strings"}, "TrimOptions");

 const FunctionDoc strptime_doc(
     "Parse timestamps",
     ("For each string in `strings`, parse it as a timestamp.\n"
      "The timestamp unit and the expected string pattern must be given\n"
      "in StrptimeOptions.  Null inputs emit null.  If a non-null string\n"
      "fails parsing, an error is returned."),
     {"strings"}, "StrptimeOptions");

 const FunctionDoc binary_length_doc(
     "Compute string lengths",
     ("For each string in `strings`, emit the number of bytes.  Null values emit null."),
     {"strings"});

 const FunctionDoc utf8_length_doc("Compute UTF8 string lengths",
                                   ("For each string in `strings`, emit the number of "
                                    "UTF8 characters.  Null values emit null."),
                                   {"strings"});

 void AddStrptime(FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>("strptime", Arity::Unary(), &strptime_doc);
   DCHECK_OK(func->AddKernel({utf8()}, OutputType(StrptimeResolve),
                             StrptimeExec<StringType>, StrptimeState::Init));
   DCHECK_OK(func->AddKernel({large_utf8()}, OutputType(StrptimeResolve),
                             StrptimeExec<LargeStringType>, StrptimeState::Init));
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }

 void AddBinaryLength(FunctionRegistry* registry) {
   auto func = std::make_shared<ScalarFunction>("binary_length", Arity::Unary(),
                                                &binary_length_doc);
   ArrayKernelExec exec_offset_32 =
       applicator::ScalarUnaryNotNull<Int32Type, StringType, BinaryLength>::Exec;
   ArrayKernelExec exec_offset_64 =
       applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, BinaryLength>::Exec;
   for (const auto& input_type : {binary(), utf8()}) {
     DCHECK_OK(func->AddKernel({input_type}, int32(), exec_offset_32));
   }
   for (const auto& input_type : {large_binary(), large_utf8()}) {
     DCHECK_OK(func->AddKernel({input_type}, int64(), exec_offset_64));
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }

 void AddUtf8Length(FunctionRegistry* registry) {
   auto func =
       std::make_shared<ScalarFunction>("utf8_length", Arity::Unary(), &utf8_length_doc);

   ArrayKernelExec exec_offset_32 =
       applicator::ScalarUnaryNotNull<Int32Type, StringType, Utf8Length>::Exec;
   DCHECK_OK(func->AddKernel({utf8()}, int32(), std::move(exec_offset_32)));

   ArrayKernelExec exec_offset_64 =
       applicator::ScalarUnaryNotNull<Int64Type, LargeStringType, Utf8Length>::Exec;
   DCHECK_OK(func->AddKernel({large_utf8()}, int64(), std::move(exec_offset_64)));

   DCHECK_OK(registry->AddFunction(std::move(func)));
 }

 template <template <typename> class ExecFunctor>
 void MakeUnaryStringBatchKernel(
     std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
     MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
   {
     auto exec_32 = ExecFunctor<StringType>::Exec;
     ScalarKernel kernel{{utf8()}, utf8(), exec_32};
     kernel.mem_allocation = mem_allocation;
     DCHECK_OK(func->AddKernel(std::move(kernel)));
   }
   {
     auto exec_64 = ExecFunctor<LargeStringType>::Exec;
     ScalarKernel kernel{{large_utf8()}, large_utf8(), exec_64};
     kernel.mem_allocation = mem_allocation;
     DCHECK_OK(func->AddKernel(std::move(kernel)));
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }

 template <template <typename> class ExecFunctor>
 void MakeUnaryStringBatchKernelWithState(
     std::string name, FunctionRegistry* registry, const FunctionDoc* doc,
     MemAllocation::type mem_allocation = MemAllocation::PREALLOCATE) {
   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
   {
     using t32 = ExecFunctor<StringType>;
     ScalarKernel kernel{{utf8()}, utf8(), t32::Exec, t32::State::Init};
     kernel.mem_allocation = mem_allocation;
     DCHECK_OK(func->AddKernel(std::move(kernel)));
   }
   {
     using t64 = ExecFunctor<LargeStringType>;
     ScalarKernel kernel{{large_utf8()}, large_utf8(), t64::Exec, t64::State::Init};
     kernel.mem_allocation = mem_allocation;
     DCHECK_OK(func->AddKernel(std::move(kernel)));
   }
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }

 #ifdef ARROW_WITH_UTF8PROC

 template <template <typename> class Transformer>
 void MakeUnaryStringUTF8TransformKernel(std::string name, FunctionRegistry* registry,
                                         const FunctionDoc* doc) {
   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
   ArrayKernelExec exec_32 = Transformer<StringType>::Exec;
   ArrayKernelExec exec_64 = Transformer<LargeStringType>::Exec;
   DCHECK_OK(func->AddKernel({utf8()}, utf8(), exec_32));
   DCHECK_OK(func->AddKernel({large_utf8()}, large_utf8(), exec_64));
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }

 #endif

 using StringPredicate = std::function<bool(KernelContext*, const uint8_t*, size_t)>;

 template <typename Type>
 void ApplyPredicate(KernelContext* ctx, const ExecBatch& batch, StringPredicate predicate,
                     Datum* out) {
   EnsureLookupTablesFilled();
   if (batch[0].kind() == Datum::ARRAY) {
     const ArrayData& input = *batch[0].array();
     ArrayIterator<Type> input_it(input);
     ArrayData* out_arr = out->mutable_array();
     ::arrow::internal::GenerateBitsUnrolled(
         out_arr->buffers[1]->mutable_data(), out_arr->offset, input.length,
         [&]() -> bool {
           util::string_view val = input_it();
           return predicate(ctx, reinterpret_cast<const uint8_t*>(val.data()), val.size());
         });
   } else {
     const auto& input = checked_cast<const BaseBinaryScalar&>(*batch[0].scalar());
     if (input.is_valid) {
       bool boolean_result =
           predicate(ctx, input.value->data(), static_cast<size_t>(input.value->size()));
       if (!ctx->status().ok()) {
         // UTF decoding can lead to issues
         return;
       }
       out->value = std::make_shared<BooleanScalar>(boolean_result);
     }
   }
 }

 template <typename Predicate>
 void AddUnaryStringPredicate(std::string name, FunctionRegistry* registry,
                              const FunctionDoc* doc) {
   auto func = std::make_shared<ScalarFunction>(name, Arity::Unary(), doc);
   auto exec_32 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     ApplyPredicate<StringType>(ctx, batch, Predicate::Call, out);
   };
   auto exec_64 = [](KernelContext* ctx, const ExecBatch& batch, Datum* out) {
     ApplyPredicate<LargeStringType>(ctx, batch, Predicate::Call, out);
   };
   DCHECK_OK(func->AddKernel({utf8()}, boolean(), std::move(exec_32)));
   DCHECK_OK(func->AddKernel({large_utf8()}, boolean(), std::move(exec_64)));
   DCHECK_OK(registry->AddFunction(std::move(func)));
 }

 FunctionDoc StringPredicateDoc(std::string summary, std::string description) {
   return FunctionDoc{std::move(summary), std::move(description), {"strings"}};
 }

 FunctionDoc StringClassifyDoc(std::string class_summary, std::string class_desc,
                               bool non_empty) {
   std::string summary, description;
   {
     std::stringstream ss;
     ss << "Classify strings as " << class_summary;
     summary = ss.str();
   }
   {
     std::stringstream ss;
     if (non_empty) {
       ss
           << ("For each string in `strings`, emit true iff the string is non-empty\n"
               "and consists only of ");
     } else {
       ss
           << ("For each string in `strings`, emit true iff the string consists only\n"
               "of ");
     }
     ss << class_desc << ".  Null strings emit null.";
     description = ss.str();
   }
   return StringPredicateDoc(std::move(summary), std::move(description));
 }

 const auto string_is_ascii_doc = StringClassifyDoc("ASCII", "ASCII characters", false);

 const auto ascii_is_alnum_doc =
     StringClassifyDoc("ASCII alphanumeric", "alphanumeric ASCII characters", true);
 const auto ascii_is_alpha_doc =
     StringClassifyDoc("ASCII alphabetic", "alphabetic ASCII characters", true);
 const auto ascii_is_decimal_doc =
     StringClassifyDoc("ASCII decimal", "decimal ASCII characters", true);
 const auto ascii_is_lower_doc =
     StringClassifyDoc("ASCII lowercase", "lowercase ASCII characters", true);
 const auto ascii_is_printable_doc =
     StringClassifyDoc("ASCII printable", "printable ASCII characters", true);
 const auto ascii_is_space_doc =
     StringClassifyDoc("ASCII whitespace", "whitespace ASCII characters", true);
 const auto ascii_is_upper_doc =
     StringClassifyDoc("ASCII uppercase", "uppercase ASCII characters", true);

 const auto ascii_is_title_doc = StringPredicateDoc(
     "Classify strings as ASCII titlecase",
     ("For each string in `strings`, emit true iff the string is title-cased,\n"
      "i.e. it has at least one cased character, each uppercase character\n"
      "follows a non-cased character, and each lowercase character follows\n"
      "an uppercase character.\n"));

 const auto utf8_is_alnum_doc =
     StringClassifyDoc("alphanumeric", "alphanumeric Unicode characters", true);
 const auto utf8_is_alpha_doc =
     StringClassifyDoc("alphabetic", "alphabetic Unicode characters", true);
 const auto utf8_is_decimal_doc =
     StringClassifyDoc("decimal", "decimal Unicode characters", true);
 const auto utf8_is_digit_doc = StringClassifyDoc("digits", "Unicode digits", true);
 const auto utf8_is_lower_doc =
     StringClassifyDoc("lowercase", "lowercase Unicode characters", true);
 const auto utf8_is_numeric_doc =
     StringClassifyDoc("numeric", "numeric Unicode characters", true);
 const auto utf8_is_printable_doc =
     StringClassifyDoc("printable", "printable Unicode characters", true);
 const auto utf8_is_space_doc =
     StringClassifyDoc("whitespace", "whitespace Unicode characters", true);
 const auto utf8_is_upper_doc =
     StringClassifyDoc("uppercase", "uppercase Unicode characters", true);

 const auto utf8_is_title_doc = StringPredicateDoc(
     "Classify strings as titlecase",
     ("For each string in `strings`, emit true iff the string is title-cased,\n"
      "i.e. it has at least one cased character, each uppercase character\n"
      "follows a non-cased character, and each lowercase character follows\n"
      "an uppercase character.\n"));

 const FunctionDoc ascii_upper_doc(
     "Transform ASCII input to uppercase",
     ("For each string in `strings`, return an uppercase version.\n\n"
      "This function assumes the input is fully ASCII.  It it may contain\n"
      "non-ASCII characters, use \"utf8_upper\" instead."),
     {"strings"});

 const FunctionDoc ascii_lower_doc(
     "Transform ASCII input to lowercase",
     ("For each string in `strings`, return a lowercase version.\n\n"
      "This function assumes the input is fully ASCII.  It it may contain\n"
      "non-ASCII characters, use \"utf8_lower\" instead."),
     {"strings"});

 const FunctionDoc utf8_upper_doc(
     "Transform input to uppercase",
     ("For each string in `strings`, return an uppercase version."), {"strings"});

 const FunctionDoc utf8_lower_doc(
     "Transform input to lowercase",
     ("For each string in `strings`, return a lowercase version."), {"strings"});

 }  // namespace

 void RegisterScalarStringAscii(FunctionRegistry* registry) {
   // ascii_upper and ascii_lower are able to reuse the original offsets buffer,
   // so don't preallocate them in the output.
   MakeUnaryStringBatchKernel<AsciiUpper>("ascii_upper", registry, &ascii_upper_doc,
                                          MemAllocation::NO_PREALLOCATE);
   MakeUnaryStringBatchKernel<AsciiLower>("ascii_lower", registry, &ascii_lower_doc,
                                          MemAllocation::NO_PREALLOCATE);
   MakeUnaryStringBatchKernel<AsciiTrimWhitespace>("ascii_trim_whitespace", registry,
                                                   &ascii_trim_whitespace_doc);
   MakeUnaryStringBatchKernel<AsciiLTrimWhitespace>("ascii_ltrim_whitespace", registry,
                                                    &ascii_ltrim_whitespace_doc);
   MakeUnaryStringBatchKernel<AsciiRTrimWhitespace>("ascii_rtrim_whitespace", registry,
                                                    &ascii_rtrim_whitespace_doc);
   MakeUnaryStringBatchKernelWithState<AsciiTrim>("ascii_trim", registry,
                                                  &ascii_lower_doc);
   MakeUnaryStringBatchKernelWithState<AsciiLTrim>("ascii_ltrim", registry,
                                                   &ascii_lower_doc);
   MakeUnaryStringBatchKernelWithState<AsciiRTrim>("ascii_rtrim", registry,
                                                   &ascii_lower_doc);

   AddUnaryStringPredicate<IsAscii>("string_is_ascii", registry, &string_is_ascii_doc);

   AddUnaryStringPredicate<IsAlphaNumericAscii>("ascii_is_alnum", registry,
                                                &ascii_is_alnum_doc);
   AddUnaryStringPredicate<IsAlphaAscii>("ascii_is_alpha", registry, &ascii_is_alpha_doc);
   AddUnaryStringPredicate<IsDecimalAscii>("ascii_is_decimal", registry,
                                           &ascii_is_decimal_doc);
   // no is_digit for ascii, since it is the same as is_decimal
   AddUnaryStringPredicate<IsLowerAscii>("ascii_is_lower", registry, &ascii_is_lower_doc);
   // no is_numeric for ascii, since it is the same as is_decimal
   AddUnaryStringPredicate<IsPrintableAscii>("ascii_is_printable", registry,
                                             &ascii_is_printable_doc);
   AddUnaryStringPredicate<IsSpaceAscii>("ascii_is_space", registry, &ascii_is_space_doc);
   AddUnaryStringPredicate<IsTitleAscii>("ascii_is_title", registry, &ascii_is_title_doc);
   AddUnaryStringPredicate<IsUpperAscii>("ascii_is_upper", registry, &ascii_is_upper_doc);

 #ifdef ARROW_WITH_UTF8PROC
   MakeUnaryStringUTF8TransformKernel<UTF8Upper>("utf8_upper", registry, &utf8_upper_doc);
   MakeUnaryStringUTF8TransformKernel<UTF8Lower>("utf8_lower", registry, &utf8_lower_doc);
   MakeUnaryStringBatchKernel<UTF8TrimWhitespace>("utf8_trim_whitespace", registry,
                                                  &utf8_trim_whitespace_doc);
   MakeUnaryStringBatchKernel<UTF8LTrimWhitespace>("utf8_ltrim_whitespace", registry,
                                                   &utf8_ltrim_whitespace_doc);
   MakeUnaryStringBatchKernel<UTF8RTrimWhitespace>("utf8_rtrim_whitespace", registry,
                                                   &utf8_rtrim_whitespace_doc);
   MakeUnaryStringBatchKernelWithState<UTF8Trim>("utf8_trim", registry, &utf8_trim_doc);
   MakeUnaryStringBatchKernelWithState<UTF8LTrim>("utf8_ltrim", registry, &utf8_ltrim_doc);
   MakeUnaryStringBatchKernelWithState<UTF8RTrim>("utf8_rtrim", registry, &utf8_rtrim_doc);

   AddUnaryStringPredicate<IsAlphaNumericUnicode>("utf8_is_alnum", registry,
                                                  &utf8_is_alnum_doc);
   AddUnaryStringPredicate<IsAlphaUnicode>("utf8_is_alpha", registry, &utf8_is_alpha_doc);
   AddUnaryStringPredicate<IsDecimalUnicode>("utf8_is_decimal", registry,
                                             &utf8_is_decimal_doc);
   AddUnaryStringPredicate<IsDigitUnicode>("utf8_is_digit", registry, &utf8_is_digit_doc);
   AddUnaryStringPredicate<IsLowerUnicode>("utf8_is_lower", registry, &utf8_is_lower_doc);
   AddUnaryStringPredicate<IsNumericUnicode>("utf8_is_numeric", registry,
                                             &utf8_is_numeric_doc);
   AddUnaryStringPredicate<IsPrintableUnicode>("utf8_is_printable", registry,
                                               &utf8_is_printable_doc);
   AddUnaryStringPredicate<IsSpaceUnicode>("utf8_is_space", registry, &utf8_is_space_doc);
   AddUnaryStringPredicate<IsTitleUnicode>("utf8_is_title", registry, &utf8_is_title_doc);
   AddUnaryStringPredicate<IsUpperUnicode>("utf8_is_upper", registry, &utf8_is_upper_doc);
 #endif

   AddSplit(registry);
   AddBinaryLength(registry);
   AddUtf8Length(registry);
   AddMatchSubstring(registry);
   MakeUnaryStringBatchKernelWithState<ReplaceSubStringPlain>(
       "replace_substring", registry, &replace_substring_doc,
       MemAllocation::NO_PREALLOCATE);
 #ifdef ARROW_WITH_RE2
   MakeUnaryStringBatchKernelWithState<ReplaceSubStringRegex>(
       "replace_substring_regex", registry, &replace_substring_regex_doc,
       MemAllocation::NO_PREALLOCATE);
 #endif
   AddStrptime(registry);
 }

 }  // namespace internal
 }  // namespace compute
 }  // namespace arrow