// blob: 8636d54f65f62e2fb86cb848726aad1d480fe84b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef IMPALA_EXPRS_STRING_FUNCTIONS_H
#define IMPALA_EXPRS_STRING_FUNCTIONS_H
#include <bitset>
#include <cstdint>
#include <string>
#include <vector>

#include <re2/re2.h>

#include "runtime/string-value.h"
#include "runtime/string-search.h"
using namespace impala_udf;
namespace impala {
// Bring the impala_udf interface types into namespace impala so that the declarations
// in this header can name them without the impala_udf:: qualifier.
using impala_udf::FunctionContext;
using impala_udf::AnyVal;
using impala_udf::BooleanVal;
using impala_udf::TinyIntVal;
using impala_udf::SmallIntVal;
using impala_udf::IntVal;
using impala_udf::BigIntVal;
using impala_udf::FloatVal;
using impala_udf::DoubleVal;
using impala_udf::TimestampVal;
using impala_udf::StringVal;
using impala_udf::DecimalVal;
// Forward declarations; no definitions for these classes appear in this header.
class Expr;
class OpcodeRegistry;
class TupleRow;
/// Collection of static string-handling functions operating on the impala_udf value
/// types (StringVal, IntVal, BigIntVal, ...). Apart from the small inline members of
/// TrimContext, this class only declares the functions; the definitions are provided
/// outside this header.
class StringFunctions {
 public:
  /// String trimming position or direction.
  enum TrimPosition {
    LEADING,   // Trim from the beginning, or leading end
    TRAILING,  // Trim from the end, or trailing end
    BOTH       // Trim from both ends of string
  };

  /// A utility class for supporting the UTF-8 Trim() function. It is constructed with
  /// the UTF-8 mode flag and loaded, via Reset(), with the set of characters to trim.
  /// After Reset(), the Contains functions can be used to determine if a character
  /// needs to be trimmed.
  class TrimContext {
   public:
    /// Stores 'utf8_mode', which is later reported by utf8_mode(). The constructor is
    /// explicit so that a bare bool is never silently converted into a TrimContext.
    explicit TrimContext(bool utf8_mode) : utf8_mode_(utf8_mode) { }

    /// Re-initializes this context from 'chars_to_trim'. Defined out of line.
    void Reset(const StringVal& chars_to_trim);

    /// Returns true if the byte value 'single_char' is set in the single-byte set.
    inline bool Contains(uint8_t single_char) const {
      return single_byte_chars_.test(single_char);
    }

    /// Overload for a character given as 'len' bytes starting at 'utf8_char'.
    /// Defined out of line.
    inline bool Contains(const uint8_t* utf8_char, int len) const;

    /// Returns the mode flag passed to the constructor.
    bool utf8_mode() const { return utf8_mode_; }

   private:
    const bool utf8_mode_;

    // The bitset to hold the unique characters to trim, used for non-UTF-8 characters
    // or single-byte UTF-8 characters.
    std::bitset<256> single_byte_chars_;

    // Pointers to multi-byte UTF-8 characters used to check whether characters of the
    // corresponding byte count need to be trimmed.
    std::vector<const uint8_t*> double_byte_chars_;
    std::vector<const uint8_t*> triple_byte_chars_;
    std::vector<const uint8_t*> quadruple_byte_chars_;
  };

  /// Substring variants: each comes in a form with and without an explicit 'len'.
  static StringVal Substring(FunctionContext*, const StringVal& str,
      const BigIntVal& pos, const BigIntVal& len);
  static StringVal Substring(FunctionContext*, const StringVal& str,
      const BigIntVal& pos);
  static StringVal Utf8Substring(FunctionContext*, const StringVal& str,
      const BigIntVal& pos, const BigIntVal& len);
  static StringVal Utf8Substring(FunctionContext*, const StringVal& str,
      const BigIntVal& pos);

  static StringVal SplitPart(FunctionContext* context, const StringVal& str,
      const StringVal& delim, const BigIntVal& field);
  static StringVal Left(FunctionContext*, const StringVal& str, const BigIntVal& len);
  static StringVal Right(FunctionContext*, const StringVal& str, const BigIntVal& len);
  static StringVal Space(FunctionContext*, const BigIntVal& len);
  static StringVal Repeat(FunctionContext*, const StringVal& str, const BigIntVal& n);

  /// Lpad and Rpad take the same arguments: the input string, a length and the
  /// padding string.
  static StringVal Lpad(FunctionContext*, const StringVal& str, const BigIntVal& len,
      const StringVal& pad);
  static StringVal Rpad(FunctionContext*, const StringVal& str, const BigIntVal& len,
      const StringVal& pad);

  static IntVal Bytes(FunctionContext*, const StringVal& str);
  static IntVal Length(FunctionContext*, const StringVal& str);
  static IntVal CharLength(FunctionContext*, const StringVal& str);
  static IntVal Utf8Length(FunctionContext*, const StringVal& str);

  /// Case conversion. Each operation is declared in a general form plus explicit
  /// Ascii and Utf8 forms.
  static StringVal Lower(FunctionContext*, const StringVal& str);
  static StringVal LowerAscii(FunctionContext*, const StringVal& str);
  static StringVal LowerUtf8(FunctionContext*, const StringVal& str);
  static StringVal Upper(FunctionContext*, const StringVal& str);
  static StringVal UpperAscii(FunctionContext*, const StringVal& str);
  static StringVal UpperUtf8(FunctionContext*, const StringVal& str);
  static StringVal InitCap(FunctionContext*, const StringVal& str);
  static StringVal InitCapAscii(FunctionContext*, const StringVal& str);
  static StringVal InitCapUtf8(FunctionContext*, const StringVal& str);

  static void ReplacePrepare(FunctionContext*, FunctionContext::FunctionStateScope);
  static void ReplaceClose(FunctionContext*, FunctionContext::FunctionStateScope);
  static StringVal Replace(FunctionContext*, const StringVal& str,
      const StringVal& pattern, const StringVal& replace);

  static StringVal Reverse(FunctionContext*, const StringVal& str);
  static StringVal Utf8Reverse(FunctionContext*, const StringVal& str);
  static StringVal Translate(FunctionContext*, const StringVal& str,
      const StringVal& src, const StringVal& dst);

  /// One-argument trim forms (no explicit set of characters to trim).
  static StringVal Trim(FunctionContext*, const StringVal& str);
  static StringVal Ltrim(FunctionContext*, const StringVal& str);
  static StringVal Rtrim(FunctionContext*, const StringVal& str);

  /// Sets up arguments and function context for the *TrimString functions below.
  static void TrimPrepare(FunctionContext*, FunctionContext::FunctionStateScope);
  static void Utf8TrimPrepare(FunctionContext*, FunctionContext::FunctionStateScope);

  /// Cleans up the work done by TrimPrepare above.
  static void TrimClose(FunctionContext*, FunctionContext::FunctionStateScope);

  /// AES encryption functions in Impala, using openSSL libraries.
  static void AesPrepare(FunctionContext* context,
      FunctionContext::FunctionStateScope scope);
  static StringVal AesDecrypt(FunctionContext* ctx, const StringVal& expr,
      const StringVal& key, const StringVal& mode, const StringVal& iv);
  static StringVal AesEncrypt(FunctionContext* ctx, const StringVal& expr,
      const StringVal& key, const StringVal& mode, const StringVal& iv);
  static StringVal AesDecryptImpl(FunctionContext* ctx, const StringVal& expr,
      const StringVal& key, const StringVal& mode, const StringVal& iv);
  static StringVal AesEncryptImpl(FunctionContext* ctx, const StringVal& expr,
      const StringVal& key, const StringVal& mode, const StringVal& iv);
  static void AesClose(FunctionContext* context,
      FunctionContext::FunctionStateScope scope);

  /// Trims occurrences of the characters in 'chars_to_trim' string from
  /// the beginning of string 'str'.
  static StringVal LTrimString(FunctionContext* ctx, const StringVal& str,
      const StringVal& chars_to_trim);

  /// Trims occurrences of the characters in 'chars_to_trim' string from
  /// the end of string 'str'.
  static StringVal RTrimString(FunctionContext* ctx, const StringVal& str,
      const StringVal& chars_to_trim);

  /// Trims occurrences of the characters in 'chars_to_trim' string from
  /// both ends of string 'str'.
  static StringVal BTrimString(FunctionContext* ctx, const StringVal& str,
      const StringVal& chars_to_trim);

  static IntVal Ascii(FunctionContext*, const StringVal& str);

  /// Instr overloads: 'start_position' and 'occurrence' are optional trailing
  /// arguments. Note the argument order (str, substr) is the reverse of Locate's.
  static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr,
      const BigIntVal& start_position, const BigIntVal& occurrence);
  static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr,
      const BigIntVal& start_position);
  static IntVal Instr(FunctionContext*, const StringVal& str, const StringVal& substr);
  static IntVal Locate(FunctionContext*, const StringVal& substr, const StringVal& str);
  static IntVal LocatePos(FunctionContext*, const StringVal& substr,
      const StringVal& str, const BigIntVal& start_pos);

  /// RE2 option helpers. SetRE2Options reports through its bool return value and the
  /// 'error_str' out-parameter; 'opts' is the options object being filled in.
  static bool SetRE2Options(const StringVal& match_parameter, std::string* error_str,
      re2::RE2::Options* opts);
  static void SetRE2MemOpt(re2::RE2::Options* opts);

  static void RegexpPrepare(FunctionContext*, FunctionContext::FunctionStateScope);
  static void RegexpClose(FunctionContext*, FunctionContext::FunctionStateScope);
  static StringVal RegexpEscape(FunctionContext*, const StringVal& str);
  static StringVal RegexpExtract(FunctionContext*, const StringVal& str,
      const StringVal& pattern, const BigIntVal& index);
  static StringVal RegexpReplace(FunctionContext*, const StringVal& str,
      const StringVal& pattern, const StringVal& replace);
  static void RegexpMatchCountPrepare(FunctionContext* context,
      FunctionContext::FunctionStateScope scope);
  static IntVal RegexpMatchCount2Args(FunctionContext* context, const StringVal& str,
      const StringVal& pattern);
  static IntVal RegexpMatchCount4Args(FunctionContext* context, const StringVal& str,
      const StringVal& pattern, const IntVal& start_pos,
      const StringVal& match_parameter);

  /// Variadic forms: 'strs' points to 'num_children' StringVal arguments.
  static StringVal Concat(FunctionContext*, int num_children, const StringVal* strs);
  static StringVal ConcatWs(FunctionContext*, const StringVal& sep, int num_children,
      const StringVal* strs);

  static IntVal FindInSet(FunctionContext*, const StringVal& str,
      const StringVal& str_set);

  static void ParseUrlPrepare(FunctionContext*, FunctionContext::FunctionStateScope);
  static StringVal ParseUrl(FunctionContext*, const StringVal& url,
      const StringVal& part);
  static StringVal ParseUrlKey(FunctionContext*, const StringVal& url,
      const StringVal& key, const StringVal& part);
  static void ParseUrlClose(FunctionContext*, FunctionContext::FunctionStateScope);

  static void SetRE2MemLimit(int64_t re2_mem_limit);

  /// Converts ASCII 'val' to corresponding character.
  static StringVal Chr(FunctionContext* context, const IntVal& val);

  static StringVal Base64Encode(FunctionContext* ctx, const StringVal& str);
  static StringVal Base64Decode(FunctionContext* ctx, const StringVal& str);

  static StringVal GetJsonObject(FunctionContext* ctx, const StringVal& json_str,
      const StringVal& path_str);

  /// Implementation of GetJsonObject. It is not cross-compiled since cross-compiling
  /// it brings no significant benefit.
  static StringVal GetJsonObjectImpl(FunctionContext* ctx, const StringVal& json_str,
      const StringVal& path_str);

  static IntVal Levenshtein(
      FunctionContext* context, const StringVal& s1, const StringVal& s2);
  static IntVal DamerauLevenshtein(
      FunctionContext* context, const StringVal& s1, const StringVal& s2);

  static DoubleVal JaroDistance(
      FunctionContext* ctx, const StringVal& s1, const StringVal& s2);
  static DoubleVal JaroSimilarity(
      FunctionContext* ctx, const StringVal& s1, const StringVal& s2);

  /// Jaro-Winkler overloads: 'scaling_factor' and 'boost_threshold' are optional
  /// trailing arguments.
  static DoubleVal JaroWinklerDistance(FunctionContext* ctx, const StringVal& s1,
      const StringVal& s2);
  static DoubleVal JaroWinklerDistance(FunctionContext* ctx, const StringVal& s1,
      const StringVal& s2, const DoubleVal& scaling_factor);
  static DoubleVal JaroWinklerDistance(FunctionContext* ctx, const StringVal& s1,
      const StringVal& s2, const DoubleVal& scaling_factor,
      const DoubleVal& boost_threshold);
  static DoubleVal JaroWinklerSimilarity(FunctionContext* ctx, const StringVal& s1,
      const StringVal& s2);
  static DoubleVal JaroWinklerSimilarity(FunctionContext* ctx, const StringVal& s1,
      const StringVal& s2, const DoubleVal& scaling_factor);
  static DoubleVal JaroWinklerSimilarity(FunctionContext* ctx, const StringVal& s1,
      const StringVal& s2, const DoubleVal& scaling_factor,
      const DoubleVal& boost_threshold);

  /// Converts bytes stored as an integer value into human readable memory measurements.
  /// For example, 123456789012 bytes is converted to "114.98 GB".
  static StringVal PrettyPrintMemory(FunctionContext*, const BigIntVal& bytes);
  static StringVal PrettyPrintMemory(FunctionContext*, const IntVal& bytes);
  static StringVal PrettyPrintMemory(FunctionContext*, const SmallIntVal& bytes);
  static StringVal PrettyPrintMemory(FunctionContext*, const TinyIntVal& bytes);

 private:
  // NOTE(review): this is unsigned while SetRE2MemLimit() takes an int64_t; confirm
  // that negative values are rejected where the limit is assigned.
  static uint64_t re2_mem_limit_;

  /// Shared helper taking the same arguments as the Prepare functions plus the
  /// 'utf8_mode' flag; presumably the common body of TrimPrepare and Utf8TrimPrepare
  /// (TODO confirm against the definitions).
  static void DoTrimPrepare(FunctionContext* context,
      FunctionContext::FunctionStateScope scope, bool utf8_mode);

  /// Templatized implementation of the actual string trimming function.
  /// The first parameter, 'D', is one of StringFunctions::TrimPosition values.
  /// The second parameter, 'IS_IMPLICIT_WHITESPACE', is true when the set of characters
  /// to trim is implicitly set to ' ', as a result of calling the one-arg
  /// forms of trim/ltrim/rtrim.
  template <TrimPosition D, bool IS_IMPLICIT_WHITESPACE>
  static StringVal DoTrimString(FunctionContext* ctx, const StringVal& str,
      const StringVal& chars_to_trim);

  /// Templatized implementation of the actual string trimming function with UTF-8
  /// character handling.
  /// The first parameter, 'D', is one of the values of StringFunctions::TrimPosition.
  template <StringFunctions::TrimPosition D>
  static StringVal DoUtf8TrimString(const StringVal& str, const TrimContext& trim_ctx);
};
}
#endif