| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| |
| #ifndef IMPALA_UDF_UDF_H |
| #define IMPALA_UDF_UDF_H |
| |
| // THIS FILE IS USED BY THE STANDALONE IMPALA UDF DEVELOPMENT KIT. |
| // IT MUST BE BUILDABLE WITH C++98 AND WITHOUT ANY INTERNAL IMPALA HEADERS. |
| |
| #include <assert.h> |
| #include <boost/cstdint.hpp> |
| #include <string.h> |
| |
// NOEXCEPT expands to 'noexcept' when the translation unit is compiled as C++11 or
// later and to nothing otherwise, so declarations using it still build with older
// (or deliberately downgraded) system compilers.
#if __cplusplus < 201103L
#define NOEXCEPT
#else
#define NOEXCEPT noexcept
#endif
| |
// Prefix for function definitions whose symbols must stay visible so that Impala can
// load them. The recommended setup is to build UDFs with
// "-fvisibility=hidden -fvisibility-inlines-hidden" and to mark only the UDF entry
// points with this macro.
#define IMPALA_UDF_EXPORT __attribute__ ((visibility ("default")))
| |
/// This header is the only Impala header needed to develop UDFs and UDAs. It provides
/// the value types that UDFs/UDAs exchange with Impala and the FunctionContext object,
/// which is the interface between the UDF/UDA and the impala process.
namespace impala {

/// Opaque to UDF/UDA code; only ever handled through a pointer in this header.
class FunctionContextImpl;

}  // namespace impala
| |
| namespace impala_udf { |
| |
/// All input and output values will be one of the structs below. Each struct is a simple
/// object containing a boolean that records whether the value is NULL, plus the value
/// itself. The value is unspecified if the NULL boolean is set.
///
/// Every *Val struct defined later in this header is forward-declared here so the list
/// is complete.
struct AnyVal;
struct BooleanVal;
struct TinyIntVal;
struct SmallIntVal;
struct IntVal;
struct BigIntVal;
struct FloatVal;
struct DoubleVal;
struct StringVal;
struct TimestampVal;
struct DateVal;
struct DecimalVal;
| |
| /// A FunctionContext is passed to every UDF/UDA and is the interface for the UDF to the |
| /// rest of the system. It contains APIs to examine the system state, report errors and |
| /// manage memory. |
| class FunctionContext { |
| public: |
| enum ImpalaVersion { |
| v1_2, |
| v1_3, |
| }; |
| |
| enum Type { |
| INVALID_TYPE = 0, |
| TYPE_NULL, |
| TYPE_BOOLEAN, |
| TYPE_TINYINT, |
| TYPE_SMALLINT, |
| TYPE_INT, |
| TYPE_BIGINT, |
| TYPE_FLOAT, |
| TYPE_DOUBLE, |
| TYPE_TIMESTAMP, |
| TYPE_STRING, |
| TYPE_DATE, |
| // Not used - maps to CHAR(N), which is not supported for UDFs and UDAs. |
| TYPE_FIXED_BUFFER, |
| TYPE_DECIMAL, |
| TYPE_VARCHAR, |
| // A fixed-size buffer, passed as a StringVal. |
| TYPE_FIXED_UDA_INTERMEDIATE |
| }; |
| |
| struct TypeDesc { |
| Type type; |
| |
| /// Only valid if type == TYPE_DECIMAL |
| int precision; |
| int scale; |
| |
| /// Only valid if type is one of TYPE_FIXED_BUFFER, TYPE_FIXED_UDA_INTERMEDIATE or |
| /// TYPE_VARCHAR. |
| int len; |
| }; |
| |
| struct UniqueId { |
| int64_t hi; |
| int64_t lo; |
| }; |
| |
| enum FunctionStateScope { |
| /// Indicates that the function state for this FunctionContext's UDF is shared across |
| /// the plan fragment (a query is divided into multiple plan fragments, each of which |
| /// is responsible for a part of the query execution). Within the plan fragment, there |
| /// may be multiple instances of the UDF executing concurrently with multiple |
| /// FunctionContexts sharing this state, meaning that the state must be |
| /// thread-safe. The Prepare() function for the UDF may be called with this scope |
| /// concurrently on a single host if the UDF will be evaluated in multiple plan |
| /// fragments on that host. In general, read-only state that doesn't need to be |
| /// recomputed for every UDF call should be fragment-local. |
| /// TODO: Move FRAGMENT_LOCAL states to query_state for multi-threading. |
| FRAGMENT_LOCAL, |
| |
| /// Indicates that the function state is local to the execution thread. This state |
| /// does not need to be thread-safe. However, this state will be initialized (via the |
| /// Prepare() function) once for every execution thread, so fragment-local state |
| /// should be used when possible for better performance. In general, inexpensive |
| /// shared state that is written to by the UDF (e.g. scratch space) should be |
| /// thread-local. |
| THREAD_LOCAL, |
| }; |
| |
| /// Returns the version of Impala that's currently running. |
| ImpalaVersion version() const; |
| |
| /// Returns the user that is running the query. Returns NULL if it is not |
| /// available. |
| const char* user() const; |
| |
| /// Returns the effective user for authorization purposes. If a delegated user is |
| /// configured, returns that user, otherwise returns the same as user(). |
| const char* effective_user() const; |
| |
| /// Returns the query_id for the current query. |
| UniqueId query_id() const; |
| |
| /// Sets an error for this UDF. If this is called, this will trigger the |
| /// query to fail. |
| void SetError(const char* error_msg); |
| |
| /// Adds a warning that is returned to the user. This can include things like |
| /// overflow or other recoverable error conditions. |
| /// Warnings are capped at a maximum number. Returns true if the warning was |
| /// added and false if it was ignored due to the cap. |
| bool AddWarning(const char* warning_msg); |
| |
| /// Returns true if there's been an error set. |
| bool has_error() const; |
| |
| /// Returns the current error message. Returns NULL if there is no error. |
| const char* error_msg() const; |
| |
| /// Allocates memory. All UDF/UDAs should use this if possible instead of malloc/new. |
| /// The UDF/UDA is responsible for calling Free() on all buffers returned by Allocate(). |
| /// If Allocate() fails or causes the memory limit to be exceeded, the error will be |
| /// set in this object causing the query to fail. |
| /// TODO: 'byte_size' should be 64-bit. See IMPALA-2756. |
| uint8_t* Allocate(int byte_size) NOEXCEPT; |
| |
| /// Wrapper around Allocate() to allocate a buffer of the given type "T". |
| template<typename T> |
| T* Allocate() { |
| return reinterpret_cast<T*>(Allocate(sizeof(T))); |
| } |
| |
| /// Reallocates 'ptr' to the new byte_size. If the currently underlying allocation |
| /// is big enough, the original ptr will be returned. If the allocation needs to |
| /// grow, a new allocation is made that is at least 'byte_size' and the contents |
| /// of 'ptr' will be copied into it. If the new allocation fails or causes the |
| /// memory limit to be exceeded, the error will be set in this object. |
| /// |
| /// This should be used for buffers that constantly get appended to. |
| /// TODO: 'byte_size' should be 64-bit. See IMPALA-2756. |
| uint8_t* Reallocate(uint8_t* ptr, int byte_size) NOEXCEPT; |
| |
| /// Frees a buffer returned from Allocate() or Reallocate() |
| void Free(uint8_t* buffer) NOEXCEPT; |
| |
| /// For allocations that cannot use the Allocate() API provided by this |
| /// object, TrackAllocation()/Free() can be used to just keep count of the |
| /// byte sizes. For each call to TrackAllocation(), the UDF/UDA must call |
| /// the corresponding Free(). |
| void TrackAllocation(int64_t byte_size); |
| void Free(int64_t byte_size); |
| |
| /// Methods for maintaining state across UDF/UDA function calls. SetFunctionState() can |
| /// be used to store a pointer that can then be retreived via GetFunctionState(). If |
| /// GetFunctionState() is called when no pointer is set, it will return |
| /// NULL. SetFunctionState() does not take ownership of 'ptr'; it is up to the UDF/UDA |
| /// to clean up any function state if necessary. |
| void SetFunctionState(FunctionStateScope scope, void* ptr); |
| void* GetFunctionState(FunctionStateScope scope) const; |
| |
| /// Returns the return type information of this function. For UDAs, this is the final |
| /// return type of the UDA (e.g., the type returned by the finalize function). |
| const TypeDesc& GetReturnType() const; |
| |
| /// Returns the intermediate type for UDAs, i.e., the one returned by |
| /// update and merge functions. Returns INVALID_TYPE for UDFs. |
| const TypeDesc& GetIntermediateType() const; |
| |
| /// Returns the number of arguments to this function (not including the FunctionContext* |
| /// argument or the output of a UDA). |
| /// For UDAs, returns the number of logical arguments of the aggregate function, not |
| /// the number of arguments of the C++ function being executed. |
| int GetNumArgs() const; |
| |
| /// Returns the type information for the arg_idx-th argument (0-indexed, not including |
| /// the FunctionContext* argument). Returns NULL if arg_idx is invalid. |
| /// For UDAs, returns the logical argument types of the aggregate function, not the |
| /// argument types of the C++ function being executed. |
| const TypeDesc* GetArgType(int arg_idx) const; |
| |
| /// Returns true if the arg_idx-th input argument (indexed in the same way as |
| /// GetArgType()) is a constant (e.g. 5, "string", 1 + 1). |
| bool IsArgConstant(int arg_idx) const; |
| |
| /// Returns a pointer to the value of the arg_idx-th input argument (indexed in the |
| /// same way as GetArgType()). Returns NULL if the argument is not constant. This |
| /// function can be used to obtain user-specified constants in a UDF's Init() or |
| /// Close() functions. |
| AnyVal* GetConstantArg(int arg_idx) const; |
| |
| /// TODO: Do we need to add arbitrary key/value metadata. This would be plumbed |
| /// through the query. E.g. "select UDA(col, 'sample=true') from tbl". |
| /// const char* GetMetadata(const char*) const; |
| |
| /// TODO: Add mechanism for UDAs to update stats similar to runtime profile counters |
| |
| /// TODO: Add mechanism to query for table/column stats |
| |
| /// Returns the underlying opaque implementation object. The UDF/UDA should not |
| /// use this. This is used internally. |
| impala::FunctionContextImpl* impl() const { return impl_; } |
| |
| ~FunctionContext(); |
| |
| private: |
| friend class impala::FunctionContextImpl; |
| FunctionContext(); |
| |
| /// Disable copy ctor and assignment operator |
| FunctionContext(const FunctionContext& other); |
| FunctionContext& operator=(const FunctionContext& other); |
| |
| impala::FunctionContextImpl* impl_; // Owned by this object. |
| }; |
| |
| //---------------------------------------------------------------------------- |
| //------------------------------- UDFs --------------------------------------- |
| //---------------------------------------------------------------------------- |
| /// The UDF must implement this function prototype. This is not a typedef as the actual |
| /// UDF's signature varies from UDF to UDF. |
| /// typedef <*Val> Evaluate(FunctionContext* context, <const Val& arg>); |
| /// |
| /// The UDF must return one of the *Val structs. The UDF must accept a pointer to a |
| /// FunctionContext object and then a const reference for each of the input arguments. |
| /// Examples of valid Udf signatures are: |
| /// 1) DoubleVal Example1(FunctionContext* context); |
| /// 2) IntVal Example2(FunctionContext* context, const IntVal& a1, const DoubleVal& a2); |
| /// |
| /// UDFs can be variadic. The variable arguments must all come at the end and must be |
| /// the same type. An example signature is: |
| /// StringVal Concat(FunctionContext* context, const StringVal& separator, |
| /// int num_var_args, const StringVal* args); |
| /// In this case args[0] is the first variable argument and args[num_var_args - 1] is |
| /// the last. |
| /// |
| /// ------- Memory Management ------- |
| /// --------------------------------- |
| /// The UDF can assume that memory from input arguments will have the same lifetime as |
| /// results for the UDF. In other words, the UDF can return memory from input arguments |
| /// without making copies. For example, a function like substring will not need to |
| /// allocate and copy the smaller string. |
| /// |
| /// Any state needed across calls must be stored and accessed via |
| /// FunctionContext::SetFunctionState() and FunctionContext::GetFunctionState(). The UDF |
| /// should not maintain any other state across calls since there is no guarantee on how |
| /// the execution is multithreaded or distributed. |
| /// |
| /// For StringVal return values, the UDF can use StringVal(FunctionContext*, int) |
| /// ctor or the function StringVal::CopyFrom(FunctionContext*, const uint8_t*, size_t). |
| /// The memory consumed by the StringVal will be managed by Impala. Please see the UDA |
| /// section below for details. |
| /// |
| /// -------- Execution Model -------- |
| /// --------------------------------- |
| /// Execution model: For each UDF use occurring in a given query, at least one |
| /// FunctionContext will be created. For a given FunctionContext, the UDF's functions are |
| /// never called concurrently and therefore do not need to be thread-safe. State shared |
| /// across UDF invocations should be initialized and cleaned up using prepare and close |
| /// functions (described below). |
| /// |
| /// Note that a single UDF use may produce multiple FunctionContexts for that UDF (this is |
| /// so the UDF can be executed concurrently in different threads). For example, the query |
| /// "select * from tbl where my_udf(x) > 0" may produce multiple FunctionContexts for |
| /// 'my_udf', each of which may concurrently be passed to 'my_udf's prepare, close, and |
| /// UDF functions. |
| /// |
| /// --- Prepare / Close Functions --- |
| /// --------------------------------- |
| /// The UDF can optionally include a prepare function, specified in the "CREATE FUNCTION" |
| /// statement using "prepare_fn=<prepare function symbol>". The prepare function is called |
| /// before any calls to the UDF to evaluate values. This is the appropriate time for the |
| /// UDF to initialize any shared data structures, validate versions, etc. If there is an |
| /// error, this function should call FunctionContext::SetError()/ |
| /// FunctionContext::AddWarning(). |
| /// |
| /// The prepare function is called multiple times with different FunctionStateScopes. It |
| /// will be called once per fragment with 'scope' set to FRAGMENT_LOCAL, and once per |
| /// execution thread with 'scope' set to THREAD_LOCAL. |
| typedef void (*UdfPrepare)(FunctionContext* context, |
| FunctionContext::FunctionStateScope scope); |
| |
| /// The UDF can also optionally include a close function, specified in the "CREATE |
| /// FUNCTION" statement using "close_fn=<close function symbol>". The close function is |
| /// called after all calls to the UDF have completed. This is the appropriate time for the |
| /// UDF to deallocate any shared data structures that are not needed to maintain the |
| /// results. If there is an error, this function should call FunctionContext::SetError()/ |
| /// FunctionContext::AddWarning(). |
| // |
| /// The close function is called multiple times with different FunctionStateScopes. It |
| /// will be called once per fragment with 'scope' set to FRAGMENT_LOCAL, and once per |
| /// execution thread with 'scope' set to THREAD_LOCAL. |
| typedef void (*UdfClose)(FunctionContext* context, |
| FunctionContext::FunctionStateScope scope); |
| |
| //---------------------------------------------------------------------------- |
| //------------------------------- UDAs --------------------------------------- |
| //---------------------------------------------------------------------------- |
| /// The UDA execution is broken up into a few steps. The general calling pattern |
| /// is one of these: |
| /// 1) Init(), Update() (repeatedly), Serialize() |
| /// 2) Init(), Update() (repeatedly), Finalize() |
| /// 3) Init(), Merge() (repeatedly), Serialize() |
| /// 4) Init(), Merge() (repeatedly), Finalize() |
| /// The UDA is registered with three types: the result type, the input type and |
| /// the intermediate type. |
| /// |
| /// If the UDA needs a variable-sized buffer, it should use TYPE_STRING and allocate it |
| /// from the FunctionContext manually. |
| /// For UDAs that need a complex data structure as the intermediate state, the |
| /// intermediate type should be string and the UDA can cast the ptr to the structure |
| /// it is using. |
| /// |
| /// Memory Management: allocations that are referred to by the intermediate values |
| /// returned by Init(), Update() and Merge() must be allocated via |
| /// FunctionContext::Allocate() and freed via FunctionContext::Free(). Both Serialize() |
| /// and Finalize() are responsible for cleaning up the intermediate value and freeing |
| /// such allocations. StringVals returned to Impala directly by Serialize(), Finalize() |
| /// or GetValue() should be backed by temporary results memory allocated via the |
| /// StringVal(FunctionContext*, int) ctor, StringVal::CopyFrom(FunctionContext*, |
| /// const uint8_t*, size_t), or StringVal::Resize(). |
| /// |
| /// Note that in the rare case the StringVal ctor or StringVal::CopyFrom() fails to |
| /// allocate memory, the StringVal object will be marked as a null string. |
| /// Serialize()/Finalize() should handle allocation failures by checking the is_null |
| /// field of the StringVal object and carry out appropriate error handling action. |
| /// Similarly, FunctionContext::Allocate()/Reallocate() may also fail to allocate |
| /// memory so callers should check the returned values before using them. |
| /// |
| /// For clarity in documenting the UDA interface, the various types will be typedefed |
| /// here. The actual execution resolves all the types at runtime and none of these types |
| /// should actually be used. |
| /// |
| /// TODO: add an Init() variant that takes the initial input value to avoid initializing |
| /// then immediately overwriting the value. |
| typedef AnyVal InputType; |
| typedef AnyVal InputType2; |
| typedef AnyVal ResultType; |
| typedef AnyVal IntermediateType; |
| |
| /// UdaInit is called once for each aggregate group before calls to any of the |
| /// other functions below. |
| typedef void (*UdaInit)(FunctionContext* context, IntermediateType* result); |
| |
| /// This is called for each input value. The UDA should update result based on the |
| /// input value. The update function can take any number of input arguments. Here |
| /// are some examples: |
| typedef void (*UdaUpdate)(FunctionContext* context, const InputType& input, |
| IntermediateType* result); |
| typedef void (*UdaUpdate2)(FunctionContext* context, const InputType& input, |
| const InputType2& input2, IntermediateType* result); |
| |
| /// Merge an intermediate result 'src' into 'dst'. |
| typedef void (*UdaMerge)(FunctionContext* context, const IntermediateType& src, |
| IntermediateType* dst); |
| |
| /// Serialize the intermediate type. The serialized data is then sent across the |
| /// wire. |
| /// No additional functions will be called with this FunctionContext object and the |
| /// UDA should do final clean (e.g. Free()) here. |
| typedef const IntermediateType (*UdaSerialize)(FunctionContext* context, |
| const IntermediateType& type); |
| |
| /// Called once at the end to return the final value for this UDA. |
| /// No additional functions will be called with this FunctionContext object and the |
| /// UDA should do final clean (e.g. Free()) here. |
| typedef ResultType (*UdaFinalize)(FunctionContext* context, const IntermediateType& v); |
| |
| //---------------------------------------------------------------------------- |
| //-------------Implementation of the *Val structs ---------------------------- |
| //---------------------------------------------------------------------------- |
/// Common base of every *Val struct: it carries only the NULL indicator.
struct AnyVal {
  /// Builds a value whose NULL indicator is 'is_null' (non-NULL by default).
  AnyVal(bool is_null = false) : is_null(is_null) {}

  // True if this value is NULL. In that case every other field of the (derived) struct
  // holds an arbitrary value: UDF code must *not* rely on the remaining fields of a
  // NULL *Val having any particular content (e.g. 0 or -1).
  bool is_null;
};
| |
| struct BooleanVal : public AnyVal { |
| bool val; |
| |
| BooleanVal(bool val = false) : val(val) {} |
| |
| static BooleanVal null() { |
| BooleanVal result; |
| result.is_null = true; |
| return result; |
| } |
| |
| bool operator==(const BooleanVal& other) const { |
| if (is_null && other.is_null) return true; |
| if (is_null || other.is_null) return false; |
| return val == other.val; |
| } |
| bool operator!=(const BooleanVal& other) const { return !(*this == other); } |
| }; |
| |
| struct TinyIntVal : public AnyVal { |
| typedef int8_t underlying_type_t; |
| underlying_type_t val; |
| |
| TinyIntVal(underlying_type_t val = 0) : val(val) { } |
| |
| static TinyIntVal null() { |
| TinyIntVal result; |
| result.is_null = true; |
| return result; |
| } |
| |
| bool operator==(const TinyIntVal& other) const { |
| if (is_null && other.is_null) return true; |
| if (is_null || other.is_null) return false; |
| return val == other.val; |
| } |
| bool operator!=(const TinyIntVal& other) const { return !(*this == other); } |
| }; |
| |
| struct SmallIntVal : public AnyVal { |
| typedef int16_t underlying_type_t; |
| underlying_type_t val; |
| |
| SmallIntVal(underlying_type_t val = 0) : val(val) { } |
| |
| static SmallIntVal null() { |
| SmallIntVal result; |
| result.is_null = true; |
| return result; |
| } |
| |
| bool operator==(const SmallIntVal& other) const { |
| if (is_null && other.is_null) return true; |
| if (is_null || other.is_null) return false; |
| return val == other.val; |
| } |
| bool operator!=(const SmallIntVal& other) const { return !(*this == other); } |
| }; |
| |
| struct IntVal : public AnyVal { |
| typedef int32_t underlying_type_t; |
| underlying_type_t val; |
| |
| IntVal(underlying_type_t val = 0) : val(val) { } |
| |
| static IntVal null() { |
| IntVal result; |
| result.is_null = true; |
| return result; |
| } |
| |
| bool operator==(const IntVal& other) const { |
| if (is_null && other.is_null) return true; |
| if (is_null || other.is_null) return false; |
| return val == other.val; |
| } |
| bool operator!=(const IntVal& other) const { return !(*this == other); } |
| }; |
| |
| struct BigIntVal : public AnyVal { |
| typedef int64_t underlying_type_t; |
| underlying_type_t val; |
| |
| BigIntVal(underlying_type_t val = 0) : val(val) { } |
| |
| static BigIntVal null() { |
| BigIntVal result; |
| result.is_null = true; |
| return result; |
| } |
| |
| bool operator==(const BigIntVal& other) const { |
| if (is_null && other.is_null) return true; |
| if (is_null || other.is_null) return false; |
| return val == other.val; |
| } |
| bool operator!=(const BigIntVal& other) const { return !(*this == other); } |
| }; |
| |
| struct FloatVal : public AnyVal { |
| float val; |
| |
| FloatVal(float val = 0) : val(val) { } |
| |
| static FloatVal null() { |
| FloatVal result; |
| result.is_null = true; |
| return result; |
| } |
| |
| bool operator==(const FloatVal& other) const { |
| if (is_null && other.is_null) return true; |
| if (is_null || other.is_null) return false; |
| return val == other.val; |
| } |
| bool operator!=(const FloatVal& other) const { return !(*this == other); } |
| }; |
| |
| struct DoubleVal : public AnyVal { |
| double val; |
| |
| DoubleVal(double val = 0) : val(val) { } |
| |
| static DoubleVal null() { |
| DoubleVal result; |
| result.is_null = true; |
| return result; |
| } |
| |
| bool operator==(const DoubleVal& other) const { |
| if (is_null && other.is_null) return true; |
| if (is_null || other.is_null) return false; |
| return val == other.val; |
| } |
| bool operator!=(const DoubleVal& other) const { return !(*this == other); } |
| }; |
| |
| /// This object has a compatible storage format with boost::ptime. |
| struct TimestampVal : public AnyVal { |
| /// Gregorian date. This has the same binary format as boost::gregorian::date. |
| int32_t date; |
| /// Nanoseconds in current day. |
| int64_t time_of_day; |
| |
| TimestampVal(int32_t date = 0, int64_t time_of_day = 0) : |
| date(date), time_of_day(time_of_day) { |
| } |
| |
| static TimestampVal null() { |
| TimestampVal result; |
| result.is_null = true; |
| return result; |
| } |
| |
| bool operator==(const TimestampVal& other) const { |
| if (is_null && other.is_null) return true; |
| if (is_null || other.is_null) return false; |
| return date == other.date && time_of_day == other.time_of_day; |
| } |
| bool operator!=(const TimestampVal& other) const { return !(*this == other); } |
| }; |
| |
| /// Represents a DATE value. |
| /// - The minimum and maximum dates are 0001-01-01 and 9999-12-31. Valid dates must fall |
| /// in this range. |
| /// - Internally represents DATE values as number of days since 1970-01-01. |
| /// - This representation was chosen to be the same (bit-by-bit) as Parquet's date type. |
| /// (https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#date) |
| /// - Proleptic Gregorian calendar is used to calculate the number of days since epoch, |
| /// which can lead to different representation of historical dates compared to Hive. |
| /// (https://en.wikipedia.org/wiki/Proleptic_Gregorian_calendar). |
| struct DateVal : public AnyVal { |
| typedef int32_t underlying_type_t; |
| underlying_type_t val; |
| |
| explicit DateVal(underlying_type_t val = 0) : val(val) { } |
| |
| static DateVal null() { |
| DateVal result; |
| result.is_null = true; |
| return result; |
| } |
| |
| bool operator==(const DateVal& other) const { |
| if (is_null && other.is_null) return true; |
| if (is_null || other.is_null) return false; |
| return val == other.val; |
| } |
| bool operator!=(const DateVal& other) const { return !(*this == other); } |
| }; |
| |
| /// A String value represented as a buffer + length. |
| /// Note: there is a difference between a NULL string (is_null == true) and an |
| /// empty string (len == 0). |
| struct StringVal : public AnyVal { |
| |
| // It's important to keep this as unsigned to avoid comparing with negative number |
| // in case of overflow. |
| static const unsigned MAX_LENGTH = (1 << 30); |
| |
| // The length of the string buffer in bytes. |
| int len; |
| |
| // Pointer to the start of the string buffer. The buffer is not aligned and is not |
| // null-terminated. Functions must not read or write past the end of the buffer. |
| // I.e. accessing ptr[i] where i >= len is invalid. |
| uint8_t* ptr; |
| |
| /// Construct a StringVal from ptr/len. Note: this does not make a copy of ptr |
| /// so the buffer must exist as long as this StringVal does. |
| StringVal(uint8_t* ptr = NULL, int len = 0) : len(len), ptr(ptr) { |
| assert(len >= 0); |
| if (ptr == NULL) assert(len == 0); |
| } |
| |
| /// Construct a StringVal from NULL-terminated c-string. Note: this does not make a |
| /// copy of ptr so the underlying string must exist as long as this StringVal does. |
| StringVal(const char* ptr) |
| : len(strlen(ptr)), |
| ptr(const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(ptr))) {} |
| |
| /// Creates a StringVal, allocating a new buffer with 'len'. This should |
| /// be used to return StringVal objects in UDF/UDAs that need to allocate new |
| /// string memory. |
| /// |
| /// If the memory allocation fails, e.g. because the intermediate value would be too |
| /// large, the constructor will construct a NULL string and set an error on the function |
| /// context. |
| /// |
| /// The memory backing this StringVal is managed by the Impala runtime and so doesn't need |
| /// to be explicitly freed. |
| StringVal(FunctionContext* context, int len) NOEXCEPT; |
| |
| /// Resize a string value to 'len'. If 'len' is the same as or smaller than the current |
| /// length, truncates the string. Otherwise, increases the string's length, allocating |
| /// new memory and copying over the current contents if needed. The content of the new |
| /// space is undefined. If a resize fails, the length and contents of the StringVal are |
| /// unchanged. |
| /// |
| /// Resized strings can be returned from UDFs as the result value. Callers do not |
| /// otherwise need to be concerned with backing storage, which is managed by the |
| /// Impala runtime and freed at some point after the UDF returns. |
| /// |
| /// Returns true on success, false on failure. |
| bool Resize(FunctionContext* context, int len) NOEXCEPT; |
| |
| /// Will create a new StringVal with the given dimension and copy the data from the |
| /// parameters. In case of an error will return a NULL string and set an error on the |
| /// function context. |
| /// |
| /// Note that the memory for the buffer of the new StringVal is managed by Impala. |
| /// Impala will handle freeing it. Callers should not call Free() on the 'ptr' of |
| /// the StringVal returned. |
| static StringVal CopyFrom(FunctionContext* ctx, const uint8_t* buf, size_t len) |
| NOEXCEPT; |
| |
| static StringVal null() { |
| StringVal sv; |
| sv.is_null = true; |
| return sv; |
| } |
| |
| bool operator==(const StringVal& other) const { |
| if (is_null != other.is_null) return false; |
| if (is_null) return true; |
| if (len != other.len) return false; |
| return ptr == other.ptr || memcmp(ptr, other.ptr, len) == 0; |
| } |
| bool operator!=(const StringVal& other) const { return !(*this == other); } |
| }; |
| |
| struct DecimalVal : public impala_udf::AnyVal { |
| /// Decimal data is stored as an unscaled integer value. For example, the decimal 1.00 |
| /// (precision 3, scale 2) is stored as 100. The byte size necessary to store the |
| /// decimal depends on the precision, which determines which field of the union should |
| /// be used to store and manipulate the unscaled value. |
| /// |
| /// precision between 0-9: val4 (4 bytes) |
| /// precision between 10-18: val8 (8 bytes) |
| /// precision between 19-38: val16 (16 bytes) |
| /// |
| /// While it is always safe to use a larger field than necessary, it may result in worse |
| /// performance. For example, a UDF that only uses val16 can handle any precision but |
| /// may be slower than one that uses val4 or val8. This is because the least-significant |
| /// bits of all three union fields are the same (assuming a little-endian architecture). |
| union { |
| int32_t val4; |
| int64_t val8; |
| __int128_t val16; |
| }; |
| |
| DecimalVal() : val16(0) {} |
| DecimalVal(int32_t v) : val16(v) {} |
| DecimalVal(int64_t v) : val16(v) {} |
| DecimalVal(__int128_t v) : val16(v) {} |
| |
| static DecimalVal null() { |
| DecimalVal result; |
| result.is_null = true; |
| return result; |
| } |
| |
| DecimalVal& operator=(const DecimalVal& other) { |
| // Depending on the compiler, the default assignment operator may require 16-byte |
| // alignment of 'this' and 'other'. Cast to void* so the compiler doesn't change back |
| // to an assignment. |
| memcpy(reinterpret_cast<void*>(this), reinterpret_cast<const void*>(&other), |
| sizeof(DecimalVal)); |
| return *this; |
| } |
| |
| DecimalVal(const DecimalVal& other) { |
| *this = other; |
| } |
| }; |
| |
/// A buffer value is just a raw pointer to bytes.
typedef uint8_t* BufferVal;
| |
| } |
| |
| #endif |