| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| |
| #ifndef IMPALA_RUNTIME_TUPLE_H |
| #define IMPALA_RUNTIME_TUPLE_H |
| |
| #include "codegen/impala-ir.h" |
| #include "common/logging.h" |
| #include "gutil/macros.h" |
| #include "runtime/descriptors.h" |
| #include "runtime/mem-pool.h" |
| |
| namespace llvm { |
| class Function; |
| } |
| |
| namespace impala { |
| |
| struct CollectionValue; |
| struct StringValue; |
| class TupleDescriptor; |
| class TupleRow; |
| |
| /// A tuple is stored as a contiguous sequence of bytes containing a fixed number |
| /// of fixed-size slots. The slots are arranged in order of increasing byte length; |
| /// the tuple might contain padding between slots in order to align them according |
| /// to their type. |
| // |
| /// The contents of a tuple: |
| /// 1) a number of bytes holding a bitvector of null indicators |
| /// 2) bool slots |
| /// 3) tinyint slots |
| /// 4) smallint slots |
| /// 5) int slots |
| /// 6) float slots |
| /// 7) bigint slots |
| /// 8) double slots |
| /// 9) string slots |
| // |
| /// A tuple with 0 materialised slots is represented as NULL. |
| /// |
| /// TODO: Our projection of collection-typed slots breaks/augments the conventional |
| /// semantics of the null bits, because we rely on producers of array values to also |
| /// set the slot value in addition to the null bit. We should address this issue with |
| /// a proper projection that restores the intended (original) null bit semantics. |
| /// See also UnnestNode for details on the projection. |
| class Tuple { |
| public: |
| /// initialize individual tuple with data residing in mem pool |
| static Tuple* Create(int size, MemPool* pool) { |
| if (size == 0) return NULL; |
| Tuple* result = reinterpret_cast<Tuple*>(pool->Allocate(size)); |
| result->Init(size); |
| return result; |
| } |
| |
| void Init(int size) { memset(this, 0, size); } |
| |
| void ClearNullBits(const TupleDescriptor& tuple_desc) { |
| memset(reinterpret_cast<uint8_t*>(this) + tuple_desc.null_bytes_offset(), |
| 0, tuple_desc.num_null_bytes()); |
| } |
| |
| /// The total size of all data represented in this tuple (tuple data and referenced |
| /// string and collection data). |
| int64_t TotalByteSize(const TupleDescriptor& desc) const; |
| |
| /// The size of all referenced string and collection data. |
| int64_t VarlenByteSize(const TupleDescriptor& desc) const; |
| |
| /// Create a copy of 'this', including all of its referenced variable-length data |
| /// (i.e. strings and collections), using pool to allocate memory. Returns the copy. |
| Tuple* DeepCopy(const TupleDescriptor& desc, MemPool* pool); |
| |
| /// Create a copy of 'this', including all its referenced variable-length data |
| /// (i.e. strings and collections), using pool to allocate memory. This version does |
| /// not allocate a tuple, instead copying to 'dst'. 'dst' must already be allocated to |
| /// the correct size (i.e. TotalByteSize()). |
| void DeepCopy(Tuple* dst, const TupleDescriptor& desc, MemPool* pool); |
| |
| /// Create a copy of 'this', including all referenced variable-length data (i.e. strings |
| /// and collections), into 'data'. The tuple is written first, followed by any |
| /// variable-length data. 'data' and 'offset' will be incremented by the total number of |
| /// bytes written. 'data' must already be allocated to the correct size |
| /// (i.e. TotalByteSize()). |
| /// If 'convert_ptrs' is true, rewrites pointers that are part of the tuple as offsets |
| /// into 'data'. Otherwise they will remain pointers directly into data. The offsets are |
| /// determined by 'offset', where '*offset' corresponds to address '*data'. |
| void DeepCopy(const TupleDescriptor& desc, char** data, int* offset, |
| bool convert_ptrs = false); |
| |
| /// This function should only be called on tuples created by DeepCopy() with |
| /// 'convert_ptrs' = true. It takes all pointers contained in this tuple (i.e. in |
| /// StringValues and CollectionValues, including those contained within other |
| /// CollectionValues), and converts the offset values into pointers into |
| /// 'tuple_data'. 'tuple_data' should be the serialized tuple buffer created by |
| /// DeepCopy(). Note that 'tuple_data' should always be the beginning of this buffer, |
| /// regardless of this tuple's offset in 'tuple_data'. |
| void ConvertOffsetsToPointers(const TupleDescriptor& desc, uint8_t* tuple_data); |
| |
| /// Materialize 'this' by evaluating the expressions in 'materialize_exprs_ctxs' over |
| /// the specified 'row'. |
| /// |
| /// If non-NULL, 'pool' is used to allocate var-length data, otherwise var-length data |
| /// isn't copied. (Memory for this tuple itself must already be allocated.) 'NULL_POOL' |
| /// should be true if 'pool' is NULL and false otherwise. The template parameter serves |
| /// only to differentiate the NULL vs. non-NULL pool cases when we replace the function |
| /// calls during codegen; the parameter means there are two different function symbols. |
| /// |
| /// If 'COLLECT_STRING_VALS' is true, the materialized non-NULL string value slots and |
| /// the total length of the string slots are returned in 'non_null_string_values' and |
| /// 'total_string_lengths'. 'non_null_string_values' and 'total_string_lengths' must be |
| /// non-NULL in this case. 'non_null_string_values' does not need to be empty; its |
| /// original contents will be overwritten. |
| |
| /// TODO: this function does not collect other var-len types such as collections. |
| template <bool COLLECT_STRING_VALS, bool NULL_POOL> |
| inline void IR_ALWAYS_INLINE MaterializeExprs(TupleRow* row, |
| const TupleDescriptor& desc, const std::vector<ExprContext*>& materialize_expr_ctxs, |
| MemPool* pool, std::vector<StringValue*>* non_null_string_values = NULL, |
| int* total_string_lengths = NULL) { |
| DCHECK_EQ(NULL_POOL, pool == NULL); |
| DCHECK_EQ(materialize_expr_ctxs.size(), desc.slots().size()); |
| StringValue** non_null_string_values_array = NULL; |
| int num_non_null_string_values = 0; |
| if (COLLECT_STRING_VALS) { |
| DCHECK(non_null_string_values != NULL); |
| DCHECK(total_string_lengths != NULL); |
| // string::resize() will zero-initialize any new values, so we resize to the largest |
| // possible size here, then truncate the vector below once we know the actual size |
| // (which preserves already-written values). |
| non_null_string_values->resize(desc.string_slots().size()); |
| non_null_string_values_array = non_null_string_values->data(); |
| *total_string_lengths = 0; |
| } |
| MaterializeExprs<COLLECT_STRING_VALS, NULL_POOL>(row, desc, |
| materialize_expr_ctxs.data(), pool, non_null_string_values_array, |
| total_string_lengths, &num_non_null_string_values); |
| if (COLLECT_STRING_VALS) non_null_string_values->resize(num_non_null_string_values); |
| } |
| |
| /// Symbols (or substrings of the symbols) of MaterializeExprs(). These can be passed to |
| /// LlvmCodeGen::ReplaceCallSites(). |
| static const char* MATERIALIZE_EXPRS_SYMBOL; |
| static const char* MATERIALIZE_EXPRS_NULL_POOL_SYMBOL; |
| |
| /// Generates an IR version of MaterializeExprs(), returned in 'fn'. Currently only |
| /// 'collect_string_vals' = false is implemented. |
| /// |
| /// 'pool' may be NULL, in which case no pool-related code is generated. Otherwise |
| /// 'pool's address is used directly in the IR. Note that this requires generating |
| /// separate functions for the non-NULL and NULL cases, i.e., the 'pool' argument of the |
| /// generated function is ignored. There are two different MaterializeExprs symbols to |
| /// differentiate these cases when we replace the function calls during codegen. |
| static Status CodegenMaterializeExprs(LlvmCodeGen* codegen, bool collect_string_vals, |
| const TupleDescriptor& desc, const vector<ExprContext*>& materialize_expr_ctxs, |
| MemPool* pool, llvm::Function** fn); |
| |
| /// Turn null indicator bit on. For non-nullable slots, the mask will be 0 and |
| /// this is a no-op (but we don't have to branch to check is slots are nullable). |
| void SetNull(const NullIndicatorOffset& offset) { |
| char* null_indicator_byte = reinterpret_cast<char*>(this) + offset.byte_offset; |
| *null_indicator_byte |= offset.bit_mask; |
| } |
| |
| /// Turn null indicator bit off. |
| void SetNotNull(const NullIndicatorOffset& offset) { |
| char* null_indicator_byte = reinterpret_cast<char*>(this) + offset.byte_offset; |
| *null_indicator_byte &= ~offset.bit_mask; |
| } |
| |
| bool IsNull(const NullIndicatorOffset& offset) const { |
| const char* null_indicator_byte = |
| reinterpret_cast<const char*>(this) + offset.byte_offset; |
| return (*null_indicator_byte & offset.bit_mask) != 0; |
| } |
| |
| void* GetSlot(int offset) { |
| DCHECK(offset != -1); // -1 offset indicates non-materialized slot |
| return reinterpret_cast<char*>(this) + offset; |
| } |
| |
| const void* GetSlot(int offset) const { |
| DCHECK(offset != -1); // -1 offset indicates non-materialized slot |
| return reinterpret_cast<const char*>(this) + offset; |
| } |
| |
| StringValue* GetStringSlot(int offset) { |
| DCHECK(offset != -1); // -1 offset indicates non-materialized slot |
| return reinterpret_cast<StringValue*>(reinterpret_cast<char*>(this) + offset); |
| } |
| |
| const StringValue* GetStringSlot(int offset) const { |
| DCHECK(offset != -1); // -1 offset indicates non-materialized slot |
| return reinterpret_cast<const StringValue*>( |
| reinterpret_cast<const char*>(this) + offset); |
| } |
| |
| CollectionValue* GetCollectionSlot(int offset) { |
| DCHECK(offset != -1); // -1 offset indicates non-materialized slot |
| return reinterpret_cast<CollectionValue*>(reinterpret_cast<char*>(this) + offset); |
| } |
| |
| const CollectionValue* GetCollectionSlot(int offset) const { |
| DCHECK(offset != -1); // -1 offset indicates non-materialized slot |
| return reinterpret_cast<const CollectionValue*>( |
| reinterpret_cast<const char*>(this) + offset); |
| } |
| |
| /// For C++/IR interop, we need to be able to look up types by name. |
| static const char* LLVM_CLASS_NAME; |
| |
| private: |
| DISALLOW_COPY_AND_ASSIGN(Tuple); |
| |
| /// Copy all referenced string and collection data by allocating memory from pool, |
| /// copying data, then updating pointers in tuple to reference copied data. |
| void DeepCopyVarlenData(const TupleDescriptor& desc, MemPool* pool); |
| |
| /// Copies all referenced string and collection data into 'data'. Increments 'data' and |
| /// 'offset' by the number of bytes written. Recursively writes collection tuple data |
| /// and referenced collection and string data. |
| void DeepCopyVarlenData(const TupleDescriptor& desc, char** data, int* offset, |
| bool convert_ptrs); |
| |
| /// Implementation of MaterializedExprs(). This function is replaced during |
| /// codegen. 'num_non_null_string_values' must be initialized by the caller. |
| template <bool COLLECT_STRING_VALS, bool NULL_POOL> |
| void IR_NO_INLINE MaterializeExprs(TupleRow* row, const TupleDescriptor& desc, |
| ExprContext* const* materialize_expr_ctxs, MemPool* pool, |
| StringValue** non_null_string_values, int* total_string_lengths, |
| int* num_non_null_string_values); |
| }; |
| |
| } |
| |
| #endif |