| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // This file is copied from |
| // https://github.com/ClickHouse/ClickHouse/blob/master/src/Columns/ColumnString.h |
| // and modified by Doris |
| |
| #pragma once |
| |
| #include <glog/logging.h> |
| #include <sys/types.h> |
| |
| #include <cassert> |
| #include <cstdint> |
| #include <cstring> |
| #include <typeinfo> |
| #include <vector> |
| |
| #include "common/cast_set.h" |
| #include "common/compiler_util.h" // IWYU pragma: keep |
| #include "common/exception.h" |
| #include "common/status.h" |
| #include "runtime/define_primitive_type.h" |
| #include "util/hash_util.hpp" |
| #include "vec/columns/column.h" |
| #include "vec/common/assert_cast.h" |
| #include "vec/common/cow.h" |
| #include "vec/common/memcmp_small.h" |
| #include "vec/common/memcpy_small.h" |
| #include "vec/common/pod_array_fwd.h" |
| #include "vec/common/sip_hash.h" |
| #include "vec/common/string_ref.h" |
| #include "vec/core/field.h" |
| #include "vec/core/types.h" |
| |
| namespace doris::vectorized { |
| #include "common/compile_check_begin.h" |
| class Arena; |
| class ColumnSorter; |
| |
| /** Column for String values. |
| */ |
| template <typename T> |
| class ColumnStr final : public COWHelper<IColumn, ColumnStr<T>> { |
| public: |
| using Char = UInt8; |
| using Chars = PaddedPODArray<UInt8>; |
| |
| static constexpr size_t MAX_STRINGS_OVERFLOW_SIZE = 128; |
| |
| void static check_chars_length(size_t total_length, size_t element_number, size_t rows = 0) { |
| if constexpr (std::is_same_v<T, UInt32>) { |
| if (UNLIKELY(total_length > MAX_STRING_SIZE)) { |
| throw Exception(ErrorCode::STRING_OVERFLOW_IN_VEC_ENGINE, |
| "string column length is too large: total_length={}, " |
| "element_number={}, rows={}", |
| total_length, element_number, rows); |
| } |
| } |
| } |
| |
| private: |
| // currently Offsets is uint32, if chars.size() exceeds 4G, offset will overflow. |
| // limit chars.size() and check the size when inserting data into ColumnStr<T>. |
| static constexpr size_t MAX_STRING_SIZE = 0xffffffff; |
| |
| friend class COWHelper<IColumn, ColumnStr<T>>; |
| friend class OlapBlockDataConvertor; |
| |
| /// Maps i'th position to offset to i+1'th element. Last offset maps to the end of all chars (is the size of all chars). |
| PaddedPODArray<T> offsets; |
| |
| /// Bytes of strings, placed contiguously. |
| /// For convenience, every string ends with terminating zero byte. Note that strings could contain zero bytes in the middle. |
| Chars chars; |
| |
| // Start position of i-th element. |
| T ALWAYS_INLINE offset_at(ssize_t i) const { return offsets[i - 1]; } |
| |
| // Size of i-th element, including terminating zero. |
| // assume that the length of a single element is less than 32-bit |
| uint32_t ALWAYS_INLINE size_at(ssize_t i) const { |
| return uint32_t(offsets[i] - offsets[i - 1]); |
| } |
| size_t serialize_size_at(size_t row) const override { |
| auto string_size(size_at(row)); |
| return sizeof(string_size) + string_size; |
| } |
| |
| template <bool positive> |
| struct less; |
| template <bool positive> |
| struct lessWithCollation; |
| |
| ColumnStr() = default; |
| |
| ColumnStr(const ColumnStr<T>& src) |
| : offsets(src.offsets.begin(), src.offsets.end()), |
| chars(src.chars.begin(), src.chars.end()) {} |
| |
| public: |
| bool is_variable_length() const override { return true; } |
| |
| void sanity_check() const override; |
| void sanity_check_simple() const; |
| |
| std::string get_name() const override { return "String"; } |
| |
| size_t size() const override { return offsets.size(); } |
| |
| size_t byte_size() const override { return chars.size() + offsets.size() * sizeof(offsets[0]); } |
| |
| bool has_enough_capacity(const IColumn& src) const override; |
| |
| size_t allocated_bytes() const override { |
| return chars.allocated_bytes() + offsets.allocated_bytes(); |
| } |
| |
| MutableColumnPtr clone_resized(size_t to_size) const override; |
| |
| void shrink_padding_chars() override; |
| |
| Field operator[](size_t n) const override; |
| |
| void get(size_t n, Field& res) const override; |
| |
| StringRef get_data_at(size_t n) const override { |
| DCHECK_LT(n, size()); |
| sanity_check_simple(); |
| return StringRef(&chars[offset_at(n)], size_at(n)); |
| } |
| |
| String get_element(size_t n) const { return get_data_at(n).to_string(); } |
| |
| void insert_value(const String& value) { insert_data(value.data(), value.size()); } |
| |
| void insert(const Field& x) override; |
| |
| void insert_many_from(const IColumn& src, size_t position, size_t length) override; |
| |
| bool is_column_string64() const override { return sizeof(T) == sizeof(uint64_t); } |
| |
| void insert_from(const IColumn& src_, size_t n) override { |
| const ColumnStr<T>& src = assert_cast<const ColumnStr<T>&>(src_); |
| const size_t size_to_append = |
| src.offsets[n] - src.offsets[n - 1]; /// -1th index is Ok, see PaddedPODArray. |
| |
| if (!size_to_append) { |
| /// shortcut for empty string |
| offsets.push_back(chars.size()); |
| } else { |
| const size_t old_size = chars.size(); |
| const size_t offset = src.offsets[n - 1]; |
| const size_t new_size = old_size + size_to_append; |
| |
| check_chars_length(new_size, offsets.size() + 1); |
| |
| chars.resize(new_size); |
| memcpy_small_allow_read_write_overflow15(chars.data() + old_size, &src.chars[offset], |
| size_to_append); |
| offsets.push_back(new_size); |
| } |
| sanity_check_simple(); |
| } |
| |
| void insert_data(const char* pos, size_t length) override { |
| const size_t old_size = chars.size(); |
| const size_t new_size = old_size + length; |
| |
| if (length) { |
| check_chars_length(new_size, offsets.size() + 1); |
| chars.resize(new_size); |
| memcpy(chars.data() + old_size, pos, length); |
| } |
| offsets.push_back(new_size); |
| sanity_check_simple(); |
| } |
| |
| void insert_data_without_reserve(const char* pos, size_t length) { |
| const size_t old_size = chars.size(); |
| const size_t new_size = old_size + length; |
| |
| if (length) { |
| check_chars_length(new_size, offsets.size() + 1); |
| chars.resize(new_size); |
| memcpy(chars.data() + old_size, pos, length); |
| } |
| offsets.push_back_without_reserve(new_size); |
| sanity_check_simple(); |
| } |
| |
| /// Before insert strings, the caller should calculate the total size of strings, |
| /// and reserve the chars & the offsets. |
| void insert_many_strings_without_reserve(const StringRef* strings, size_t num) { |
| Char* data = chars.data(); |
| size_t offset = chars.size(); |
| data += offset; |
| size_t length = 0; |
| |
| const char* ptr = strings[0].data; |
| for (size_t i = 0; i != num; i++) { |
| size_t len = strings[i].size; |
| length += len; |
| offset += len; |
| offsets.push_back(offset); |
| |
| if (i != num - 1 && strings[i].data + len == strings[i + 1].data) { |
| continue; |
| } |
| |
| if (length != 0) { |
| DCHECK(ptr != nullptr); |
| memcpy(data, ptr, length); |
| data += length; |
| } |
| |
| if (LIKELY(i != num - 1)) { |
| ptr = strings[i + 1].data; |
| length = 0; |
| } |
| } |
| check_chars_length(offset, offsets.size()); |
| chars.resize(offset); |
| sanity_check_simple(); |
| } |
| |
| void insert_many_continuous_binary_data(const char* data, const uint32_t* offsets_, |
| const size_t num) override { |
| if (UNLIKELY(num == 0)) { |
| return; |
| } |
| const auto old_size = chars.size(); |
| const auto begin_offset = offsets_[0]; |
| const size_t total_mem_size = offsets_[num] - begin_offset; |
| if (LIKELY(total_mem_size > 0)) { |
| check_chars_length(total_mem_size + old_size, offsets.size() + num); |
| chars.resize(total_mem_size + old_size); |
| memcpy(chars.data() + old_size, data + begin_offset, total_mem_size); |
| } |
| const auto old_rows = offsets.size(); |
| auto tail_offset = offsets.back(); |
| DCHECK(tail_offset == old_size); |
| offsets.resize(old_rows + num); |
| auto* offsets_ptr = &offsets[old_rows]; |
| |
| for (size_t i = 0; i < num; ++i) { |
| offsets_ptr[i] = tail_offset + offsets_[i + 1] - begin_offset; |
| } |
| DCHECK(chars.size() == offsets.back()); |
| sanity_check_simple(); |
| } |
| |
| void insert_many_strings(const StringRef* strings, size_t num) override { |
| size_t new_size = 0; |
| for (size_t i = 0; i < num; i++) { |
| new_size += strings[i].size; |
| } |
| |
| const size_t old_size = chars.size(); |
| check_chars_length(old_size + new_size, offsets.size() + num); |
| chars.resize(old_size + new_size); |
| |
| Char* data = chars.data(); |
| size_t offset = old_size; |
| for (size_t i = 0; i < num; i++) { |
| size_t len = strings[i].size; |
| if (len) { |
| memcpy(data + offset, strings[i].data, len); |
| offset += len; |
| } |
| offsets.push_back(offset); |
| } |
| sanity_check_simple(); |
| } |
| |
| template <size_t copy_length> |
| void insert_many_strings_fixed_length(const StringRef* strings, size_t num) { |
| size_t new_size = 0; |
| for (size_t i = 0; i < num; i++) { |
| new_size += strings[i].size; |
| } |
| |
| const size_t old_size = chars.size(); |
| check_chars_length(old_size + new_size, offsets.size() + num); |
| chars.resize(old_size + new_size + copy_length); |
| |
| Char* data = chars.data(); |
| size_t offset = old_size; |
| for (size_t i = 0; i < num; i++) { |
| size_t len = strings[i].size; |
| if (len) { |
| memcpy(data + offset, strings[i].data, copy_length); |
| offset += len; |
| } |
| offsets.push_back(offset); |
| } |
| chars.resize(old_size + new_size); |
| sanity_check_simple(); |
| } |
| |
| void insert_many_strings_overflow(const StringRef* strings, size_t num, |
| size_t max_length) override { |
| if (max_length <= 8) { |
| insert_many_strings_fixed_length<8>(strings, num); |
| } else if (max_length <= 16) { |
| insert_many_strings_fixed_length<16>(strings, num); |
| } else if (max_length <= 32) { |
| insert_many_strings_fixed_length<32>(strings, num); |
| } else if (max_length <= 64) { |
| insert_many_strings_fixed_length<64>(strings, num); |
| } else if (max_length <= 128) { |
| insert_many_strings_fixed_length<128>(strings, num); |
| } else { |
| insert_many_strings(strings, num); |
| } |
| sanity_check_simple(); |
| } |
| |
| void insert_many_dict_data(const int32_t* data_array, size_t start_index, const StringRef* dict, |
| size_t num, uint32_t /*dict_num*/) override { |
| size_t offset_size = offsets.size(); |
| size_t old_size = chars.size(); |
| size_t new_size = old_size; |
| offsets.resize(offsets.size() + num); |
| |
| for (size_t i = 0; i < num; i++) { |
| int32_t codeword = data_array[i + start_index]; |
| new_size += dict[codeword].size; |
| offsets[offset_size + i] = static_cast<T>(new_size); |
| } |
| |
| if (new_size > std::numeric_limits<T>::max()) { |
| throw doris::Exception(ErrorCode::INTERNAL_ERROR, |
| "ColumnString insert size out of range type {} [{},{}]", |
| typeid(T).name(), std::numeric_limits<T>::min(), |
| std::numeric_limits<T>::max()); |
| } |
| check_chars_length(new_size, offsets.size()); |
| chars.resize(new_size); |
| |
| for (size_t i = start_index; i < start_index + num; i++) { |
| int32_t codeword = data_array[i]; |
| const auto& src = dict[codeword]; |
| memcpy(chars.data() + old_size, src.data, src.size); |
| old_size += src.size; |
| } |
| sanity_check_simple(); |
| } |
| |
| void pop_back(size_t n) override { |
| size_t nested_n = offsets.back() - offset_at(offsets.size() - n); |
| chars.resize(chars.size() - nested_n); |
| offsets.resize_assume_reserved(offsets.size() - n); |
| sanity_check_simple(); |
| } |
| |
| StringRef serialize_value_into_arena(size_t n, Arena& arena, char const*& begin) const override; |
| |
| const char* deserialize_and_insert_from_arena(const char* pos) override; |
| |
| void deserialize_vec(StringRef* keys, const size_t num_rows) override; |
| |
| size_t get_max_row_byte_size() const override; |
| |
| void update_xxHash_with_value(size_t start, size_t end, uint64_t& hash, |
| const uint8_t* __restrict null_data) const override { |
| if (null_data) { |
| for (size_t i = start; i < end; ++i) { |
| if (null_data[i] == 0) { |
| size_t string_size = size_at(i); |
| size_t offset = offset_at(i); |
| hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&chars[offset]), |
| string_size, hash); |
| } |
| } |
| } else { |
| for (size_t i = start; i < end; ++i) { |
| size_t string_size = size_at(i); |
| size_t offset = offset_at(i); |
| hash = HashUtil::xxHash64WithSeed(reinterpret_cast<const char*>(&chars[offset]), |
| string_size, hash); |
| } |
| } |
| } |
| |
| void update_crc_with_value(size_t start, size_t end, uint32_t& hash, |
| const uint8_t* __restrict null_data) const override { |
| if (null_data) { |
| for (size_t i = start; i < end; ++i) { |
| if (null_data[i] == 0) { |
| auto data_ref = get_data_at(i); |
| // If offset is uint32, size will not exceed, check the size when inserting data into ColumnStr<T>. |
| hash = HashUtil::zlib_crc_hash(data_ref.data, |
| static_cast<uint32_t>(data_ref.size), hash); |
| } |
| } |
| } else { |
| for (size_t i = start; i < end; ++i) { |
| auto data_ref = get_data_at(i); |
| hash = HashUtil::zlib_crc_hash(data_ref.data, static_cast<uint32_t>(data_ref.size), |
| hash); |
| } |
| } |
| } |
| |
| void update_hash_with_value(size_t n, SipHash& hash) const override { |
| size_t string_size = size_at(n); |
| size_t offset = offset_at(n); |
| |
| // TODO: Rethink we really need to update the string_size? |
| hash.update(reinterpret_cast<const char*>(&string_size), sizeof(string_size)); |
| hash.update(reinterpret_cast<const char*>(&chars[offset]), string_size); |
| } |
| |
| void update_crcs_with_value(uint32_t* __restrict hashes, PrimitiveType type, uint32_t rows, |
| uint32_t offset, |
| const uint8_t* __restrict null_data) const override; |
| |
| void update_hashes_with_value(uint64_t* __restrict hashes, |
| const uint8_t* __restrict null_data) const override { |
| auto s = size(); |
| if (null_data) { |
| for (int i = 0; i < s; i++) { |
| if (null_data[i] == 0) { |
| size_t string_size = size_at(i); |
| size_t offset = offset_at(i); |
| hashes[i] = HashUtil::xxHash64WithSeed( |
| reinterpret_cast<const char*>(&chars[offset]), string_size, hashes[i]); |
| } |
| } |
| } else { |
| for (int i = 0; i < s; i++) { |
| size_t string_size = size_at(i); |
| size_t offset = offset_at(i); |
| hashes[i] = HashUtil::xxHash64WithSeed( |
| reinterpret_cast<const char*>(&chars[offset]), string_size, hashes[i]); |
| } |
| } |
| } |
| |
| void insert_range_from(const IColumn& src, size_t start, size_t length) override; |
| |
| void insert_range_from_ignore_overflow(const IColumn& src, size_t start, |
| size_t length) override; |
| |
| void insert_indices_from(const IColumn& src, const uint32_t* indices_begin, |
| const uint32_t* indices_end) override; |
| |
| ColumnPtr filter(const IColumn::Filter& filt, ssize_t result_size_hint) const override; |
| size_t filter(const IColumn::Filter& filter) override; |
| |
| Status filter_by_selector(const uint16_t* sel, size_t sel_size, IColumn* col_ptr) override; |
| |
| MutableColumnPtr permute(const IColumn::Permutation& perm, size_t limit) const override; |
| |
| void sort_column(const ColumnSorter* sorter, EqualFlags& flags, IColumn::Permutation& perms, |
| EqualRange& range, bool last_column) const override; |
| |
| void insert_default() override { offsets.push_back(chars.size()); } |
| |
| void insert_many_defaults(size_t length) override { |
| offsets.resize_fill(offsets.size() + length, static_cast<T>(chars.size())); |
| sanity_check_simple(); |
| } |
| |
| int compare_at(size_t n, size_t m, const IColumn& rhs_, |
| int /*nan_direction_hint*/) const override { |
| const ColumnStr<T>& rhs = assert_cast<const ColumnStr<T>&>(rhs_); |
| return memcmp_small_allow_overflow15(chars.data() + offset_at(n), size_at(n), |
| rhs.chars.data() + rhs.offset_at(m), rhs.size_at(m)); |
| } |
| |
| void get_permutation(bool reverse, size_t limit, int nan_direction_hint, |
| IColumn::Permutation& res) const override; |
| |
| ColumnPtr replicate(const IColumn::Offsets& replicate_offsets) const override; |
| |
| void reserve(size_t n) override; |
| |
| void resize(size_t n) override; |
| |
| bool is_column_string() const override { return true; } |
| |
| bool structure_equals(const IColumn& rhs) const override { |
| return typeid(rhs) == typeid(ColumnStr<T>); |
| } |
| |
| bool is_ascii() const; |
| |
| Chars& get_chars() { return chars; } |
| const Chars& get_chars() const { return chars; } |
| |
| auto& get_offsets() { return offsets; } |
| const auto& get_offsets() const { return offsets; } |
| |
| void clear() override { |
| chars.clear(); |
| offsets.clear(); |
| } |
| |
| void replace_column_data(const IColumn& rhs, size_t row, size_t self_row = 0) override { |
| throw doris::Exception(ErrorCode::INTERNAL_ERROR, |
| "Method replace_column_data is not supported for ColumnString"); |
| } |
| |
| void compare_internal(size_t rhs_row_id, const IColumn& rhs, int nan_direction_hint, |
| int direction, std::vector<uint8_t>& cmp_res, |
| uint8_t* __restrict filter) const override; |
| |
| ColumnPtr convert_column_if_overflow() override; |
| |
| void erase(size_t start, size_t length) override; |
| |
| void serialize_vec(StringRef* keys, const size_t num_rows) const override; |
| size_t serialize_impl(char* pos, size_t row) const override; |
| size_t deserialize_impl(const char* pos) override; |
| }; |
| |
| using ColumnString = ColumnStr<UInt32>; |
| using ColumnString64 = ColumnStr<UInt64>; |
| } // namespace doris::vectorized |
| #include "common/compile_check_end.h" |