| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include <fmt/format.h> |
| #include <glog/logging.h> |
| #include <stddef.h> |
| |
| #include <algorithm> |
| #include <boost/iterator/iterator_facade.hpp> |
| #include <memory> |
| #include <new> |
| #include <ostream> |
| #include <string> |
| #include <utility> |
| |
| #include "common/status.h" |
| #include "runtime/define_primitive_type.h" |
| #include "vec/aggregate_functions/aggregate_function.h" |
| #include "vec/columns/column.h" |
| #include "vec/columns/column_array.h" |
| #include "vec/columns/column_nullable.h" |
| #include "vec/columns/column_vector.h" |
| #include "vec/common/arena.h" |
| #include "vec/common/assert_cast.h" |
| #include "vec/common/columns_hashing.h" |
| #include "vec/common/hash_table/hash.h" |
| #include "vec/common/hash_table/hash_map_context.h" |
| #include "vec/common/pod_array_fwd.h" |
| #include "vec/common/string_ref.h" |
| #include "vec/common/uint128.h" |
| #include "vec/core/block.h" |
| #include "vec/core/call_on_type_index.h" |
| #include "vec/core/column_numbers.h" |
| #include "vec/core/column_with_type_and_name.h" |
| #include "vec/core/types.h" |
| #include "vec/data_types/data_type.h" |
| #include "vec/data_types/data_type_array.h" |
| #include "vec/data_types/data_type_nullable.h" |
| #include "vec/data_types/data_type_number.h" |
| #include "vec/functions/function.h" |
| #include "vec/functions/function_helpers.h" |
| #include "vec/functions/simple_function_factory.h" |
| |
| namespace doris { |
| class FunctionContext; |
| } // namespace doris |
| template <typename, typename> |
| struct DefaultHash; |
| |
| namespace doris::vectorized { |
| #include "common/compile_check_begin.h" |
| |
| class FunctionArrayEnumerateUniq : public IFunction { |
| private: |
| static constexpr size_t INITIAL_SIZE_DEGREE = 5; |
| |
| public: |
| using NullMapType = PaddedPODArray<UInt8>; |
| static constexpr auto name = "array_enumerate_uniq"; |
| static FunctionPtr create() { return std::make_shared<FunctionArrayEnumerateUniq>(); } |
| String get_name() const override { return name; } |
| bool is_variadic() const override { return true; } |
| size_t get_number_of_arguments() const override { return 1; } |
| |
| DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
| if (arguments.empty()) { |
| throw doris::Exception( |
| ErrorCode::INVALID_ARGUMENT, |
| "Incorrect number of arguments for array_enumerate_uniq function"); |
| } |
| bool is_nested_nullable = false; |
| for (size_t i = 0; i < arguments.size(); ++i) { |
| const DataTypeArray* array_type = |
| check_and_get_data_type<DataTypeArray>(remove_nullable(arguments[i]).get()); |
| if (!array_type) { |
| throw doris::Exception( |
| ErrorCode::INVALID_ARGUMENT, |
| "The {} -th argument for function: {} .must be an array but it type is {}", |
| i, get_name(), arguments[i]->get_name()); |
| } |
| is_nested_nullable = is_nested_nullable || array_type->get_nested_type()->is_nullable(); |
| } |
| |
| auto return_nested_type = std::make_shared<DataTypeInt64>(); |
| DataTypePtr return_type = std::make_shared<DataTypeArray>( |
| is_nested_nullable ? make_nullable(return_nested_type) : return_nested_type); |
| if (arguments[0]->is_nullable()) { |
| return_type = make_nullable(return_type); |
| } |
| return return_type; |
| } |
| |
| // When compiling `FunctionArrayEnumerateUniq::_execute_by_hash`, `AllocatorWithStackMemory::free(buf)` |
| // will be called when `delete HashMapContainer`. the gcc compiler will think that `size > N` and `buf` is not heap memory, |
| // and report an error `' void free(void*)' called on unallocated object 'hash_map'` |
| // This only fails on doris docker + gcc 11.1, no problem on doris docker + clang 16.0.1, |
| // no problem on ldb_toolchanin gcc 11.1 and clang 16.0.1. |
| #ifdef __GNUC__ |
| #pragma GCC diagnostic push |
| #pragma GCC diagnostic ignored "-Wfree-nonheap-object" |
| #endif // __GNUC__ |
| |
| Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
| uint32_t result, size_t input_rows_count) const override { |
| ColumnRawPtrs data_columns(arguments.size()); |
| const ColumnArray::Offsets64* offsets = nullptr; |
| ColumnPtr src_offsets; |
| Columns src_columns; // to keep ownership |
| |
| const ColumnArray* first_column_array = nullptr; |
| |
| for (size_t i = 0; i < arguments.size(); i++) { |
| src_columns.emplace_back( |
| block.get_by_position(arguments[i]).column->convert_to_full_column_if_const()); |
| ColumnPtr& cur_column = src_columns[i]; |
| const ColumnArray* array = |
| check_and_get_column<ColumnArray>(remove_nullable(cur_column->get_ptr()).get()); |
| if (!array) { |
| return Status::RuntimeError( |
| fmt::format("Illegal column {}, of first argument of function {}", |
| cur_column->get_name(), get_name())); |
| } |
| |
| const ColumnArray::Offsets64& cur_offsets = array->get_offsets(); |
| if (i == 0) { |
| first_column_array = array; |
| offsets = &cur_offsets; |
| src_offsets = array->get_offsets_ptr(); |
| } else if (*offsets != cur_offsets) { |
| return Status::RuntimeError(fmt::format( |
| "lengths of all arrays of function {} must be equal.", get_name())); |
| } |
| const auto* array_data = &array->get_data(); |
| data_columns[i] = array_data; |
| } |
| |
| const NullMapType* null_map = nullptr; |
| if (arguments.size() == 1 && data_columns[0]->is_nullable()) { |
| const ColumnNullable* nullable = check_and_get_column<ColumnNullable>(*data_columns[0]); |
| data_columns[0] = nullable->get_nested_column_ptr().get(); |
| null_map = &nullable->get_null_map_column().get_data(); |
| } |
| |
| auto dst_nested_column = ColumnInt64::create(); |
| ColumnInt64::Container& dst_values = dst_nested_column->get_data(); |
| dst_values.resize(offsets->back()); |
| |
| if (arguments.size() == 1) { |
| DataTypePtr src_column_type = block.get_by_position(arguments[0]).type; |
| if (src_column_type->is_nullable()) { |
| src_column_type = |
| assert_cast<const DataTypeNullable&>(*src_column_type).get_nested_type(); |
| } |
| auto nested_type = |
| assert_cast<const DataTypeArray&>(*src_column_type).get_nested_type(); |
| |
| auto call = [&](const auto& type) -> bool { |
| using DispatchType = std::decay_t<decltype(type)>; |
| _execute_number<typename DispatchType::ColumnType>(data_columns, *offsets, null_map, |
| dst_values); |
| return true; |
| }; |
| |
| if (is_string_type(nested_type->get_primitive_type())) { |
| _execute_string(data_columns, *offsets, null_map, dst_values); |
| } else if (!dispatch_switch_scalar(nested_type->get_primitive_type(), call)) { |
| return Status::RuntimeError(fmt::format( |
| "execute failed or unsupported types for function {}({})", get_name(), |
| block.get_by_position(arguments[0]).type->get_name())); |
| } |
| } else { |
| _execute_by_hash<MethodSerialized<PHHashMap<StringRef, Int64>>, false>( |
| data_columns, *offsets, nullptr, dst_values); |
| } |
| |
| ColumnPtr nested_column = dst_nested_column->get_ptr(); |
| if (first_column_array->get_data().is_nullable()) { |
| nested_column = ColumnNullable::create(nested_column, |
| ColumnUInt8::create(nested_column->size(), 0)); |
| } |
| ColumnPtr res_column = ColumnArray::create(std::move(nested_column), src_offsets); |
| if (arguments.size() == 1 && block.get_by_position(arguments[0]).column->is_nullable()) { |
| auto left_column = |
| block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); |
| const ColumnNullable* nullable = |
| check_and_get_column<ColumnNullable>(left_column.get()); |
| res_column = ColumnNullable::create( |
| res_column, nullable->get_null_map_column().clone_resized(nullable->size())); |
| } |
| |
| block.replace_by_position(result, std::move(res_column)); |
| return Status::OK(); |
| } |
| |
| private: |
| template <typename HashTableContext, bool is_nullable> |
| void _execute_by_hash(const ColumnRawPtrs& columns, const ColumnArray::Offsets64& offsets, |
| [[maybe_unused]] const NullMap* null_map, |
| ColumnInt64::Container& dst_values) const { |
| HashTableContext ctx; |
| ctx.init_serialized_keys(columns, static_cast<uint32_t>(columns[0]->size()), |
| null_map ? null_map->data() : nullptr); |
| |
| using KeyGetter = typename HashTableContext::State; |
| KeyGetter key_getter(columns); |
| |
| auto creator = [&](const auto& ctor, auto& key, auto& origin) { ctor(key, 0); }; |
| auto creator_for_null_key = [&](auto& mapped) { mapped = 0; }; |
| |
| ColumnArray::Offset64 prev_off = 0; |
| for (size_t off : offsets) { |
| ctx.hash_table->clear_and_shrink(); |
| Int64 null_count = 0; |
| for (ColumnArray::Offset64 j = prev_off; j < off; ++j) { |
| if constexpr (is_nullable) { |
| if ((*null_map)[j]) { |
| dst_values[j] = ++null_count; |
| continue; |
| } |
| } |
| auto& mapped = *ctx.lazy_emplace(key_getter, j, creator, creator_for_null_key); |
| mapped++; |
| dst_values[j] = mapped; |
| } |
| prev_off = off; |
| } |
| } |
| |
| template <typename ColumnType> |
| void _execute_number(const ColumnRawPtrs& columns, const ColumnArray::Offsets64& offsets, |
| const NullMapType* null_map, ColumnInt64::Container& dst_values) const { |
| using NestType = typename ColumnType::value_type; |
| using ElementNativeType = typename NativeType<NestType>::Type; |
| using HashMethod = |
| MethodOneNumber<ElementNativeType, |
| PHHashMap<ElementNativeType, Int64, HashCRC32<ElementNativeType>>>; |
| if (null_map != nullptr) { |
| _execute_by_hash<HashMethod, true>(columns, offsets, null_map, dst_values); |
| } else { |
| _execute_by_hash<HashMethod, false>(columns, offsets, nullptr, dst_values); |
| } |
| } |
| |
| void _execute_string(const ColumnRawPtrs& columns, const ColumnArray::Offsets64& offsets, |
| const NullMapType* null_map, ColumnInt64::Container& dst_values) const { |
| using HashMethod = MethodStringNoCache<PHHashMap<StringRef, Int64>>; |
| if (null_map != nullptr) { |
| _execute_by_hash<HashMethod, true>(columns, offsets, null_map, dst_values); |
| } else { |
| _execute_by_hash<HashMethod, false>(columns, offsets, nullptr, dst_values); |
| } |
| } |
| }; |
| |
| #ifdef __GNUC__ |
| #pragma GCC diagnostic pop |
| #endif // __GNUC__ |
| |
| void register_function_array_enumerate_uniq(SimpleFunctionFactory& factory) { |
| factory.register_function<FunctionArrayEnumerateUniq>(); |
| } |
| #include "common/compile_check_end.h" |
| } // namespace doris::vectorized |