| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| // This file is copied from |
| // https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/FunctionHash.cpp |
| // and modified by Doris |
| |
| #include "vec/functions/function_hash.h" |
| |
| #include <glog/logging.h> |
| #include <stddef.h> |
| |
| #include <algorithm> |
| #include <boost/iterator/iterator_facade.hpp> |
| #include <string> |
| |
| #include "common/status.h" |
| #include "util/hash_util.hpp" |
| #include "util/murmur_hash3.h" |
| #include "vec/columns/column.h" |
| #include "vec/columns/column_const.h" |
| #include "vec/columns/column_string.h" |
| #include "vec/columns/column_vector.h" |
| #include "vec/common/assert_cast.h" |
| #include "vec/common/bit_cast.h" |
| #include "vec/core/field.h" |
| #include "vec/data_types/data_type.h" |
| #include "vec/data_types/data_type_number.h" |
| #include "vec/functions/function_helpers.h" |
| #include "vec/functions/function_variadic_arguments.h" |
| #include "vec/functions/simple_function_factory.h" |
| #include "vec/utils/template_helpers.hpp" |
| |
| namespace doris::vectorized { |
| struct MurmurHash2Impl64 { |
| static constexpr auto name = "murmurHash2_64"; |
| using ReturnType = UInt64; |
| |
| static Status empty_apply(IColumn& icolumn, size_t input_rows_count) { |
| ColumnVector<ReturnType>& vec_to = assert_cast<ColumnVector<ReturnType>&>(icolumn); |
| vec_to.get_data().assign(input_rows_count, static_cast<ReturnType>(0xe28dbde7fe22e41c)); |
| return Status::OK(); |
| } |
| |
| static Status first_apply(const IDataType* type, const IColumn* column, size_t input_rows_count, |
| IColumn& icolumn) { |
| execute_any<true>(type, column, icolumn, input_rows_count); |
| return Status::OK(); |
| } |
| |
| static Status combine_apply(const IDataType* type, const IColumn* column, |
| size_t input_rows_count, IColumn& icolumn) { |
| execute_any<false>(type, column, icolumn, input_rows_count); |
| return Status::OK(); |
| } |
| |
| template <typename FromType, bool first> |
| static Status execute_int_type(const IColumn* column, IColumn& col_to, |
| size_t input_rows_count) { |
| if (const ColumnVector<FromType>* col_from = |
| check_and_get_column<ColumnVector<FromType>>(column)) { |
| const typename ColumnVector<FromType>::Container& vec_from = col_from->get_data(); |
| size_t size = vec_from.size(); |
| for (size_t i = 0; i < size; ++i) { |
| ReturnType val = HashUtil::murmur_hash2_64( |
| reinterpret_cast<const char*>(reinterpret_cast<const char*>(&vec_from[i])), |
| sizeof(vec_from[i]), 0); |
| if (first) |
| col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), 0); |
| else |
| assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i] = |
| IntHash64Impl::apply( |
| assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i]) ^ |
| val; |
| } |
| } else if (auto col_from_const = |
| check_and_get_column_const<ColumnVector<FromType>>(column)) { |
| auto value = col_from_const->template get_value<FromType>(); |
| ReturnType val; |
| val = IntHash64Impl::apply(ext::bit_cast<ReturnType>(value)); |
| for (size_t i = 0; i < input_rows_count; ++i) { |
| if (first) { |
| col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), 0); |
| } else { |
| assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i] = |
| IntHash64Impl::apply( |
| assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i]) ^ |
| val; |
| } |
| } |
| } else { |
| DCHECK(false); |
| return Status::NotSupported("Illegal column {} of argument of function {}", |
| column->get_name(), name); |
| } |
| return Status::OK(); |
| } |
| |
| template <bool first> |
| static Status execute_string(const IColumn* column, IColumn& col_to, size_t input_rows_count) { |
| if (const ColumnString* col_from = check_and_get_column<ColumnString>(column)) { |
| const typename ColumnString::Chars& data = col_from->get_chars(); |
| const typename ColumnString::Offsets& offsets = col_from->get_offsets(); |
| size_t size = offsets.size(); |
| |
| ColumnString::Offset current_offset = 0; |
| for (size_t i = 0; i < size; ++i) { |
| const ReturnType val = HashUtil::murmur_hash2_64( |
| reinterpret_cast<const char*>(&data[current_offset]), |
| offsets[i] - current_offset, 0); |
| |
| if (first) |
| col_to.insert_data(reinterpret_cast<const char*>(&val), 0); |
| else |
| assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i] = |
| IntHash64Impl::apply( |
| assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i]) ^ |
| val; |
| |
| current_offset = offsets[i]; |
| } |
| } else if (const ColumnConst* col_from_const = |
| check_and_get_column_const_string_or_fixedstring(column)) { |
| String value = col_from_const->get_value<String>().data(); |
| const ReturnType val = HashUtil::murmur_hash2_64(value.data(), value.size(), 0); |
| |
| for (size_t i = 0; i < input_rows_count; ++i) { |
| if (first) { |
| col_to.insert_data(reinterpret_cast<const char*>(&val), 0); |
| } else { |
| assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i] = |
| IntHash64Impl::apply( |
| assert_cast<ColumnVector<ReturnType>&>(col_to).get_data()[i]) ^ |
| val; |
| } |
| } |
| } else { |
| DCHECK(false); |
| return Status::NotSupported("Illegal column {} of argument of function {}", |
| column->get_name(), name); |
| } |
| return Status::OK(); |
| } |
| |
| template <bool first> |
| static Status execute_any(const IDataType* from_type, const IColumn* icolumn, IColumn& col_to, |
| size_t input_rows_count) { |
| WhichDataType which(from_type); |
| if (which.is_string()) { |
| return execute_string<first>(icolumn, col_to, input_rows_count); |
| } |
| |
| #define DISPATCH(TYPE, COLUMN_TYPE) \ |
| if (which.idx == TypeIndex::TYPE) \ |
| return execute_int_type<TYPE, first>(icolumn, col_to, input_rows_count); |
| NUMERIC_TYPE_TO_COLUMN_TYPE(DISPATCH) |
| #undef DISPATCH |
| return Status::NotSupported("argument_type {} not supported", from_type->get_name()); |
| } |
| }; |
| using FunctionMurmurHash2_64 = FunctionVariadicArgumentsBase<DataTypeUInt64, MurmurHash2Impl64>; |
| |
| template <typename ReturnType> |
| struct MurmurHash3ImplName {}; |
| |
| template <> |
| struct MurmurHash3ImplName<Int32> { |
| static constexpr auto name = "murmur_hash3_32"; |
| }; |
| |
| template <> |
| struct MurmurHash3ImplName<Int64> { |
| static constexpr auto name = "murmur_hash3_64"; |
| }; |
| |
| template <typename ReturnType> |
| struct MurmurHash3Impl { |
| static constexpr auto name = MurmurHash3ImplName<ReturnType>::name; |
| |
| static Status empty_apply(IColumn& icolumn, size_t input_rows_count) { |
| ColumnVector<ReturnType>& vec_to = assert_cast<ColumnVector<ReturnType>&>(icolumn); |
| vec_to.get_data().assign(input_rows_count, static_cast<ReturnType>(0xe28dbde7fe22e41c)); |
| return Status::OK(); |
| } |
| |
| static Status first_apply(const IDataType* type, const IColumn* column, size_t input_rows_count, |
| IColumn& icolumn) { |
| return execute<true>(type, column, input_rows_count, icolumn); |
| } |
| |
| static Status combine_apply(const IDataType* type, const IColumn* column, |
| size_t input_rows_count, IColumn& icolumn) { |
| return execute<false>(type, column, input_rows_count, icolumn); |
| } |
| |
| template <bool first> |
| static Status execute(const IDataType* type, const IColumn* column, size_t input_rows_count, |
| IColumn& col_to) { |
| auto* col_to_data = assert_cast<ColumnVector<ReturnType>&>(col_to).get_data().data(); |
| if (const ColumnString* col_from = check_and_get_column<ColumnString>(column)) { |
| const typename ColumnString::Chars& data = col_from->get_chars(); |
| const typename ColumnString::Offsets& offsets = col_from->get_offsets(); |
| size_t size = offsets.size(); |
| |
| ColumnString::Offset current_offset = 0; |
| for (size_t i = 0; i < size; ++i) { |
| if (first) { |
| if constexpr (std::is_same_v<ReturnType, Int32>) { |
| UInt32 val = HashUtil::murmur_hash3_32( |
| reinterpret_cast<const char*>(&data[current_offset]), |
| offsets[i] - current_offset, HashUtil::MURMUR3_32_SEED); |
| col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), |
| 0); |
| } else { |
| UInt64 val = 0; |
| murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]), |
| offsets[i] - current_offset, 0, &val); |
| col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), |
| 0); |
| } |
| } else { |
| if constexpr (std::is_same_v<ReturnType, Int32>) { |
| col_to_data[i] = HashUtil::murmur_hash3_32( |
| reinterpret_cast<const char*>(&data[current_offset]), |
| offsets[i] - current_offset, ext::bit_cast<UInt32>(col_to[i])); |
| } else { |
| murmur_hash3_x64_64(reinterpret_cast<const char*>(&data[current_offset]), |
| offsets[i] - current_offset, |
| ext::bit_cast<UInt64>(col_to[i]), col_to_data + i); |
| } |
| } |
| current_offset = offsets[i]; |
| } |
| } else if (const ColumnConst* col_from_const = |
| check_and_get_column_const_string_or_fixedstring(column)) { |
| String value = col_from_const->get_value<String>().data(); |
| for (size_t i = 0; i < input_rows_count; ++i) { |
| if (first) { |
| if constexpr (std::is_same_v<ReturnType, Int32>) { |
| UInt32 val = HashUtil::murmur_hash3_32(value.data(), value.size(), |
| HashUtil::MURMUR3_32_SEED); |
| col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), |
| 0); |
| } else { |
| UInt64 val = 0; |
| murmur_hash3_x64_64(value.data(), value.size(), 0, &val); |
| col_to.insert_data(const_cast<const char*>(reinterpret_cast<char*>(&val)), |
| 0); |
| } |
| } else { |
| if constexpr (std::is_same_v<ReturnType, Int32>) { |
| col_to_data[i] = HashUtil::murmur_hash3_32( |
| value.data(), value.size(), ext::bit_cast<UInt32>(col_to[i])); |
| } else { |
| murmur_hash3_x64_64(value.data(), value.size(), |
| ext::bit_cast<UInt64>(col_to[i]), col_to_data + i); |
| } |
| } |
| } |
| } else { |
| DCHECK(false); |
| return Status::NotSupported("Illegal column {} of argument of function {}", |
| column->get_name(), name); |
| } |
| return Status::OK(); |
| } |
| }; |
| using FunctionMurmurHash3_32 = FunctionVariadicArgumentsBase<DataTypeInt32, MurmurHash3Impl<Int32>>; |
| using FunctionMurmurHash3_64 = FunctionVariadicArgumentsBase<DataTypeInt64, MurmurHash3Impl<Int64>>; |
| |
| void register_function_hash(SimpleFunctionFactory& factory) { |
| factory.register_function<FunctionMurmurHash2_64>(); |
| factory.register_function<FunctionMurmurHash3_32>(); |
| factory.register_function<FunctionMurmurHash3_64>(); |
| } |
| } // namespace doris::vectorized |