blob: 60c2370bc6086a36a3a9cc15bdd377587c83ffa6 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
// This file is copied from
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/ColumnsHashing.h
// and modified by Doris
#pragma once
#include <memory>
#include <span>
#include <type_traits>
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/columns/column_string.h"
#include "vec/common/arena.h"
#include "vec/common/assert_cast.h"
#include "vec/common/columns_hashing_impl.h"
#include "vec/common/hash_table/ph_hash_map.h"
#include "vec/common/string_ref.h"
#include "vec/common/unaligned.h"
namespace doris::vectorized {
using Sizes = std::vector<size_t>;
inline Sizes get_key_sizes(const std::vector<DataTypePtr>& data_types) {
Sizes key_sizes;
for (const auto& data_type : data_types) {
key_sizes.emplace_back(data_type->get_size_of_value_in_memory() - data_type->is_nullable());
}
return key_sizes;
}
namespace ColumnsHashing {
/// Hash method used when the key consists of exactly one numeric column.
/// Any type is handled as UInt8/16/32/64 according to its bit width.
template <typename Value, typename Mapped, typename FieldType>
struct HashMethodOneNumber
        : public columns_hashing_impl::HashMethodBase<HashMethodOneNumber<Value, Mapped, FieldType>,
                                                      Value, Mapped, false> {
    using Self = HashMethodOneNumber<Value, Mapped, FieldType>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;

    // Bring the base-class lookup into this class's scope.
    using Base::find_key_with_hash;

    /// Constructs the method; the key columns argument is not read by this constructor.
    HashMethodOneNumber(const ColumnRawPtrs& /*key_columns*/) {}
};
/// Hash method used when the key consists of exactly one string column.
/// `place_string_to_arena` is part of the type's identity; nothing in this definition reads it.
template <typename Value, typename Mapped, bool place_string_to_arena = true>
struct HashMethodString
        : public columns_hashing_impl::HashMethodBase<
                  HashMethodString<Value, Mapped, place_string_to_arena>, Value, Mapped, false> {
    using Self = HashMethodString<Value, Mapped, place_string_to_arena>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;

    /// Constructs the method; the key columns argument is not read by this constructor.
    HashMethodString(const ColumnRawPtrs& /*key_columns*/) {}

protected:
    // The CRTP base is granted access to this class's non-public members.
    friend Base;
};
/** Hash by concatenating the serialized values of all keys.
  * A serialized value can be deserialized unambiguously given only the position where it
  * starts. For a string, for example, the serialized form holds the length of the string
  * first and then its bytes.
  * Therefore aggregating by several strings is free of ambiguity.
  */
template <typename Value, typename Mapped>
struct HashMethodSerialized
        : public columns_hashing_impl::HashMethodBase<HashMethodSerialized<Value, Mapped>, Value,
                                                      Mapped, false> {
    using Self = HashMethodSerialized<Value, Mapped>;
    using Base = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;

    /// Constructs the method; the key columns argument is not read by this constructor.
    HashMethodSerialized(const ColumnRawPtrs& /*key_columns*/) {}

protected:
    // The CRTP base is granted access to this class's non-public members.
    friend Base;
};
/// Hash method used when every key has a fixed length and all keys together fit into
/// N bits (128, for example).
template <typename Value, typename Key, typename Mapped>
struct HashMethodKeysFixed
        : public columns_hashing_impl::HashMethodBase<HashMethodKeysFixed<Value, Key, Mapped>,
                                                      Value, Mapped, false> {
    using Self = HashMethodKeysFixed<Value, Key, Mapped>;
    using BaseHashed = columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;

    /// Constructs the method; the key columns argument is not read by this constructor.
    HashMethodKeysFixed(const ColumnRawPtrs& /*key_columns*/) {}
};
/// Adapts a single-column hash method to a nullable key column.
///
/// The nested column of the nullable key is handed to `SingleColumnMethod`. Rows whose key
/// is null bypass the hash table lookup/insert and are served from the table's null-key
/// slot instead (`has_null_key_data()` / `get_null_key_data<Mapped>()`).
template <typename SingleColumnMethod, typename Mapped>
struct HashMethodSingleLowNullableColumn : public SingleColumnMethod {
    using Base = SingleColumnMethod;
    using FindResult = columns_hashing_impl::FindResultImpl<Mapped>;

    /// True when `Mapped` is a real type, false when it is `void`.
    static constexpr bool has_mapped = !std::is_same_v<Mapped, void>;

    /// The nullable key column; queried for the null flag of each row.
    const ColumnNullable* key_column;

    /// Returns a one-element column list holding the nested column of `col`.
    /// `col` must point to a `ColumnNullable`; this is only verified through `DCHECK`.
    static ColumnRawPtrs get_nested_column(const IColumn* col) {
        const auto* nullable_col = check_and_get_column<ColumnNullable>(*col);
        DCHECK(nullable_col != nullptr);
        const auto* const inner = nullable_col->get_nested_column_ptr().get();
        return {inner};
    }

    /// Builds the wrapped method from the nested column of `key_columns_nullable[0]` and
    /// remembers that first column as the nullable key column.
    HashMethodSingleLowNullableColumn(const ColumnRawPtrs& key_columns_nullable)
            : Base(get_nested_column(key_columns_nullable[0])),
              key_column(assert_cast<const ColumnNullable*>(key_columns_nullable[0])) {}

    /// Inserts the key of `row` if it is absent and returns a pointer to its mapped value.
    ///
    /// @param data         hash table that also exposes the null-key slot
    /// @param row          row index used to test the key column for null
    /// @param key          key (holder) forwarded to `data.lazy_emplace` for non-null rows
    /// @param hash_value   hash forwarded to `data.lazy_emplace`
    /// @param f            creator forwarded to `data.lazy_emplace`
    /// @param null_creator invoked only if the table had no null-key data on entry; it
    ///                     receives the null-key slot when `Mapped` is not `void` and no
    ///                     arguments otherwise
    /// @return for a null row: the address of the null-key slot, or `nullptr` when `Mapped`
    ///         is `void`; for any other row: `lookup_result_get_mapped` of the emplace result
    template <typename Data, typename Func, typename CreatorForNull, typename KeyHolder>
    ALWAYS_INLINE Mapped* lazy_emplace_key(Data& data, size_t row, KeyHolder&& key,
                                           size_t hash_value, Func&& f,
                                           CreatorForNull&& null_creator) {
        // Regular rows go straight to the hash table.
        if (!key_column->is_null_at(row)) {
            typename Data::LookupResult it;
            data.lazy_emplace(std::forward<KeyHolder>(key), it, hash_value,
                              std::forward<Func>(f));
            return lookup_result_get_mapped(it);
        }

        // Null row: record that a null key exists, remembering whether it already did.
        const bool already_had_null = data.has_null_key_data();
        data.has_null_key_data() = true;

        if constexpr (has_mapped) {
            if (!already_had_null) {
                std::forward<CreatorForNull>(null_creator)(
                        data.template get_null_key_data<Mapped>());
            }
            return &data.template get_null_key_data<Mapped>();
        } else {
            if (!already_had_null) {
                std::forward<CreatorForNull>(null_creator)();
            }
            return nullptr;
        }
    }

    /// Looks up the key of row `i` without inserting.
    ///
    /// @param data       hash table that also exposes the null-key slot
    /// @param i          row index used to test the key column for null
    /// @param key        key passed to `Base::find_key_impl` for non-null rows
    /// @param hash_value hash passed to `Base::find_key_impl`
    /// @return for a null row: a found result pointing at the null-key slot if the table
    ///         has null-key data, otherwise a not-found result; for any other row: the
    ///         result of `Base::find_key_impl`
    template <typename Data, typename Key>
    ALWAYS_INLINE FindResult find_key_with_hash(Data& data, size_t i, Key key, size_t hash_value) {
        if (!key_column->is_null_at(i)) {
            return Base::find_key_impl(key, hash_value, data);
        }
        if (!data.has_null_key_data()) {
            return FindResult {nullptr, false};
        }
        return FindResult {&data.template get_null_key_data<Mapped>(), true};
    }
};
} // namespace ColumnsHashing
} // namespace doris::vectorized