| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #pragma once |
| |
| #include <concurrentqueue.h> |
| #include <sqltypes.h> |
| |
| #include <atomic> |
| #include <functional> |
| #include <memory> |
| #include <mutex> |
| #include <thread> |
| #include <utility> |
| |
| #include "common/config.h" |
| #include "common/logging.h" |
| #include "gen_cpp/internal_service.pb.h" |
| #include "gutil/integral_types.h" |
| #include "pipeline/common/agg_utils.h" |
| #include "pipeline/common/join_utils.h" |
| #include "pipeline/common/set_utils.h" |
| #include "pipeline/exec/data_queue.h" |
| #include "pipeline/exec/join/process_hash_table_probe.h" |
| #include "util/brpc_closure.h" |
| #include "util/stack_util.h" |
| #include "vec/common/sort/partition_sorter.h" |
| #include "vec/common/sort/sorter.h" |
| #include "vec/core/block.h" |
| #include "vec/core/types.h" |
| #include "vec/spill/spill_stream.h" |
| |
| namespace doris::vectorized { |
| class AggFnEvaluator; |
| class VSlotRef; |
| } // namespace doris::vectorized |
| |
| namespace doris::pipeline { |
| #include "common/compile_check_begin.h" |
| class Dependency; |
| class PipelineTask; |
| struct BasicSharedState; |
| using DependencySPtr = std::shared_ptr<Dependency>; |
| class LocalExchangeSourceLocalState; |
| |
| static constexpr auto SLOW_DEPENDENCY_THRESHOLD = 60 * 1000L * 1000L * 1000L; |
| static constexpr auto TIME_UNIT_DEPENDENCY_LOG = 30 * 1000L * 1000L * 1000L; |
| static_assert(TIME_UNIT_DEPENDENCY_LOG < SLOW_DEPENDENCY_THRESHOLD); |
| |
| struct BasicSharedState { |
| ENABLE_FACTORY_CREATOR(BasicSharedState) |
| |
    template <class TARGET>
    TARGET* cast() {
        DCHECK(dynamic_cast<TARGET*>(this))
                << "Mismatched type! Current type is " << typeid(*this).name()
                << " and expected type is " << typeid(TARGET).name();
        return reinterpret_cast<TARGET*>(this);
    }
    template <class TARGET>
    const TARGET* cast() const {
        DCHECK(dynamic_cast<const TARGET*>(this))
                << "Mismatched type! Current type is " << typeid(*this).name()
                << " and expected type is " << typeid(TARGET).name();
        return reinterpret_cast<const TARGET*>(this);
    }
| std::vector<DependencySPtr> source_deps; |
| std::vector<DependencySPtr> sink_deps; |
| int id = 0; |
| std::set<int> related_op_ids; |
| |
| virtual ~BasicSharedState() = default; |
| |
| void create_source_dependencies(int num_sources, int operator_id, int node_id, |
| const std::string& name); |
| Dependency* create_source_dependency(int operator_id, int node_id, const std::string& name); |
| |
| Dependency* create_sink_dependency(int dest_id, int node_id, const std::string& name); |
| std::vector<DependencySPtr> get_dep_by_channel_id(int channel_id) { |
| DCHECK_LT(channel_id, source_deps.size()); |
| return {source_deps[channel_id]}; |
| } |
| }; |
| |
| class Dependency : public std::enable_shared_from_this<Dependency> { |
| public: |
| ENABLE_FACTORY_CREATOR(Dependency); |
| Dependency(int id, int node_id, std::string name, bool ready = false) |
| : _id(id), _node_id(node_id), _name(std::move(name)), _ready(ready) {} |
| virtual ~Dependency() = default; |
| |
| [[nodiscard]] int id() const { return _id; } |
| [[nodiscard]] virtual std::string name() const { return _name; } |
| BasicSharedState* shared_state() { return _shared_state; } |
| void set_shared_state(BasicSharedState* shared_state) { _shared_state = shared_state; } |
| virtual std::string debug_string(int indentation_level = 0); |
| bool ready() const { return _ready; } |
| |
    // Start the watcher. We use it to count how long this dependency blocks the current pipeline task.
| void start_watcher() { _watcher.start(); } |
| [[nodiscard]] int64_t watcher_elapse_time() { return _watcher.elapsed_time(); } |
| |
    // Which dependency the current pipeline task is blocked by. Returns `nullptr` if this dependency is ready.
| [[nodiscard]] Dependency* is_blocked_by(std::shared_ptr<PipelineTask> task = nullptr); |
    // Notify downstream pipeline tasks that this dependency is ready.
| void set_ready(); |
| void set_ready_to_read(int channel_id = 0) { |
| DCHECK_LT(channel_id, _shared_state->source_deps.size()) << debug_string(); |
| _shared_state->source_deps[channel_id]->set_ready(); |
| } |
| void set_ready_to_write() { |
| DCHECK_EQ(_shared_state->sink_deps.size(), 1) << debug_string(); |
| _shared_state->sink_deps.front()->set_ready(); |
| } |
| |
    // Notify downstream pipeline tasks that this dependency is blocked.
| void block() { |
| if (_always_ready) { |
| return; |
| } |
| std::unique_lock<std::mutex> lc(_always_ready_lock); |
| if (_always_ready) { |
| return; |
| } |
| _ready = false; |
| } |
| |
| void set_always_ready() { |
| if (_always_ready) { |
| return; |
| } |
| std::unique_lock<std::mutex> lc(_always_ready_lock); |
| if (_always_ready) { |
| return; |
| } |
| _always_ready = true; |
| set_ready(); |
| } |
| |
| protected: |
| void _add_block_task(std::shared_ptr<PipelineTask> task); |
| |
| const int _id; |
| const int _node_id; |
| const std::string _name; |
| std::atomic<bool> _ready; |
| |
| BasicSharedState* _shared_state = nullptr; |
| MonotonicStopWatch _watcher; |
| |
| std::mutex _task_lock; |
| std::vector<std::weak_ptr<PipelineTask>> _blocked_task; |
| |
| // If `_always_ready` is true, `block()` will never block tasks. |
| std::atomic<bool> _always_ready = false; |
| std::mutex _always_ready_lock; |
| }; |
| |
| struct FakeSharedState final : public BasicSharedState { |
| ENABLE_FACTORY_CREATOR(FakeSharedState) |
| }; |
| |
| class CountedFinishDependency final : public Dependency { |
| public: |
| using SharedState = FakeSharedState; |
| CountedFinishDependency(int id, int node_id, std::string name) |
| : Dependency(id, node_id, std::move(name), true) {} |
| |
| void add(uint32_t count = 1) { |
| std::unique_lock<std::mutex> l(_mtx); |
| if (!_counter) { |
| block(); |
| } |
| _counter += count; |
| } |
| |
    void sub() {
        std::unique_lock<std::mutex> l(_mtx);
        DCHECK_GT(_counter, 0) << "sub() called more times than add()";
        _counter--;
        if (!_counter) {
            set_ready();
        }
    }
| |
| std::string debug_string(int indentation_level = 0) override; |
| |
| private: |
| std::mutex _mtx; |
| uint32_t _counter = 0; |
| }; |
| |
| struct RuntimeFilterTimerQueue; |
| class RuntimeFilterTimer { |
| public: |
| RuntimeFilterTimer(int64_t registration_time, int32_t wait_time_ms, |
| std::shared_ptr<Dependency> parent, bool force_wait_timeout = false) |
| : _parent(std::move(parent)), |
| _registration_time(registration_time), |
| _wait_time_ms(wait_time_ms), |
| _force_wait_timeout(force_wait_timeout) {} |
| |
| // Called by runtime filter producer. |
| void call_ready(); |
| |
    // Called by RuntimeFilterTimerQueue, which is responsible for checking whether this runtime filter has timed out.
| void call_timeout(); |
| |
| int64_t registration_time() const { return _registration_time; } |
| int32_t wait_time_ms() const { return _wait_time_ms; } |
| |
| void set_local_runtime_filter_dependencies( |
| const std::vector<std::shared_ptr<Dependency>>& deps) { |
| _local_runtime_filter_dependencies = deps; |
| } |
| |
| bool should_be_check_timeout(); |
| |
| bool force_wait_timeout() { return _force_wait_timeout; } |
| |
| private: |
| friend struct RuntimeFilterTimerQueue; |
| std::shared_ptr<Dependency> _parent = nullptr; |
| std::vector<std::shared_ptr<Dependency>> _local_runtime_filter_dependencies; |
| std::mutex _lock; |
| int64_t _registration_time; |
| const int32_t _wait_time_ms; |
| // true only for group_commit_scan_operator |
| bool _force_wait_timeout; |
| }; |
| |
| struct RuntimeFilterTimerQueue { |
| constexpr static int64_t interval = 10; |
| void run() { _thread.detach(); } |
| void start(); |
| |
| void stop() { |
| _stop = true; |
| cv.notify_all(); |
| wait_for_shutdown(); |
| } |
| |
| void wait_for_shutdown() const { |
| while (!_shutdown) { |
| std::this_thread::sleep_for(std::chrono::milliseconds(interval)); |
| } |
| } |
| |
| ~RuntimeFilterTimerQueue() = default; |
| RuntimeFilterTimerQueue() { _thread = std::thread(&RuntimeFilterTimerQueue::start, this); } |
| void push_filter_timer(std::vector<std::shared_ptr<pipeline::RuntimeFilterTimer>>&& filter) { |
| std::unique_lock<std::mutex> lc(_que_lock); |
| _que.insert(_que.end(), filter.begin(), filter.end()); |
| cv.notify_all(); |
| } |
| |
| std::thread _thread; |
| std::condition_variable cv; |
| std::mutex cv_m; |
| std::mutex _que_lock; |
| std::atomic_bool _stop = false; |
| std::atomic_bool _shutdown = false; |
| std::list<std::shared_ptr<pipeline::RuntimeFilterTimer>> _que; |
| }; |
| |
| struct AggSharedState : public BasicSharedState { |
| ENABLE_FACTORY_CREATOR(AggSharedState) |
| public: |
| AggSharedState() { |
| agg_data = std::make_unique<AggregatedDataVariants>(); |
| agg_arena_pool = std::make_unique<vectorized::Arena>(); |
| } |
| ~AggSharedState() override { |
| if (!probe_expr_ctxs.empty()) { |
| _close_with_serialized_key(); |
| } else { |
| _close_without_key(); |
| } |
| } |
| |
| Status reset_hash_table(); |
| |
| bool do_limit_filter(vectorized::Block* block, size_t num_rows, |
| const std::vector<int>* key_locs = nullptr); |
| void build_limit_heap(size_t hash_table_size); |
| |
    // This function should be called only in the 1st phase.
    // 1st phase: is_merge=true, there is only one SlotRef.
    // 2nd phase: is_merge=false, there may be multiple exprs.
| static int get_slot_column_id(const vectorized::AggFnEvaluator* evaluator); |
| |
| AggregatedDataVariantsUPtr agg_data = nullptr; |
| std::unique_ptr<AggregateDataContainer> aggregate_data_container; |
| ArenaUPtr agg_arena_pool; |
| std::vector<vectorized::AggFnEvaluator*> aggregate_evaluators; |
| // group by k1,k2 |
| vectorized::VExprContextSPtrs probe_expr_ctxs; |
| size_t input_num_rows = 0; |
| std::vector<vectorized::AggregateDataPtr> values; |
| /// The total size of the row from the aggregate functions. |
| size_t total_size_of_aggregate_states = 0; |
| size_t align_aggregate_states = 1; |
| /// The offset to the n-th aggregate function in a row of aggregate functions. |
| vectorized::Sizes offsets_of_aggregate_states; |
| std::vector<size_t> make_nullable_keys; |
| |
| bool agg_data_created_without_key = false; |
| bool enable_spill = false; |
| bool reach_limit = false; |
| |
| int64_t limit = -1; |
| bool do_sort_limit = false; |
| vectorized::MutableColumns limit_columns; |
| int limit_columns_min = -1; |
| vectorized::PaddedPODArray<uint8_t> need_computes; |
| std::vector<uint8_t> cmp_res; |
| std::vector<int> order_directions; |
| std::vector<int> null_directions; |
| |
| struct HeapLimitCursor { |
| HeapLimitCursor(int row_id, vectorized::MutableColumns& limit_columns, |
| std::vector<int>& order_directions, std::vector<int>& null_directions) |
| : _row_id(row_id), |
| _limit_columns(limit_columns), |
| _order_directions(order_directions), |
| _null_directions(null_directions) {} |
| |
| HeapLimitCursor(const HeapLimitCursor& other) = default; |
| |
| HeapLimitCursor(HeapLimitCursor&& other) noexcept |
| : _row_id(other._row_id), |
| _limit_columns(other._limit_columns), |
| _order_directions(other._order_directions), |
| _null_directions(other._null_directions) {} |
| |
| HeapLimitCursor& operator=(const HeapLimitCursor& other) noexcept { |
| _row_id = other._row_id; |
| return *this; |
| } |
| |
| HeapLimitCursor& operator=(HeapLimitCursor&& other) noexcept { |
| _row_id = other._row_id; |
| return *this; |
| } |
| |
| bool operator<(const HeapLimitCursor& rhs) const { |
| for (int i = 0; i < _limit_columns.size(); ++i) { |
| const auto& _limit_column = _limit_columns[i]; |
| auto res = _limit_column->compare_at(_row_id, rhs._row_id, *_limit_column, |
| _null_directions[i]) * |
| _order_directions[i]; |
| if (res < 0) { |
| return true; |
| } else if (res > 0) { |
| return false; |
| } |
| } |
| return false; |
| } |
| |
| int _row_id; |
| vectorized::MutableColumns& _limit_columns; |
| std::vector<int>& _order_directions; |
| std::vector<int>& _null_directions; |
| }; |
| |
| std::priority_queue<HeapLimitCursor> limit_heap; |
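    // Note: std::priority_queue is a max-heap, so with HeapLimitCursor's
    // operator< the heap top is the row that orders last under the current
    // sort directions, i.e. the first candidate to be replaced when a better
    // row arrives.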
| |
| // Refresh the top limit heap with a new row |
| void refresh_top_limit(size_t row_id, const vectorized::ColumnRawPtrs& key_columns); |
| |
| private: |
| vectorized::MutableColumns _get_keys_hash_table(); |
| |
| void _close_with_serialized_key() { |
| std::visit(vectorized::Overload {[&](std::monostate& arg) -> void { |
| // Do nothing |
| }, |
| [&](auto& agg_method) -> void { |
| auto& data = *agg_method.hash_table; |
| data.for_each_mapped([&](auto& mapped) { |
| if (mapped) { |
| static_cast<void>(_destroy_agg_status(mapped)); |
| mapped = nullptr; |
| } |
| }); |
| if (data.has_null_key_data()) { |
| auto st = _destroy_agg_status( |
| data.template get_null_key_data< |
| vectorized::AggregateDataPtr>()); |
| if (!st) { |
| throw Exception(st.code(), st.to_string()); |
| } |
| } |
| }}, |
| agg_data->method_variant); |
| } |
| |
| void _close_without_key() { |
        // `prepare` may fail, in which case the agg data was never created; but
        // `close` is still called to destroy it. If the agg data holds a
        // BitmapValue, destroying it uninitialized would crash, so only destroy
        // it when it was actually created.
| if (agg_data_created_without_key) { |
| static_cast<void>(_destroy_agg_status(agg_data->without_key)); |
| agg_data_created_without_key = false; |
| } |
| } |
| Status _destroy_agg_status(vectorized::AggregateDataPtr data); |
| }; |
| |
| struct BasicSpillSharedState { |
| virtual ~BasicSpillSharedState() = default; |
| |
    // These two counters are shared with spill source operators as the initial
    // values of 'SpillWriteFileCurrentBytes' and 'SpillWriteFileCurrentCount'.
    // Total bytes of spill data written to disk files (after serialization).
| RuntimeProfile::Counter* _spill_write_file_total_size = nullptr; |
| RuntimeProfile::Counter* _spill_file_total_count = nullptr; |
| |
| void setup_shared_profile(RuntimeProfile* sink_profile) { |
| _spill_file_total_count = |
| ADD_COUNTER_WITH_LEVEL(sink_profile, "SpillWriteFileTotalCount", TUnit::UNIT, 1); |
| _spill_write_file_total_size = |
| ADD_COUNTER_WITH_LEVEL(sink_profile, "SpillWriteFileBytes", TUnit::BYTES, 1); |
| } |
| |
| virtual void update_spill_stream_profiles(RuntimeProfile* source_profile) = 0; |
| }; |
| |
| struct AggSpillPartition; |
| struct PartitionedAggSharedState : public BasicSharedState, |
| public BasicSpillSharedState, |
| public std::enable_shared_from_this<PartitionedAggSharedState> { |
| ENABLE_FACTORY_CREATOR(PartitionedAggSharedState) |
| |
| PartitionedAggSharedState() = default; |
| ~PartitionedAggSharedState() override = default; |
| |
| void update_spill_stream_profiles(RuntimeProfile* source_profile) override; |
| |
| void init_spill_params(size_t spill_partition_count); |
| |
| void close(); |
| |
| AggSharedState* in_mem_shared_state = nullptr; |
| std::shared_ptr<BasicSharedState> in_mem_shared_state_sptr; |
| |
| size_t partition_count; |
| size_t max_partition_index; |
| bool is_spilled = false; |
| std::atomic_bool is_closed = false; |
| std::deque<std::shared_ptr<AggSpillPartition>> spill_partitions; |
| |
| size_t get_partition_index(size_t hash_value) const { return hash_value % partition_count; } |
| }; |
| |
| struct AggSpillPartition { |
| static constexpr int64_t AGG_SPILL_FILE_SIZE = 1024 * 1024 * 1024; // 1G |
| |
| AggSpillPartition() = default; |
| |
| void close(); |
| |
| Status get_spill_stream(RuntimeState* state, int node_id, RuntimeProfile* profile, |
| vectorized::SpillStreamSPtr& spilling_stream); |
| |
| Status flush_if_full() { |
| DCHECK(spilling_stream_); |
| Status status; |
| // avoid small spill files |
| if (spilling_stream_->get_written_bytes() >= AGG_SPILL_FILE_SIZE) { |
| status = spilling_stream_->spill_eof(); |
| spilling_stream_.reset(); |
| } |
| return status; |
| } |
| |
| Status finish_current_spilling(bool eos = false) { |
| if (spilling_stream_) { |
| if (eos || spilling_stream_->get_written_bytes() >= AGG_SPILL_FILE_SIZE) { |
| auto status = spilling_stream_->spill_eof(); |
| spilling_stream_.reset(); |
| return status; |
| } |
| } |
| return Status::OK(); |
| } |
| |
| std::deque<vectorized::SpillStreamSPtr> spill_streams_; |
| vectorized::SpillStreamSPtr spilling_stream_; |
| }; |
| using AggSpillPartitionSPtr = std::shared_ptr<AggSpillPartition>; |
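
// Spill-flow sketch (illustrative, assuming `partition` is an
// AggSpillPartitionSPtr and `state`/`profile` come from the local state):
//
//   vectorized::SpillStreamSPtr stream;
//   RETURN_IF_ERROR(partition->get_spill_stream(state, node_id, profile, stream));
//   // ... serialize and write blocks to `stream` ...
//   RETURN_IF_ERROR(partition->flush_if_full());          // rotate at ~1 GB
//   RETURN_IF_ERROR(partition->finish_current_spilling(/*eos=*/true));
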
| struct SortSharedState : public BasicSharedState { |
| ENABLE_FACTORY_CREATOR(SortSharedState) |
| public: |
| std::shared_ptr<vectorized::Sorter> sorter; |
| }; |
| |
| struct SpillSortSharedState : public BasicSharedState, |
| public BasicSpillSharedState, |
| public std::enable_shared_from_this<SpillSortSharedState> { |
| ENABLE_FACTORY_CREATOR(SpillSortSharedState) |
| |
| SpillSortSharedState() = default; |
| ~SpillSortSharedState() override = default; |
| |
| void update_spill_block_batch_row_count(RuntimeState* state, const vectorized::Block* block) { |
| auto rows = block->rows(); |
| if (rows > 0 && 0 == avg_row_bytes) { |
| avg_row_bytes = std::max((std::size_t)1, block->bytes() / rows); |
| spill_block_batch_row_count = |
| (state->spill_sort_batch_bytes() + avg_row_bytes - 1) / avg_row_bytes; |
| LOG(INFO) << "spill sort block batch row count: " << spill_block_batch_row_count; |
| } |
| } |
| |
| void update_spill_stream_profiles(RuntimeProfile* source_profile) override; |
| |
| void close(); |
| |
| SortSharedState* in_mem_shared_state = nullptr; |
| bool enable_spill = false; |
| bool is_spilled = false; |
| std::atomic_bool is_closed = false; |
| std::shared_ptr<BasicSharedState> in_mem_shared_state_sptr; |
| |
| std::deque<vectorized::SpillStreamSPtr> sorted_streams; |
| size_t avg_row_bytes = 0; |
| size_t spill_block_batch_row_count; |
| }; |
| |
| struct UnionSharedState : public BasicSharedState { |
| ENABLE_FACTORY_CREATOR(UnionSharedState) |
| |
| public: |
    UnionSharedState(int child_count = 1) : data_queue(child_count), _child_count(child_count) {}
| int child_count() const { return _child_count; } |
| DataQueue data_queue; |
| const int _child_count; |
| }; |
| |
| struct DataQueueSharedState : public BasicSharedState { |
| ENABLE_FACTORY_CREATOR(DataQueueSharedState) |
| public: |
| DataQueue data_queue; |
| }; |
| |
| class MultiCastDataStreamer; |
| |
| struct MultiCastSharedState : public BasicSharedState, |
| public BasicSpillSharedState, |
| public std::enable_shared_from_this<MultiCastSharedState> { |
| MultiCastSharedState(ObjectPool* pool, int cast_sender_count, int node_id); |
| std::unique_ptr<pipeline::MultiCastDataStreamer> multi_cast_data_streamer; |
| |
| void update_spill_stream_profiles(RuntimeProfile* source_profile) override; |
| }; |
| |
| struct AnalyticSharedState : public BasicSharedState { |
| ENABLE_FACTORY_CREATOR(AnalyticSharedState) |
| |
| public: |
| AnalyticSharedState() = default; |
| std::queue<vectorized::Block> blocks_buffer; |
| std::mutex buffer_mutex; |
| bool sink_eos = false; |
| std::mutex sink_eos_lock; |
| }; |
| |
| struct JoinSharedState : public BasicSharedState { |
    // For some join cases, we can apply a short-circuit strategy:
    // 1. _has_null_in_build_side = true
    // 2. The build side is empty and the join op is one of: inner join / right outer join / left semi / right semi / right anti
| bool _has_null_in_build_side = false; |
| bool short_circuit_for_probe = false; |
    // For some joins, when the build side is empty, we can return directly by adding some additional null data to the probe table.
| bool empty_right_table_need_probe_dispose = false; |
| JoinOpVariants join_op_variants; |
| }; |
| |
| struct HashJoinSharedState : public JoinSharedState { |
| ENABLE_FACTORY_CREATOR(HashJoinSharedState) |
| HashJoinSharedState() { |
| hash_table_variant_vector.push_back(std::make_shared<JoinDataVariants>()); |
| } |
| HashJoinSharedState(int num_instances) { |
| source_deps.resize(num_instances, nullptr); |
| hash_table_variant_vector.resize(num_instances, nullptr); |
| for (int i = 0; i < num_instances; i++) { |
| hash_table_variant_vector[i] = std::make_shared<JoinDataVariants>(); |
| } |
| } |
| std::shared_ptr<vectorized::Arena> arena = std::make_shared<vectorized::Arena>(); |
| |
| const std::vector<TupleDescriptor*> build_side_child_desc; |
| size_t build_exprs_size = 0; |
| std::shared_ptr<vectorized::Block> build_block; |
| std::shared_ptr<std::vector<uint32_t>> build_indexes_null; |
| |
    // Used by the shared hash table.
    // For the probe operator, the hash table in `_hash_table_variants` is
    // read-only if visited flags are not used (visited flags are used only in
    // right / full outer join).
    //
    // For broadcast join, although the hash table is read-only, some state in
    // `_hash_table_variants` can still be written. For example, serialized keys
    // are written into contiguous memory in `_hash_table_variants`. So before
    // execution, we should use a local `_hash_table_variants` that shares the
    // hash table.
| std::vector<std::shared_ptr<JoinDataVariants>> hash_table_variant_vector; |
| }; |
| |
| struct PartitionedHashJoinSharedState |
| : public HashJoinSharedState, |
| public BasicSpillSharedState, |
| public std::enable_shared_from_this<PartitionedHashJoinSharedState> { |
| ENABLE_FACTORY_CREATOR(PartitionedHashJoinSharedState) |
| |
| void update_spill_stream_profiles(RuntimeProfile* source_profile) override { |
| for (auto& stream : spilled_streams) { |
| if (stream) { |
| stream->update_shared_profiles(source_profile); |
| } |
| } |
| } |
| |
| std::unique_ptr<RuntimeState> inner_runtime_state; |
| std::shared_ptr<HashJoinSharedState> inner_shared_state; |
| std::vector<std::unique_ptr<vectorized::MutableBlock>> partitioned_build_blocks; |
| std::vector<vectorized::SpillStreamSPtr> spilled_streams; |
| bool need_to_spill = false; |
| }; |
| |
| struct NestedLoopJoinSharedState : public JoinSharedState { |
| ENABLE_FACTORY_CREATOR(NestedLoopJoinSharedState) |
| // if true, left child has no more rows to process |
| bool left_side_eos = false; |
| // Visited flags for each row in build side. |
| vectorized::MutableColumns build_side_visited_flags; |
| // List of build blocks, constructed in prepare() |
| vectorized::Blocks build_blocks; |
| }; |
| |
| struct PartitionSortNodeSharedState : public BasicSharedState { |
| ENABLE_FACTORY_CREATOR(PartitionSortNodeSharedState) |
| public: |
| std::queue<vectorized::Block> blocks_buffer; |
| std::mutex buffer_mutex; |
| std::vector<std::unique_ptr<vectorized::PartitionSorter>> partition_sorts; |
| bool sink_eos = false; |
| std::mutex sink_eos_lock; |
| std::mutex prepared_finish_lock; |
| }; |
| |
| struct SetSharedState : public BasicSharedState { |
| ENABLE_FACTORY_CREATOR(SetSharedState) |
| public: |
    /// default init
    vectorized::Block build_block; // build to source
    // Records the number of valid elements in the hash table.
    int64_t valid_element_in_hash_tbl = 0;
    // first: index mapped to column types
    // second: column id, which may point to the origin column or a cast column
    std::unordered_map<int, int> build_col_idx;
| |
| //// shared static states (shared, decided in prepare/open...) |
| |
| /// init in setup_local_state |
    std::unique_ptr<SetDataVariants> hash_table_variants =
            std::make_unique<SetDataVariants>(); // the real data lives here
| std::vector<bool> build_not_ignore_null; |
| |
    // The SET operator's children may have different nullability attributes.
    // If a calculation involves both nullable and non-nullable columns, the
    // final output should be a nullable column.
| Status update_build_not_ignore_null(const vectorized::VExprContextSPtrs& ctxs); |
| |
| size_t get_hash_table_size() const; |
    /// Initialized on both upstream sides.
    // The i-th result expr list refers to the i-th child.
| std::vector<vectorized::VExprContextSPtrs> child_exprs_lists; |
| |
| /// init in build side |
| size_t child_quantity; |
| vectorized::VExprContextSPtrs build_child_exprs; |
| std::vector<Dependency*> probe_finished_children_dependency; |
| |
| /// init in probe side |
| std::vector<vectorized::VExprContextSPtrs> probe_child_exprs_lists; |
| |
| std::atomic<bool> ready_for_read = false; |
| |
| /// called in setup_local_state |
| Status hash_table_init(); |
| }; |
| |
| enum class ExchangeType : uint8_t { |
| NOOP = 0, |
| // Shuffle data by Crc32HashPartitioner<LocalExchangeChannelIds>. |
| HASH_SHUFFLE = 1, |
| // Round-robin passthrough data blocks. |
| PASSTHROUGH = 2, |
| // Shuffle data by Crc32HashPartitioner<ShuffleChannelIds> (e.g. same as storage engine). |
| BUCKET_HASH_SHUFFLE = 3, |
| // Passthrough data blocks to all channels. |
| BROADCAST = 4, |
| // Passthrough data to channels evenly in an adaptive way. |
| ADAPTIVE_PASSTHROUGH = 5, |
| // Send all data to the first channel. |
| PASS_TO_ONE = 6, |
| }; |
| |
| inline std::string get_exchange_type_name(ExchangeType idx) { |
| switch (idx) { |
| case ExchangeType::NOOP: |
| return "NOOP"; |
| case ExchangeType::HASH_SHUFFLE: |
| return "HASH_SHUFFLE"; |
| case ExchangeType::PASSTHROUGH: |
| return "PASSTHROUGH"; |
| case ExchangeType::BUCKET_HASH_SHUFFLE: |
| return "BUCKET_HASH_SHUFFLE"; |
| case ExchangeType::BROADCAST: |
| return "BROADCAST"; |
| case ExchangeType::ADAPTIVE_PASSTHROUGH: |
| return "ADAPTIVE_PASSTHROUGH"; |
| case ExchangeType::PASS_TO_ONE: |
| return "PASS_TO_ONE"; |
| } |
| throw Exception(Status::FatalError("__builtin_unreachable")); |
| } |
| |
| struct DataDistribution { |
| DataDistribution(ExchangeType type) : distribution_type(type) {} |
| DataDistribution(ExchangeType type, const std::vector<TExpr>& partition_exprs_) |
| : distribution_type(type), partition_exprs(partition_exprs_) {} |
| DataDistribution(const DataDistribution& other) = default; |
| bool need_local_exchange() const { return distribution_type != ExchangeType::NOOP; } |
| DataDistribution& operator=(const DataDistribution& other) = default; |
| ExchangeType distribution_type; |
| std::vector<TExpr> partition_exprs; |
| }; |
| |
| class ExchangerBase; |
| |
| struct LocalExchangeSharedState : public BasicSharedState { |
| public: |
| ENABLE_FACTORY_CREATOR(LocalExchangeSharedState); |
| LocalExchangeSharedState(int num_instances); |
| ~LocalExchangeSharedState() override; |
| std::unique_ptr<ExchangerBase> exchanger {}; |
| std::vector<RuntimeProfile::Counter*> mem_counters; |
| std::atomic<int64_t> mem_usage = 0; |
| std::atomic<size_t> _buffer_mem_limit = config::local_exchange_buffer_mem_limit; |
    // We must add to mem_usage before enqueueing; otherwise a concurrent dequeue may subtract mem_usage first and drive it negative.
| std::mutex le_lock; |
| void sub_running_sink_operators(); |
| void sub_running_source_operators(); |
| void _set_always_ready() { |
| for (auto& dep : source_deps) { |
| DCHECK(dep); |
| dep->set_always_ready(); |
| } |
| for (auto& dep : sink_deps) { |
| DCHECK(dep); |
| dep->set_always_ready(); |
| } |
| } |
| |
| Dependency* get_sink_dep_by_channel_id(int channel_id) { return nullptr; } |
| |
| void set_ready_to_read(int channel_id) { |
| auto& dep = source_deps[channel_id]; |
| DCHECK(dep) << channel_id; |
| dep->set_ready(); |
| } |
| |
| void add_mem_usage(int channel_id, size_t delta) { mem_counters[channel_id]->update(delta); } |
| |
| void sub_mem_usage(int channel_id, size_t delta) { |
| mem_counters[channel_id]->update(-(int64_t)delta); |
| } |
| |
| void add_total_mem_usage(size_t delta) { |
| if (cast_set<int64_t>(mem_usage.fetch_add(delta) + delta) > _buffer_mem_limit) { |
| sink_deps.front()->block(); |
| } |
| } |
| |
| void sub_total_mem_usage(size_t delta) { |
| auto prev_usage = mem_usage.fetch_sub(delta); |
| DCHECK_GE(prev_usage - delta, 0) << "prev_usage: " << prev_usage << " delta: " << delta; |
| if (cast_set<int64_t>(prev_usage - delta) <= _buffer_mem_limit) { |
| sink_deps.front()->set_ready(); |
| } |
| } |
| |
| void set_low_memory_mode(RuntimeState* state) { |
| _buffer_mem_limit = std::min<int64_t>(config::local_exchange_buffer_mem_limit, |
| state->low_memory_mode_buffer_limit()); |
| } |
| }; |
| |
| struct FetchRpcStruct { |
| std::shared_ptr<PBackendService_Stub> stub; |
| PMultiGetRequestV2 request; |
| std::shared_ptr<doris::DummyBrpcCallback<PMultiGetResponseV2>> callback; |
| MonotonicStopWatch rpc_timer; |
| }; |
| |
| struct MaterializationSharedState : public BasicSharedState { |
| ENABLE_FACTORY_CREATOR(MaterializationSharedState) |
| public: |
| MaterializationSharedState() = default; |
| |
| Status init_multi_requests(const TMaterializationNode& tnode, RuntimeState* state); |
| Status create_muiltget_result(const vectorized::Columns& columns, bool eos, bool gc_id_map); |
| Status merge_multi_response(vectorized::Block* block); |
| |
| void create_counter_dependency(int operator_id, int node_id, const std::string& name); |
| |
| bool rpc_struct_inited = false; |
| AtomicStatus rpc_status; |
| |
| bool last_block = false; |
    // An empty materialization sink block does not need to merge blocks.
| bool need_merge_block = true; |
| vectorized::Block origin_block; |
    // The rowid columns of the origin block; they should be replaced by the columns of the result blocks.
| std::vector<int> rowid_locs; |
| std::vector<vectorized::MutableBlock> response_blocks; |
| std::map<int64_t, FetchRpcStruct> rpc_struct_map; |
    // Records which response block each row belongs to, so that the result
    // preserves the original row order. Zero means a NULL value.
| std::vector<std::vector<int64_t>> block_order_results; |
| }; |
| #include "common/compile_check_end.h" |
| } // namespace doris::pipeline |