be/src/runtime/buffer_control_block.h - doris - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #pragma once

 #include <arrow/type.h>
 #include <cctz/time_zone.h>
 #include <gen_cpp/PaloInternalService_types.h>
 #include <gen_cpp/Types_types.h>
 #include <gen_cpp/segment_v2.pb.h>
 #include <stdint.h>

 #include <atomic>
 #include <condition_variable>
 #include <deque>
 #include <list>
 #include <memory>
 #include <mutex>

 #include "common/status.h"
 #include "runtime/memory/mem_tracker_limiter.h"
 #include "runtime/query_statistics.h"
 #include "util/runtime_profile.h"

 namespace google {
 namespace protobuf {
 class Closure;
 }
 } // namespace google

 namespace arrow {
 class RecordBatch;
 } // namespace arrow

 namespace brpc {
 class Controller;
 }

 namespace doris {

 namespace pipeline {
 class Dependency;
 } // namespace pipeline

 namespace vectorized {
 class Block;
 } // namespace vectorized

 class PFetchDataResult;
 class PFetchArrowDataResult;
 class RuntimeState;

 struct GetResultBatchCtx {
     brpc::Controller* cntl = nullptr;
     PFetchDataResult* result = nullptr;
     google::protobuf::Closure* done = nullptr;

     GetResultBatchCtx(brpc::Controller* cntl_, PFetchDataResult* result_,
                       google::protobuf::Closure* done_)
             : cntl(cntl_), result(result_), done(done_) {}

     void on_failure(const Status& status);
     void on_close(int64_t packet_seq, QueryStatistics* statistics = nullptr);
     void on_data(const std::unique_ptr<TFetchDataResult>& t_result, int64_t packet_seq,
                  bool eos = false);
 };

 struct GetArrowResultBatchCtx {
     brpc::Controller* cntl = nullptr;
     PFetchArrowDataResult* result = nullptr;
     google::protobuf::Closure* done = nullptr;

     GetArrowResultBatchCtx(brpc::Controller* cntl_, PFetchArrowDataResult* result_,
                            google::protobuf::Closure* done_)
             : cntl(cntl_), result(result_), done(done_) {}

     void on_failure(const Status& status);
     void on_close(int64_t packet_seq);
     void on_data(const std::shared_ptr<vectorized::Block>& block, int64_t packet_seq,
                  int be_exec_version,
                  segment_v2::CompressionTypePB fragement_transmission_compression_type,
                  std::string timezone, RuntimeProfile::Counter* serialize_batch_ns_timer,
                  RuntimeProfile::Counter* uncompressed_bytes_counter,
                  RuntimeProfile::Counter* compressed_bytes_counter);
 };

 // buffer used for result customer and producer
 class BufferControlBlock {
 public:
     BufferControlBlock(const TUniqueId& id, int buffer_size, RuntimeState* state);
     virtual ~BufferControlBlock();

     Status init();
     // Only one fragment is written, so can_sink returns true, then the sink must be executed
     virtual bool can_sink();
     virtual Status add_batch(std::unique_ptr<TFetchDataResult>& result, bool is_pipeline = false);
     virtual Status add_arrow_batch(std::shared_ptr<vectorized::Block>& result);

     virtual void get_batch(GetResultBatchCtx* ctx);
     // for ArrowFlightBatchLocalReader
     virtual Status get_arrow_batch(std::shared_ptr<vectorized::Block>* result,
                                    cctz::time_zone& timezone_obj);
     // for ArrowFlightBatchRemoteReader
     virtual void get_arrow_batch(GetArrowResultBatchCtx* ctx);

     virtual void register_arrow_schema(const std::shared_ptr<arrow::Schema>& arrow_schema);
     virtual Status find_arrow_schema(std::shared_ptr<arrow::Schema>* arrow_schema);

     // close buffer block, set _status to exec_status and set _is_close to true;
     // called because data has been read or error happened.
     Status close(Status exec_status);

     virtual void cancel(const Status& reason);

     [[nodiscard]] const TUniqueId& fragment_id() const { return _fragment_id; }
     [[nodiscard]] std::shared_ptr<MemTrackerLimiter> mem_tracker() { return _mem_tracker; }

     void update_return_rows(int64_t num_rows) {
         // _query_statistics may be null when the result sink init failed
         // or some other failure.
         // and the number of written rows is only needed when all things go well.
         if (_query_statistics != nullptr) {
             _query_statistics->set_returned_rows(num_rows);
         }
     }

 protected:
     virtual bool _get_batch_queue_empty() {
         return _fe_result_batch_queue.empty() && _arrow_flight_result_batch_queue.empty();
     }
     virtual void _update_batch_queue_empty() {}

     using FeResultQueue = std::list<std::unique_ptr<TFetchDataResult>>;
     using ArrowFlightResultQueue = std::list<std::shared_ptr<vectorized::Block>>;

     // result's query id
     TUniqueId _fragment_id;
     bool _is_close;
     std::atomic_bool _is_cancelled;
     Status _status;
     std::atomic_int _buffer_rows;
     const int _buffer_limit;
     int64_t _packet_num;
     size_t _last_batch_bytes = 0;

     // blocking queue for batch
     FeResultQueue _fe_result_batch_queue;
     ArrowFlightResultQueue _arrow_flight_result_batch_queue;
     // for arrow flight
     std::shared_ptr<arrow::Schema> _arrow_schema;

     // protects all subsequent data in this block
     std::mutex _lock;
     // signal arrival of new batch or the eos/cancelled condition
     std::condition_variable _data_arrival;
     // signal removal of data by stream consumer
     std::condition_variable _data_removal;
     // get arrow flight result is a sync method, need wait for data ready and return result.
     // TODO, waiting for data will block pipeline, so use a request pool to save requests waiting for data.
     std::condition_variable _arrow_data_arrival;

     std::deque<GetResultBatchCtx*> _waiting_rpc;
     std::deque<GetArrowResultBatchCtx*> _waiting_arrow_result_batch_rpc;

     // only used for FE using return rows to check limit
     std::unique_ptr<QueryStatistics> _query_statistics;

     std::string _timezone;
     cctz::time_zone _timezone_obj;
     int _be_exec_version;
     segment_v2::CompressionTypePB _fragement_transmission_compression_type;
     std::shared_ptr<MemTrackerLimiter> _mem_tracker;

     // only used for ArrowFlightBatchRemoteReader
     RuntimeProfile _profile;
     RuntimeProfile::Counter* _serialize_batch_ns_timer = nullptr;
     RuntimeProfile::Counter* _uncompressed_bytes_counter = nullptr;
     RuntimeProfile::Counter* _compressed_bytes_counter = nullptr;
 };

 class PipBufferControlBlock : public BufferControlBlock {
 public:
     PipBufferControlBlock(const TUniqueId& id, int buffer_size, RuntimeState* state)
             : BufferControlBlock(id, buffer_size, state) {}

     bool can_sink() override {
         return _get_batch_queue_empty() || _buffer_rows < _buffer_limit || _is_cancelled;
     }

     Status add_batch(std::unique_ptr<TFetchDataResult>& result, bool is_pipeline = true) override;

     Status add_arrow_batch(std::shared_ptr<vectorized::Block>& result) override;

     void get_batch(GetResultBatchCtx* ctx) override;

     Status get_arrow_batch(std::shared_ptr<vectorized::Block>* result,
                            cctz::time_zone& timezone_obj) override;

     void get_arrow_batch(GetArrowResultBatchCtx* ctx) override;

     void cancel(const Status& reason) override;

     void set_dependency(std::shared_ptr<pipeline::Dependency> result_sink_dependency);

 private:
     void _update_dependency();
     bool _get_batch_queue_empty() override { return _batch_queue_empty; }
     void _update_batch_queue_empty() override;

     std::atomic_bool _batch_queue_empty {false};
     std::shared_ptr<pipeline::Dependency> _result_sink_dependency;
 };

 } // namespace doris
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#pragma once

	#include <arrow/type.h>
	#include <cctz/time_zone.h>
	#include <gen_cpp/PaloInternalService_types.h>
	#include <gen_cpp/Types_types.h>
	#include <gen_cpp/segment_v2.pb.h>
	#include <stdint.h>

	#include <atomic>
	#include <condition_variable>
	#include <deque>
	#include <list>
	#include <memory>
	#include <mutex>

	#include "common/status.h"
	#include "runtime/memory/mem_tracker_limiter.h"
	#include "runtime/query_statistics.h"
	#include "util/runtime_profile.h"

	namespace google {
	namespace protobuf {
	class Closure;
	}
	} // namespace google

	namespace arrow {
	class RecordBatch;
	} // namespace arrow

	namespace brpc {
	class Controller;
	}

	namespace doris {

	namespace pipeline {
	class Dependency;
	} // namespace pipeline

	namespace vectorized {
	class Block;
	} // namespace vectorized

	class PFetchDataResult;
	class PFetchArrowDataResult;
	class RuntimeState;

	struct GetResultBatchCtx {
	brpc::Controller* cntl = nullptr;
	PFetchDataResult* result = nullptr;
	google::protobuf::Closure* done = nullptr;

	GetResultBatchCtx(brpc::Controller* cntl_, PFetchDataResult* result_,
	google::protobuf::Closure* done_)
	: cntl(cntl_), result(result_), done(done_) {}

	void on_failure(const Status& status);
	void on_close(int64_t packet_seq, QueryStatistics* statistics = nullptr);
	void on_data(const std::unique_ptr<TFetchDataResult>& t_result, int64_t packet_seq,
	bool eos = false);
	};

	struct GetArrowResultBatchCtx {
	brpc::Controller* cntl = nullptr;
	PFetchArrowDataResult* result = nullptr;
	google::protobuf::Closure* done = nullptr;

	GetArrowResultBatchCtx(brpc::Controller* cntl_, PFetchArrowDataResult* result_,
	google::protobuf::Closure* done_)
	: cntl(cntl_), result(result_), done(done_) {}

	void on_failure(const Status& status);
	void on_close(int64_t packet_seq);
	void on_data(const std::shared_ptr<vectorized::Block>& block, int64_t packet_seq,
	int be_exec_version,
	segment_v2::CompressionTypePB fragement_transmission_compression_type,
	std::string timezone, RuntimeProfile::Counter* serialize_batch_ns_timer,
	RuntimeProfile::Counter* uncompressed_bytes_counter,
	RuntimeProfile::Counter* compressed_bytes_counter);
	};

	// buffer used for result customer and producer
	class BufferControlBlock {
	public:
	BufferControlBlock(const TUniqueId& id, int buffer_size, RuntimeState* state);
	virtual ~BufferControlBlock();

	Status init();
	// Only one fragment is written, so can_sink returns true, then the sink must be executed
	virtual bool can_sink();
	virtual Status add_batch(std::unique_ptr<TFetchDataResult>& result, bool is_pipeline = false);
	virtual Status add_arrow_batch(std::shared_ptr<vectorized::Block>& result);

	virtual void get_batch(GetResultBatchCtx* ctx);
	// for ArrowFlightBatchLocalReader
	virtual Status get_arrow_batch(std::shared_ptr<vectorized::Block>* result,
	cctz::time_zone& timezone_obj);
	// for ArrowFlightBatchRemoteReader
	virtual void get_arrow_batch(GetArrowResultBatchCtx* ctx);

	virtual void register_arrow_schema(const std::shared_ptr<arrow::Schema>& arrow_schema);
	virtual Status find_arrow_schema(std::shared_ptr<arrow::Schema>* arrow_schema);

	// close buffer block, set _status to exec_status and set _is_close to true;
	// called because data has been read or error happened.
	Status close(Status exec_status);

	virtual void cancel(const Status& reason);

	[[nodiscard]] const TUniqueId& fragment_id() const { return _fragment_id; }
	[[nodiscard]] std::shared_ptr<MemTrackerLimiter> mem_tracker() { return _mem_tracker; }

	void update_return_rows(int64_t num_rows) {
	// _query_statistics may be null when the result sink init failed
	// or some other failure.
	// and the number of written rows is only needed when all things go well.
	if (_query_statistics != nullptr) {
	_query_statistics->set_returned_rows(num_rows);
	}
	}

	protected:
	virtual bool _get_batch_queue_empty() {
	return _fe_result_batch_queue.empty() && _arrow_flight_result_batch_queue.empty();
	}
	virtual void _update_batch_queue_empty() {}

	using FeResultQueue = std::list<std::unique_ptr<TFetchDataResult>>;
	using ArrowFlightResultQueue = std::list<std::shared_ptr<vectorized::Block>>;

	// result's query id
	TUniqueId _fragment_id;
	bool _is_close;
	std::atomic_bool _is_cancelled;
	Status _status;
	std::atomic_int _buffer_rows;
	const int _buffer_limit;
	int64_t _packet_num;
	size_t _last_batch_bytes = 0;

	// blocking queue for batch
	FeResultQueue _fe_result_batch_queue;
	ArrowFlightResultQueue _arrow_flight_result_batch_queue;
	// for arrow flight
	std::shared_ptr<arrow::Schema> _arrow_schema;

	// protects all subsequent data in this block
	std::mutex _lock;
	// signal arrival of new batch or the eos/cancelled condition
	std::condition_variable _data_arrival;
	// signal removal of data by stream consumer
	std::condition_variable _data_removal;
	// get arrow flight result is a sync method, need wait for data ready and return result.
	// TODO, waiting for data will block pipeline, so use a request pool to save requests waiting for data.
	std::condition_variable _arrow_data_arrival;

	std::deque<GetResultBatchCtx*> _waiting_rpc;
	std::deque<GetArrowResultBatchCtx*> _waiting_arrow_result_batch_rpc;

	// only used for FE using return rows to check limit
	std::unique_ptr<QueryStatistics> _query_statistics;

	std::string _timezone;
	cctz::time_zone _timezone_obj;
	int _be_exec_version;
	segment_v2::CompressionTypePB _fragement_transmission_compression_type;
	std::shared_ptr<MemTrackerLimiter> _mem_tracker;

	// only used for ArrowFlightBatchRemoteReader
	RuntimeProfile _profile;
	RuntimeProfile::Counter* _serialize_batch_ns_timer = nullptr;
	RuntimeProfile::Counter* _uncompressed_bytes_counter = nullptr;
	RuntimeProfile::Counter* _compressed_bytes_counter = nullptr;
	};

	class PipBufferControlBlock : public BufferControlBlock {
	public:
	PipBufferControlBlock(const TUniqueId& id, int buffer_size, RuntimeState* state)
	: BufferControlBlock(id, buffer_size, state) {}

	bool can_sink() override {
	return _get_batch_queue_empty() \|\| _buffer_rows < _buffer_limit \|\| _is_cancelled;
	}

	Status add_batch(std::unique_ptr<TFetchDataResult>& result, bool is_pipeline = true) override;

	Status add_arrow_batch(std::shared_ptr<vectorized::Block>& result) override;

	void get_batch(GetResultBatchCtx* ctx) override;

	Status get_arrow_batch(std::shared_ptr<vectorized::Block>* result,
	cctz::time_zone& timezone_obj) override;

	void get_arrow_batch(GetArrowResultBatchCtx* ctx) override;

	void cancel(const Status& reason) override;

	void set_dependency(std::shared_ptr<pipeline::Dependency> result_sink_dependency);

	private:
	void _update_dependency();
	bool _get_batch_queue_empty() override { return _batch_queue_empty; }
	void _update_batch_queue_empty() override;

	std::atomic_bool _batch_queue_empty {false};
	std::shared_ptr<pipeline::Dependency> _result_sink_dependency;
	};

	} // namespace doris