// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "runtime/krpc-data-stream-recvr.h"
#include <condition_variable>
#include <queue>
#include <boost/thread/locks.hpp>
#include <boost/thread/mutex.hpp>
#include "exec/kudu-util.h"
#include "kudu/rpc/rpc_context.h"
#include "kudu/util/monotime.h"
#include "kudu/util/trace.h"
#include "runtime/fragment-instance-state.h"
#include "runtime/krpc-data-stream-recvr.h"
#include "runtime/krpc-data-stream-mgr.h"
#include "runtime/mem-tracker.h"
#include "runtime/row-batch.h"
#include "runtime/sorted-run-merger.h"
#include "service/data-stream-service.h"
#include "util/debug-util.h"
#include "util/runtime-profile-counters.h"
#include "util/periodic-counter-updater.h"
#include "util/test-info.h"
#include "util/time.h"
#include "gen-cpp/data_stream_service.pb.h"
#include "common/names.h"
DECLARE_int32(datastream_service_num_deserialization_threads);
using kudu::MonoDelta;
using kudu::MonoTime;
using kudu::rpc::RpcContext;
using std::condition_variable_any;
namespace impala {
// Implements a FIFO queue of row batches from one or more senders. One queue is
// maintained per sender if is_merging_ is true for the enclosing receiver; otherwise,
// rows from all senders are placed in the same queue.
//
// Batches are added by senders via AddBatch(), and removed by an enclosing
// KrpcDataStreamRecvr via GetBatch(). There is a soft limit for the total amount of
// memory consumed by buffered row batches in all sender queues of a receiver. If adding
// a batch would push the memory consumption beyond the limit, that RPC is added to the
// 'deferred_rpcs_' queue, which is drained in FIFO order when space opens up.
// Senders in that state will not be replied to until their row batches are deserialized
// or the receiver is cancelled. This ensures that at most one batch per sender is
// buffered in the deferred RPCs queue.
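//
// A rough consumer-side sketch of how a queue is drained (hypothetical variable
// names, for illustration only; the real consumers are KrpcDataStreamRecvr::GetBatch()
// and the sorted-run merger's batch suppliers):
//
//   RowBatch* batch = nullptr;
//   do {
//     RETURN_IF_ERROR(queue->GetBatch(&batch));  // blocks for data, EOS or error
//     if (batch == nullptr) break;               // all senders closed: end-of-stream
//     /* consume 'batch'; its memory stays owned by the queue until the next call */
//   } while (true);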
class KrpcDataStreamRecvr::SenderQueue {
public:
SenderQueue(KrpcDataStreamRecvr* parent_recvr, int num_senders);
// Returns the next batch from this sender queue. Sets the returned batch in
// 'current_batch_'. A returned batch that is not filled to capacity does *not*
// indicate end-of-stream. The call blocks until another batch arrives or all senders
// close their channels. The returned batch is owned by the sender queue. The caller
// must acquire the resources from the returned batch before the next call to GetBatch().
Status GetBatch(RowBatch** next_batch);
// Adds a new row batch to this sender queue if this stream has not been cancelled.
// If adding this batch causes us to exceed the receiver's buffer limit, the RPC state
// is copied into 'deferred_rpcs_' for deferred processing and this function returns
// immediately. The deferred RPCs are replied to later when space becomes available.
void AddBatch(const TransmitDataRequestPB* request, TransmitDataResponsePB* response,
RpcContext* context);
// Tries inserting the front of 'deferred_rpcs_' queue into 'batch_queue_' if possible.
// On success, the first entry of 'deferred_rpcs_' is removed and the sender of the RPC
// will be responded to. If the serialized row batch cannot be extracted from the
// entry, the error status will be sent as the reply.
void ProcessDeferredRpc();
// Takes over the RPC state 'ctx' of an early sender for deferred processing and
// kicks off a deserialization task to process it asynchronously. The ownership of
// 'ctx' is transferred to this sender queue.
void TakeOverEarlySender(std::unique_ptr<TransmitDataCtx> ctx);
// Decrements the number of remaining senders for this queue and signals any threads
// waiting on the arrival of a new batch if the count drops to 0. The number of senders
// will be 1 for a merging KrpcDataStreamRecvr.
void DecrementSenders();
// Sets the cancellation flag and signals cancellation to the receiver and senders.
// Subsequent incoming batches will be dropped and senders in 'deferred_rpcs_' are
// replied to.
void Cancel();
// Must be called once to clean up any queued resources.
void Close();
// Returns the current batch from this queue being processed by a consumer.
RowBatch* current_batch() const { return current_batch_.get(); }
private:
// Returns true if either (1) 'batch_queue' is empty and there is no pending insertion
// or (2) inserting a row batch of 'batch_size' into 'batch_queue' will not cause the
// soft limit of the receiver to be exceeded. Expected to be called with 'lock_' held.
bool CanEnqueue(int64_t batch_size, const unique_lock<SpinLock>& lock) const;
// Helper function for inserting 'payload' into 'deferred_rpcs_'. Also does some
// accounting for various counters. 'lock_' must be held when calling this function.
void EnqueueDeferredRpc(unique_ptr<TransmitDataCtx> payload,
const unique_lock<SpinLock>& lock);
// Helper function for removing the first item from 'deferred_rpcs_'. Also does some
// accounting for various counters. 'lock_' must be held when calling this function.
void DequeueDeferredRpc(const unique_lock<SpinLock>& lock);
// Merges the error 'status' into the overall status. 'lock_' must be held when calling
// this function. Notifies all threads waiting on 'data_arrival_cv_'.
void MarkErrorStatus(const Status& status, const unique_lock<SpinLock>& lock);
// Unpacks a serialized row batch from 'request' and 'rpc_context' and populates
// 'tuple_offsets' and 'tuple_data'. On success, the deserialized row batch size is
// stored in 'deserialized_size'. If 'serialized_size' is not NULL, also stores the
// serialized row batch size in it. On failure, the error status is returned.
Status UnpackRequest(const TransmitDataRequestPB* request,
RpcContext* rpc_context, kudu::Slice* tuple_offsets, kudu::Slice* tuple_data,
int64_t* deserialized_size, int64_t* serialized_size = nullptr);
// Helper function to compute the serialized row batch size from 'request'
// and 'rpc_context'. Returns 0 on failure to unpack the serialized row batch.
int64_t GetSerializedBatchSize(const TransmitDataRequestPB* request,
RpcContext* rpc_context);
// The workhorse function for deserializing a row batch represented by ('header',
// 'tuple_offsets' and 'tuple_data') and inserting it into 'batch_queue'. Expects to be
// called with 'lock_' held and passed into this function via the argument 'lock'. This
// function may drop the lock while deserializing the row batch and re-acquire it after
// the row batch is deserialized. 'batch_size' is the size in bytes of the deserialized
// row batch. The caller is expected to have called CanEnqueue() to make sure the row
// batch can be inserted without exceeding the soft limit of the receiver. Also notifies
// a thread waiting on 'data_arrival_cv_'. Returns an error status if row batch creation
// fails; returns OK otherwise.
Status AddBatchWork(int64_t batch_size, const RowBatchHeaderPB& header,
const kudu::Slice& tuple_offsets, const kudu::Slice& tuple_data,
unique_lock<SpinLock>* lock, RpcContext* rpc_context) WARN_UNUSED_RESULT;
// Receiver of which this queue is a member.
KrpcDataStreamRecvr* recvr_;
// Protects all subsequent fields.
SpinLock lock_;
// Records any error status encountered when inserting row batches into this queue.
Status status_;
// If true, the receiver fragment for this stream got cancelled. This is usually
// triggered by closing the owning exchange node or cancelling the query.
bool is_cancelled_ = false;
// Number of deserialization requests sent to the deserialization threads to drain
// 'deferred_rpcs_' that have not yet been processed. Used to limit the number of
// requests queued.
int num_deserialize_tasks_pending_ = 0;
// Number of senders which haven't closed the channel yet
// (if it drops to 0, end-of-stream is true)
int num_remaining_senders_;
// Number of pending row batch insertions. AddBatchWork() may drop and reacquire 'lock_',
// causing races between multiple threads calling AddBatch() at the same time, or
// between threads calling AddBatch() and threads calling Close() concurrently.
// AddBatchWork() increments this counter before dropping 'lock_' for deserializing
// the row batch. The counter is decremented after 'lock_' is re-acquired and the row
// batch is inserted into 'batch_queue'. The races are as follows:
//
// 1. Multiple threads inserting into an empty 'batch_queue' concurrently may all see
// it as empty before the first thread manages to insert into batch_queue. This may
// cause the soft limit to be exceeded. A queue is truly empty iff this counter is 0.
//
// 2. Close() cannot proceed until this counter is 0 to make sure all pending inserts
// complete before the 'batch_queue' is cleared.
int num_pending_enqueue_ = 0;
// Signals the arrival of a new batch or the EOS/cancelled condition.
condition_variable_any data_arrival_cv_;
// Queue of (batch length, batch) pairs. The SenderQueue owns these batches until
// they are handed off to the callers of GetBatch().
typedef list<pair<int, std::unique_ptr<RowBatch>>> RowBatchQueue;
RowBatchQueue batch_queue_;
// The batch that was most recently returned via GetBatch(), i.e. the current batch
// from this queue being processed by a consumer. It's destroyed when the next batch
// is retrieved.
scoped_ptr<RowBatch> current_batch_;
// Set to true when the first batch has been received
bool received_first_batch_ = false;
// Queue of deferred RPCs - those that have a batch to deliver, but the queue was
// full when they last tried to do so. The senders wait here until there is space for
// their batches, allowing the receiver side to implement basic flow control.
std::queue<std::unique_ptr<TransmitDataCtx>> deferred_rpcs_;
// Monotonic time in nanoseconds of when 'deferred_rpcs_' goes from being empty to
// non-empty. Set to 0 when 'deferred_rpcs_' becomes empty again. Used for computing
// 'total_has_deferred_rpcs_timer_'.
int64_t has_deferred_rpcs_start_time_ns_ = 0;
};
KrpcDataStreamRecvr::SenderQueue::SenderQueue(
KrpcDataStreamRecvr* parent_recvr, int num_senders)
: recvr_(parent_recvr), num_remaining_senders_(num_senders) { }
Status KrpcDataStreamRecvr::SenderQueue::GetBatch(RowBatch** next_batch) {
SCOPED_TIMER(recvr_->queue_get_batch_timer_);
DCHECK(TestInfo::is_test() || FragmentInstanceState::IsFragmentExecThread());
DCHECK(!recvr_->closed_);
int num_to_dequeue = 0;
// The sender id is set below when we decide to dequeue entries from 'deferred_rpcs_'.
int sender_id = -1;
{
unique_lock<SpinLock> l(lock_);
// current_batch_ must be replaced with the returned batch.
current_batch_.reset();
*next_batch = nullptr;
// Wait until something shows up or we know we're done
while (batch_queue_.empty() && status_.ok() && !is_cancelled_ &&
num_remaining_senders_ > 0) {
// Verify before waiting on 'data_arrival_cv_' that if there are any deferred
// batches, either there is an outstanding deserialization request queued or there
// is a pending insertion, so this thread is guaranteed to wake up at some point.
DCHECK(deferred_rpcs_.empty() ||
(num_deserialize_tasks_pending_ + num_pending_enqueue_) > 0);
VLOG_ROW << "wait arrival fragment_instance_id="
<< PrintId(recvr_->fragment_instance_id())
<< " node=" << recvr_->dest_node_id();
// Don't count time spent waiting on the sender as active time.
CANCEL_SAFE_SCOPED_TIMER3(recvr_->data_wait_timer_, recvr_->inactive_timer_,
received_first_batch_ ? nullptr : recvr_->first_batch_wait_total_timer_,
&is_cancelled_);
data_arrival_cv_.wait(l);
}
// Return early if there is any error when inserting row batches.
RETURN_IF_ERROR(status_);
if (UNLIKELY(is_cancelled_)) {
// Cancellation should have drained the entire 'deferred_rpcs_' queue.
// Make sure the senders were replied to or they may be stuck waiting for a reply.
DCHECK(deferred_rpcs_.empty());
return Status::CANCELLED;
}
// All senders have sent their row batches. Nothing to do.
if (num_remaining_senders_ == 0 && batch_queue_.empty()) {
// Note that it's an invariant that a sender cannot send the EOS RPC until all
// outstanding TransmitData() RPCs have been replied to. Therefore, it should be
// impossible for num_remaining_senders_ to reach 0 before all RPCs in
// 'deferred_rpcs_' have been replied to.
DCHECK(deferred_rpcs_.empty());
DCHECK_EQ(num_pending_enqueue_, 0);
return Status::OK();
}
// Notify the deserialization threads to retry delivering the deferred RPCs.
if (!deferred_rpcs_.empty()) {
// Try dequeuing multiple entries from 'deferred_rpcs_' to parallelize the CPU-bound
// deserialization work. There is no point in dequeuing more than the number of
// deserialization threads available.
DCHECK_GE(deferred_rpcs_.size(), num_deserialize_tasks_pending_);
num_to_dequeue = min(FLAGS_datastream_service_num_deserialization_threads,
(int)deferred_rpcs_.size() - num_deserialize_tasks_pending_);
num_deserialize_tasks_pending_ += num_to_dequeue;
sender_id = deferred_rpcs_.front()->request->sender_id();
}
DCHECK(!batch_queue_.empty());
received_first_batch_ = true;
RowBatch* result = batch_queue_.front().second.release();
int64_t batch_size = batch_queue_.front().first;
COUNTER_ADD(recvr_->bytes_dequeued_counter_, batch_size);
recvr_->num_buffered_bytes_.Add(-batch_size);
batch_queue_.pop_front();
VLOG_ROW << "fetched #rows=" << result->num_rows();
current_batch_.reset(result);
*next_batch = current_batch_.get();
}
// Don't hold the lock when calling EnqueueDeserializeTask() as it may block.
// It's important that the dequeuing of 'deferred_rpcs_' is done after the entry
// has been removed from 'batch_queue_', or the deserialization threads may fail to
// insert into a non-empty 'batch_queue_' and the receiver may end up waiting forever.
if (num_to_dequeue > 0) {
DCHECK_GE(sender_id, 0);
recvr_->mgr_->EnqueueDeserializeTask(recvr_->fragment_instance_id(),
recvr_->dest_node_id(), sender_id, num_to_dequeue);
}
return Status::OK();
}
inline bool KrpcDataStreamRecvr::SenderQueue::CanEnqueue(int64_t batch_size,
const unique_lock<SpinLock>& lock) const {
DCHECK(lock.owns_lock());
// The queue is truly empty only if 'batch_queue_' is empty and there is no pending
// insert. It's important that we enqueue the new batch regardless of the buffer limit
// if the queue is truly empty.
// In the case of a merging receiver, batches are received from a specific queue
// based on data order, and the pipeline will stall if the merger is waiting for data
// from an empty queue that cannot be filled because the limit has been reached.
bool queue_empty = batch_queue_.empty() && num_pending_enqueue_ == 0;
return queue_empty || !recvr_->ExceedsLimit(batch_size);
}
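// Illustrative example of the soft-limit check in CanEnqueue() above (numbers are
// hypothetical): with a receiver limit of 10MB and 9MB already buffered across all
// sender queues, an incoming 2MB batch is deferred if this queue already has (or is
// about to have) an entry, but is still admitted if the queue is truly empty, so that
// a merging receiver waiting on this queue cannot stall indefinitely.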
void KrpcDataStreamRecvr::SenderQueue::EnqueueDeferredRpc(
unique_ptr<TransmitDataCtx> payload, const unique_lock<SpinLock>& lock) {
DCHECK(lock.owns_lock());
TRACE_TO(payload->rpc_context->trace(), "Enqueuing deferred RPC");
if (deferred_rpcs_.empty()) has_deferred_rpcs_start_time_ns_ = MonotonicNanos();
deferred_rpcs_.push(move(payload));
recvr_->num_deferred_rpcs_.Add(1);
COUNTER_ADD(recvr_->total_deferred_rpcs_counter_, 1);
}
void KrpcDataStreamRecvr::SenderQueue::DequeueDeferredRpc(
const unique_lock<SpinLock>& lock) {
DCHECK(lock.owns_lock());
deferred_rpcs_.pop();
if (deferred_rpcs_.empty()) {
DCHECK_NE(has_deferred_rpcs_start_time_ns_, 0);
int64_t duration = MonotonicNanos() - has_deferred_rpcs_start_time_ns_;
COUNTER_ADD(recvr_->total_has_deferred_rpcs_timer_, duration);
has_deferred_rpcs_start_time_ns_ = 0;
}
recvr_->num_deferred_rpcs_.Add(-1);
}
inline void KrpcDataStreamRecvr::SenderQueue::MarkErrorStatus(const Status& status,
const unique_lock<SpinLock>& lock) {
DCHECK(lock.owns_lock());
DCHECK(!status.ok());
status_.MergeStatus(status);
// Notify all threads which are waiting for row batches that an error has occurred.
data_arrival_cv_.notify_all();
}
Status KrpcDataStreamRecvr::SenderQueue::UnpackRequest(
const TransmitDataRequestPB* request, RpcContext* rpc_context,
kudu::Slice* tuple_offsets, kudu::Slice* tuple_data, int64_t* deserialized_size,
int64_t* serialized_size) {
RETURN_IF_ERROR(DebugAction(recvr_->runtime_state_.query_options(),
"RECVR_UNPACK_PAYLOAD"));
// Unpack the tuple offsets.
KUDU_RETURN_IF_ERROR(rpc_context->GetInboundSidecar(
request->tuple_offsets_sidecar_idx(), tuple_offsets),
"Failed to get the tuple offsets sidecar");
// Unpack the tuple data.
KUDU_RETURN_IF_ERROR(rpc_context->GetInboundSidecar(
request->tuple_data_sidecar_idx(), tuple_data),
"Failed to get the tuple data sidecar");
// Compute the size of the deserialized row batch.
*deserialized_size =
RowBatch::GetDeserializedSize(request->row_batch_header(), *tuple_offsets);
// Compute the size of the serialized row batch.
if (serialized_size != nullptr) {
*serialized_size = tuple_offsets->size() + tuple_data->size();
}
return Status::OK();
}
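// Example of the two sizes computed by UnpackRequest() above (numbers are
// hypothetical): if the tuple-offsets sidecar is 4KB and the tuple-data sidecar is
// 1MB, the serialized size is 1MB + 4KB, while the deserialized size derived from the
// row batch header and tuple offsets can be larger, e.g. if the sender compressed the
// tuple data.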
Status KrpcDataStreamRecvr::SenderQueue::AddBatchWork(int64_t batch_size,
const RowBatchHeaderPB& header, const kudu::Slice& tuple_offsets,
const kudu::Slice& tuple_data, unique_lock<SpinLock>* lock,
RpcContext* rpc_context) {
DCHECK(lock != nullptr);
DCHECK(lock->owns_lock());
DCHECK(!is_cancelled_);
// Reserve queue space before dropping the lock below.
recvr_->num_buffered_bytes_.Add(batch_size);
// Bump 'num_pending_enqueue_' to avoid race with Close() when lock is dropped below.
DCHECK_GE(num_pending_enqueue_, 0);
++num_pending_enqueue_;
// Deserialization may take some time due to compression and memory allocation.
// Drop the lock so we can deserialize multiple batches in parallel.
lock->unlock();
TRACE_TO(rpc_context->trace(), "Deserializing batch");
unique_ptr<RowBatch> batch;
Status status;
{
SCOPED_TIMER(recvr_->deserialize_row_batch_timer_);
status = DebugAction(recvr_->runtime_state_.query_options(), "RECVR_ADD_BATCH");
if (LIKELY(status.ok())) {
// At this point, a row batch will be inserted into batch_queue_.
// Close() will handle deleting any unconsumed batches from batch_queue_.
// Close() cannot proceed until there are no pending insertion to batch_queue_.
status = RowBatch::FromProtobuf(recvr_->row_desc(), header, tuple_offsets,
tuple_data, recvr_->parent_tracker(), recvr_->buffer_pool_client(), &batch);
}
}
lock->lock();
DCHECK_GT(num_pending_enqueue_, 0);
--num_pending_enqueue_;
if (UNLIKELY(!status.ok())) {
recvr_->num_buffered_bytes_.Add(-batch_size);
VLOG_QUERY << "Failed to deserialize batch for "
<< PrintId(recvr_->fragment_instance_id());
TRACE_TO(rpc_context->trace(), "Failed to deserialize batch: $0", status.GetDetail());
MarkErrorStatus(status, *lock);
return status;
}
VLOG_ROW << "added #rows=" << batch->num_rows() << " batch_size=" << batch_size;
TRACE_TO(rpc_context->trace(), "Enqueuing deserialized batch");
COUNTER_ADD(recvr_->total_enqueued_batches_counter_, 1);
batch_queue_.emplace_back(batch_size, move(batch));
data_arrival_cv_.notify_one();
return Status::OK();
}
void KrpcDataStreamRecvr::SenderQueue::AddBatch(const TransmitDataRequestPB* request,
TransmitDataResponsePB* response, RpcContext* rpc_context) {
// TODO: Add timers for time spent in this function and queue time in 'batch_queue_'.
const RowBatchHeaderPB& header = request->row_batch_header();
kudu::Slice tuple_offsets;
kudu::Slice tuple_data;
int64_t batch_size;
Status status = UnpackRequest(request, rpc_context, &tuple_offsets, &tuple_data,
&batch_size);
if (UNLIKELY(!status.ok())) {
{
unique_lock<SpinLock> l(lock_);
MarkErrorStatus(status, l);
}
TRACE_TO(rpc_context->trace(), "Error unpacking request: $0", status.GetDetail());
DataStreamService::RespondRpc(status, response, rpc_context);
return;
}
COUNTER_ADD(recvr_->total_received_batches_counter_, 1);
// To be consistent with the senders, only count the sidecars size.
COUNTER_ADD(recvr_->bytes_received_counter_, tuple_data.size() + tuple_offsets.size());
{
unique_lock<SpinLock> l(lock_);
// There should be one or more senders left when this function is called. The reason
// is that the EndDataStream RPC is not sent until all outstanding TransmitData() RPCs
// have been replied to. There is at least one TransmitData() RPC which hasn't yet been
// responded to if we reach here.
DCHECK_GT(num_remaining_senders_, 0);
if (UNLIKELY(is_cancelled_)) {
TRACE_TO(rpc_context->trace(), "Receiver was cancelled");
Status cancel_status = Status::Expected(TErrorCode::DATASTREAM_RECVR_CLOSED,
PrintId(recvr_->fragment_instance_id()), recvr_->dest_node_id());
DataStreamService::RespondRpc(cancel_status, response, rpc_context);
return;
}
// If there's something in the deferred queue or this batch would push us over the
// buffer limit, we need to wait until the queue gets drained. We store the RPC
// context so the batch we couldn't process here can be deserialized and enqueued
// later, when space becomes available. If there are already deferred RPCs waiting
// in the queue, the new batch needs to line up behind them to avoid starvation of
// senders in the non-merging case.
if (UNLIKELY(!deferred_rpcs_.empty() || !CanEnqueue(batch_size, l))) {
recvr_->deferred_rpc_tracker()->Consume(rpc_context->GetTransferSize());
auto payload = make_unique<TransmitDataCtx>(request, response, rpc_context);
EnqueueDeferredRpc(move(payload), l);
return;
}
// At this point, we are committed to inserting the row batch into 'batch_queue_'.
status = AddBatchWork(batch_size, header, tuple_offsets, tuple_data, &l, rpc_context);
}
// Respond to the sender to ack the insertion of the row batch.
DataStreamService::RespondRpc(status, response, rpc_context);
}
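// To summarize the control flow in AddBatch() above (a sketch, not exhaustive):
//
//   unpacking the request fails          -> respond with the error status
//   receiver already cancelled           -> respond with DATASTREAM_RECVR_CLOSED
//   deferred RPCs pending, or the batch
//   would exceed the soft limit          -> defer the RPC; it is responded to later
//   otherwise                            -> deserialize, enqueue and respond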
void KrpcDataStreamRecvr::SenderQueue::ProcessDeferredRpc() {
// Owns the first entry of 'deferred_rpcs_' if it ends up being popped.
std::unique_ptr<TransmitDataCtx> ctx;
Status status;
{
unique_lock<SpinLock> l(lock_);
DCHECK_GT(num_deserialize_tasks_pending_, 0);
--num_deserialize_tasks_pending_;
if (deferred_rpcs_.empty()) return;
// A sender queue cannot be cancelled if there is any deferred RPC.
DCHECK(!is_cancelled_);
// Try enqueuing the first entry into 'batch_queue_'.
ctx.swap(deferred_rpcs_.front());
TRACE_TO(ctx->rpc_context->trace(), "Processing deferred RPC");
kudu::Slice tuple_offsets;
kudu::Slice tuple_data;
int64_t batch_size;
status = UnpackRequest(ctx->request, ctx->rpc_context, &tuple_offsets,
&tuple_data, &batch_size);
// Reply with error status if the entry cannot be unpacked.
if (UNLIKELY(!status.ok())) {
TRACE_TO(ctx->rpc_context->trace(),
"Error unpacking deferred RPC: $0", status.GetDetail());
MarkErrorStatus(status, l);
DataStreamService::RespondAndReleaseRpc(status, ctx->response, ctx->rpc_context,
recvr_->deferred_rpc_tracker());
DequeueDeferredRpc(l);
return;
}
// Stop if inserting the batch would cause us to go over the limit and put 'ctx'
// back on the queue.
if (!CanEnqueue(batch_size, l)) {
TRACE_TO(ctx->rpc_context->trace(), "Batch queue is full");
ctx.swap(deferred_rpcs_.front());
DCHECK(deferred_rpcs_.front().get() != nullptr);
return;
}
// Dequeues the deferred batch and adds it to 'batch_queue_'.
DequeueDeferredRpc(l);
const RowBatchHeaderPB& header = ctx->request->row_batch_header();
status = AddBatchWork(
batch_size, header, tuple_offsets, tuple_data, &l, ctx->rpc_context);
DCHECK(!status.ok() || !batch_queue_.empty());
// Release to MemTracker while still holding the lock to prevent race with Close().
recvr_->deferred_rpc_tracker()->Release(ctx->rpc_context->GetTransferSize());
}
// Respond to the sender to ack the insertion of the row batch.
// No need to hold the lock when enqueuing the response.
DataStreamService::RespondRpc(status, ctx->response, ctx->rpc_context);
}
int64_t KrpcDataStreamRecvr::SenderQueue::GetSerializedBatchSize(
const TransmitDataRequestPB* request, RpcContext* rpc_context) {
kudu::Slice tuple_offsets;
kudu::Slice tuple_data;
int64_t unused;
int64_t serialized_size = 0;
if (UnpackRequest(request, rpc_context, &tuple_offsets, &tuple_data, &unused,
&serialized_size).ok()) {
return serialized_size;
}
return 0;
}
void KrpcDataStreamRecvr::SenderQueue::TakeOverEarlySender(
unique_ptr<TransmitDataCtx> ctx) {
// TakeOverEarlySender() is called by the same thread which calls Close().
// The receiver cannot be closed while this function is in progress so
// 'recvr_->mgr_' shouldn't be NULL.
DCHECK(TestInfo::is_test() || FragmentInstanceState::IsFragmentExecThread());
DCHECK(!recvr_->closed_ && recvr_->mgr_ != nullptr);
COUNTER_ADD(recvr_->total_received_batches_counter_, 1);
COUNTER_ADD(recvr_->bytes_received_counter_,
GetSerializedBatchSize(ctx->request, ctx->rpc_context));
int sender_id = ctx->request->sender_id();
{
unique_lock<SpinLock> l(lock_);
if (UNLIKELY(is_cancelled_)) {
TRACE_TO(ctx->rpc_context->trace(), "Recvr closed");
Status cancel_status = Status::Expected(TErrorCode::DATASTREAM_RECVR_CLOSED,
PrintId(recvr_->fragment_instance_id()), recvr_->dest_node_id());
DataStreamService::RespondRpc(cancel_status, ctx->response, ctx->rpc_context);
return;
}
// Only enqueue a deferred RPC if the sender queue is not yet cancelled.
recvr_->deferred_rpc_tracker()->Consume(ctx->rpc_context->GetTransferSize());
EnqueueDeferredRpc(move(ctx), l);
++num_deserialize_tasks_pending_;
}
recvr_->mgr_->EnqueueDeserializeTask(recvr_->fragment_instance_id(),
recvr_->dest_node_id(), sender_id, 1);
}
void KrpcDataStreamRecvr::SenderQueue::DecrementSenders() {
lock_guard<SpinLock> l(lock_);
DCHECK_GT(num_remaining_senders_, 0);
num_remaining_senders_ = max(0, num_remaining_senders_ - 1);
VLOG_FILE << "decremented senders: fragment_instance_id="
<< PrintId(recvr_->fragment_instance_id())
<< " node_id=" << recvr_->dest_node_id()
<< " #senders=" << num_remaining_senders_;
if (num_remaining_senders_ == 0) data_arrival_cv_.notify_one();
}
void KrpcDataStreamRecvr::SenderQueue::Cancel() {
{
unique_lock<SpinLock> l(lock_);
if (is_cancelled_) return;
is_cancelled_ = true;
// Respond to deferred RPCs.
while (!deferred_rpcs_.empty()) {
const unique_ptr<TransmitDataCtx>& ctx = deferred_rpcs_.front();
Status cancel_status = Status::Expected(TErrorCode::DATASTREAM_RECVR_CLOSED,
PrintId(recvr_->fragment_instance_id()), recvr_->dest_node_id());
DataStreamService::RespondAndReleaseRpc(cancel_status, ctx->response,
ctx->rpc_context, recvr_->deferred_rpc_tracker());
DequeueDeferredRpc(l);
}
}
VLOG(2) << "cancelled stream: fragment_instance_id="
<< PrintId(recvr_->fragment_instance_id())
<< " node_id=" << recvr_->dest_node_id();
// Wake up all threads waiting to produce/consume batches. They will all
// notice that the stream is cancelled and handle it.
data_arrival_cv_.notify_all();
PeriodicCounterUpdater::StopTimeSeriesCounter(
recvr_->bytes_received_time_series_counter_);
}
void KrpcDataStreamRecvr::SenderQueue::Close() {
unique_lock<SpinLock> l(lock_);
// Note that the queue must be cancelled before it can be closed or we may
// risk running into a race which can leak row batches. Please see IMPALA-3034.
DCHECK(is_cancelled_);
// The deferred RPCs should all have been responded to in Cancel().
DCHECK(deferred_rpcs_.empty());
// Wait for any pending insertion to complete first.
while (num_pending_enqueue_ > 0) data_arrival_cv_.wait(l);
// Delete any batches queued in batch_queue_
batch_queue_.clear();
current_batch_.reset();
}
Status KrpcDataStreamRecvr::CreateMerger(const TupleRowComparator& less_than) {
DCHECK(is_merging_);
DCHECK(TestInfo::is_test() || FragmentInstanceState::IsFragmentExecThread());
vector<SortedRunMerger::RunBatchSupplierFn> input_batch_suppliers;
input_batch_suppliers.reserve(sender_queues_.size());
// Create the merger that will produce a single stream of sorted rows.
merger_.reset(new SortedRunMerger(less_than, row_desc_, profile_, false));
for (SenderQueue* queue: sender_queues_) {
input_batch_suppliers.push_back(
[queue](RowBatch** next_batch) -> Status {
return queue->GetBatch(next_batch);
});
}
RETURN_IF_ERROR(merger_->Prepare(input_batch_suppliers));
return Status::OK();
}
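// A rough sketch of how the merger created above is driven (hypothetical variable
// names; in practice the exchange node that owns this receiver does this):
//
//   RETURN_IF_ERROR(recvr->CreateMerger(less_than));
//   bool eos = false;
//   while (!eos) {
//     RETURN_IF_ERROR(recvr->GetNext(&output_batch, &eos));
//     /* process 'output_batch' */
//   }
//   recvr->TransferAllResources(&final_batch);  // hand off buffered resources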
void KrpcDataStreamRecvr::TransferAllResources(RowBatch* transfer_batch) {
DCHECK(TestInfo::is_test() || FragmentInstanceState::IsFragmentExecThread());
for (SenderQueue* sender_queue: sender_queues_) {
if (sender_queue->current_batch() != nullptr) {
sender_queue->current_batch()->TransferResourceOwnership(transfer_batch);
}
}
}
KrpcDataStreamRecvr::KrpcDataStreamRecvr(KrpcDataStreamMgr* stream_mgr,
MemTracker* parent_tracker, const RowDescriptor* row_desc,
const RuntimeState& runtime_state, const TUniqueId& fragment_instance_id,
PlanNodeId dest_node_id, int num_senders, bool is_merging,
int64_t total_buffer_limit, RuntimeProfile* profile,
BufferPool::ClientHandle* client)
: mgr_(stream_mgr),
runtime_state_(runtime_state),
fragment_instance_id_(fragment_instance_id),
dest_node_id_(dest_node_id),
total_buffer_limit_(total_buffer_limit),
row_desc_(row_desc),
is_merging_(is_merging),
closed_(false),
num_buffered_bytes_(0),
deferred_rpc_tracker_(new MemTracker(-1, "KrpcDeferredRpcs", parent_tracker)),
parent_tracker_(parent_tracker),
buffer_pool_client_(client),
profile_(profile),
dequeue_profile_(RuntimeProfile::Create(&pool_, "Dequeue")),
enqueue_profile_(RuntimeProfile::Create(&pool_, "Enqueue")) {
// Create one queue per sender if is_merging is true.
int num_queues = is_merging ? num_senders : 1;
sender_queues_.reserve(num_queues);
int num_sender_per_queue = is_merging ? 1 : num_senders;
for (int i = 0; i < num_queues; ++i) {
SenderQueue* queue = pool_.Add(new SenderQueue(this, num_sender_per_queue));
sender_queues_.push_back(queue);
}
// Add the profiles of the dequeuing side (i.e. GetBatch()) and the enqueuing side
// (i.e. AddBatchWork()) as children of the owning exchange node's profile.
profile_->AddChild(dequeue_profile_);
profile_->AddChild(enqueue_profile_);
// Initialize various counters for measuring dequeuing from queues.
bytes_dequeued_counter_ =
ADD_COUNTER(dequeue_profile_, "TotalBytesDequeued", TUnit::BYTES);
bytes_dequeued_time_series_counter_ = ADD_TIME_SERIES_COUNTER(
dequeue_profile_, "BytesDequeued", bytes_dequeued_counter_);
queue_get_batch_timer_ = ADD_TIMER(dequeue_profile_, "TotalGetBatchTime");
data_wait_timer_ =
ADD_CHILD_TIMER(dequeue_profile_, "DataWaitTime", "TotalGetBatchTime");
inactive_timer_ = profile_->inactive_timer();
first_batch_wait_total_timer_ =
ADD_TIMER(dequeue_profile_, "FirstBatchWaitTime");
// Initialize various counters for measuring enqueuing into queues.
bytes_received_counter_ =
ADD_COUNTER(enqueue_profile_, "TotalBytesReceived", TUnit::BYTES);
bytes_received_time_series_counter_ = ADD_TIME_SERIES_COUNTER(
enqueue_profile_, "BytesReceived", bytes_received_counter_);
deserialize_row_batch_timer_ =
ADD_TIMER(enqueue_profile_, "DeserializeRowBatchTime");
total_eos_received_counter_ =
ADD_COUNTER(enqueue_profile_, "TotalEosReceived", TUnit::UNIT);
total_early_senders_counter_ =
ADD_COUNTER(enqueue_profile_, "TotalEarlySenders", TUnit::UNIT);
total_received_batches_counter_ =
ADD_COUNTER(enqueue_profile_, "TotalBatchesReceived", TUnit::UNIT);
total_enqueued_batches_counter_ =
ADD_COUNTER(enqueue_profile_, "TotalBatchesEnqueued", TUnit::UNIT);
total_deferred_rpcs_counter_ =
ADD_COUNTER(enqueue_profile_, "TotalRPCsDeferred", TUnit::UNIT);
deferred_rpcs_time_series_counter_ =
enqueue_profile_->AddSamplingTimeSeriesCounter("DeferredQueueSize", TUnit::UNIT,
bind<int64_t>(mem_fn(&KrpcDataStreamRecvr::num_deferred_rpcs), this));
total_has_deferred_rpcs_timer_ =
ADD_TIMER(enqueue_profile_, "TotalHasDeferredRPCsTime");
dispatch_timer_ =
ADD_SUMMARY_STATS_TIMER(enqueue_profile_, "DispatchTime");
}
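// The counters registered above end up in a profile subtree roughly shaped as follows
// (a sketch, not verbatim profile output):
//
//   <owning exchange node profile>
//     Dequeue: TotalBytesDequeued, BytesDequeued (time series), TotalGetBatchTime,
//              DataWaitTime, FirstBatchWaitTime
//     Enqueue: TotalBytesReceived, BytesReceived (time series),
//              DeserializeRowBatchTime, TotalEosReceived, TotalEarlySenders,
//              TotalBatchesReceived, TotalBatchesEnqueued, TotalRPCsDeferred,
//              DeferredQueueSize (time series), TotalHasDeferredRPCsTime, DispatchTime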
Status KrpcDataStreamRecvr::GetNext(RowBatch* output_batch, bool* eos) {
DCHECK(TestInfo::is_test() || FragmentInstanceState::IsFragmentExecThread());
DCHECK(merger_.get() != nullptr);
return merger_->GetNext(output_batch, eos);
}
void KrpcDataStreamRecvr::AddBatch(const TransmitDataRequestPB* request,
TransmitDataResponsePB* response, RpcContext* rpc_context) {
MonoDelta duration(MonoTime::Now().GetDeltaSince(rpc_context->GetTimeReceived()));
dispatch_timer_->UpdateCounter(duration.ToNanoseconds());
int use_sender_id = is_merging_ ? request->sender_id() : 0;
// Add all batches to the same queue if is_merging_ is false.
sender_queues_[use_sender_id]->AddBatch(request, response, rpc_context);
}
void KrpcDataStreamRecvr::ProcessDeferredRpc(int sender_id) {
int use_sender_id = is_merging_ ? sender_id : 0;
// All senders share the same queue if is_merging_ is false.
sender_queues_[use_sender_id]->ProcessDeferredRpc();
}
void KrpcDataStreamRecvr::TakeOverEarlySender(unique_ptr<TransmitDataCtx> ctx) {
int use_sender_id = is_merging_ ? ctx->request->sender_id() : 0;
// Add all batches to the same queue if is_merging_ is false.
sender_queues_[use_sender_id]->TakeOverEarlySender(move(ctx));
COUNTER_ADD(total_early_senders_counter_, 1);
}
void KrpcDataStreamRecvr::RemoveSender(int sender_id) {
int use_sender_id = is_merging_ ? sender_id : 0;
sender_queues_[use_sender_id]->DecrementSenders();
COUNTER_ADD(total_eos_received_counter_, 1);
}
void KrpcDataStreamRecvr::CancelStream() {
for (auto& queue: sender_queues_) queue->Cancel();
}
void KrpcDataStreamRecvr::Close() {
DCHECK(TestInfo::is_test() || FragmentInstanceState::IsFragmentExecThread());
DCHECK(!closed_);
closed_ = true;
// Remove this receiver from the KrpcDataStreamMgr that created it.
// All the sender queues will be cancelled after this call returns.
const Status status = mgr_->DeregisterRecvr(fragment_instance_id(), dest_node_id());
if (!status.ok()) {
LOG(ERROR) << "Error deregistering receiver: " << status.GetDetail();
}
for (auto& queue: sender_queues_) queue->Close();
merger_.reset();
// Given all queues have been cancelled and closed already at this point, it's safe to
// call Close() on 'deferred_rpc_tracker_' without holding any lock here.
deferred_rpc_tracker_->Close();
dequeue_profile_->StopPeriodicCounters();
enqueue_profile_->StopPeriodicCounters();
// Remove references to unowned resources, which may be freed after Close().
mgr_ = nullptr;
row_desc_ = nullptr;
parent_tracker_ = nullptr;
buffer_pool_client_ = nullptr;
profile_ = nullptr;
}
KrpcDataStreamRecvr::~KrpcDataStreamRecvr() {
DCHECK(mgr_ == nullptr) << "Must call Close()";
}
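// A rough lifecycle sketch tying Close() and the destructor above together
// (hypothetical names; receivers are normally created and registered via
// KrpcDataStreamMgr rather than constructed directly):
//
//   auto recvr = /* obtained from the stream manager */;
//   /* drive it with GetBatch()/GetNext() until end-of-stream or cancellation */
//   recvr->Close();  // must run before destruction; see the DCHECK above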
Status KrpcDataStreamRecvr::GetBatch(RowBatch** next_batch) {
DCHECK(!is_merging_);
DCHECK_EQ(sender_queues_.size(), 1);
return sender_queues_[0]->GetBatch(next_batch);
}
} // namespace impala