// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "kudu/rpc/connection.h"
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <algorithm>
#include <cerrno>
#include <iostream>
#include <memory>
#include <set>
#include <string>
#include <boost/intrusive/detail/list_iterator.hpp>
#include <boost/intrusive/list.hpp>
#include <ev.h>
#include <glog/logging.h>
#include "kudu/gutil/map-util.h"
#include "kudu/gutil/port.h"
#include "kudu/gutil/strings/human_readable.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/rpc/inbound_call.h"
#include "kudu/rpc/messenger.h"
#include "kudu/rpc/outbound_call.h"
#include "kudu/rpc/reactor.h"
#include "kudu/rpc/rpc_controller.h"
#include "kudu/rpc/rpc_header.pb.h"
#include "kudu/rpc/rpc_introspection.pb.h"
#include "kudu/rpc/transfer.h"
#include "kudu/security/tls_socket.h"
#include "kudu/util/errno.h"
#include "kudu/util/faststring.h"
#include "kudu/util/net/sockaddr.h"
#include "kudu/util/net/socket.h"
#include "kudu/util/slice.h"
#include "kudu/util/status.h"
using kudu::security::TlsSocket;
using std::includes;
using std::set;
using std::shared_ptr;
using std::unique_ptr;
using strings::Substitute;
namespace kudu {
namespace rpc {
typedef OutboundCall::Phase Phase;
namespace {
// tcp_info struct duplicated from linux/tcp.h.
//
// This allows us to decouple the compile-time Linux headers from the
// runtime Linux kernel. The compile-time headers (and kernel) might be
// older than the runtime kernel, in which case an ifdef-based approach
// wouldn't allow us to get all of the info available.
//
// NOTE: this struct has been annotated with some local notes about the
// contents of each field.
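//
// See GetSocketStatsPB() below: it checks the length returned by
// getsockopt(TCP_INFO) before reading any of the newer fields, so running on
// an older kernel simply yields fewer statistics rather than garbage.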
struct tcp_info {
// Various state-tracking information.
// ------------------------------------------------------------
uint8_t tcpi_state;
uint8_t tcpi_ca_state;
uint8_t tcpi_retransmits;
uint8_t tcpi_probes;
uint8_t tcpi_backoff;
uint8_t tcpi_options;
uint8_t tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
uint8_t tcpi_delivery_rate_app_limited:1;
// Configurations.
// ------------------------------------------------------------
uint32_t tcpi_rto;
uint32_t tcpi_ato;
uint32_t tcpi_snd_mss;
uint32_t tcpi_rcv_mss;
// Counts of packets in various states in the outbound queue.
// At first glance one might think these are monotonic counters, but
// in fact they are instantaneous counts of queued packets and thus
// not very useful for our purposes.
// ------------------------------------------------------------
// Number of packets outstanding that haven't been acked.
uint32_t tcpi_unacked;
// Number of packets outstanding that have been selective-acked.
uint32_t tcpi_sacked;
// Number of packets outstanding that have been deemed lost (a SACK arrived
// for a later packet)
uint32_t tcpi_lost;
// Number of packets in the queue that have been retransmitted.
uint32_t tcpi_retrans;
// The number of packets towards the highest SACKed sequence number
// (a measure of reordering; removed in later Linux versions by kernel commit
// 737ff314563ca27f044f9a3a041e9d42491ef7ce).
uint32_t tcpi_fackets;
// Times when various events occurred.
// ------------------------------------------------------------
uint32_t tcpi_last_data_sent;
uint32_t tcpi_last_ack_sent; /* Not remembered, sorry. */
uint32_t tcpi_last_data_recv;
uint32_t tcpi_last_ack_recv;
// Path MTU.
uint32_t tcpi_pmtu;
// Receiver slow start threshold.
uint32_t tcpi_rcv_ssthresh;
// Smoothed RTT estimate and variance, based on the time between sending data and
// receiving the corresponding ACK. See https://tools.ietf.org/html/rfc2988 for details.
uint32_t tcpi_rtt;
uint32_t tcpi_rttvar;
// Slow start threshold.
uint32_t tcpi_snd_ssthresh;
// Sender congestion window (in number of MSS-sized packets)
uint32_t tcpi_snd_cwnd;
// Advertised MSS.
uint32_t tcpi_advmss;
// Amount of packet reordering allowed.
uint32_t tcpi_reordering;
// Receiver-side RTT estimate per the Dynamic Right Sizing algorithm:
//
// "A system that is only transmitting acknowledgements can still estimate the round-trip
// time by observing the time between when a byte is first acknowledged and the receipt of
// data that is at least one window beyond the sequence number that was acknowledged. If the
// sender is being throttled by the network, this estimate will be valid. However, if the
// sending application did not have any data to send, the measured time could be much larger
// than the actual round-trip time. Thus this measurement acts only as an upper-bound on the
// round-trip time and should be used only when it is the only source of round-trip time
// information."
uint32_t tcpi_rcv_rtt;
uint32_t tcpi_rcv_space;
// Total number of retransmitted packets.
uint32_t tcpi_total_retrans;
// Pacing-related metrics.
uint64_t tcpi_pacing_rate;
uint64_t tcpi_max_pacing_rate;
// Total bytes ACKed by remote peer.
uint64_t tcpi_bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked */
// Total bytes received (for which ACKs have been sent out).
uint64_t tcpi_bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived */
// Segments sent and received.
uint32_t tcpi_segs_out; /* RFC4898 tcpEStatsPerfSegsOut */
uint32_t tcpi_segs_in; /* RFC4898 tcpEStatsPerfSegsIn */
// The following metrics are quite new and not in el7.
// ------------------------------------------------------------
uint32_t tcpi_notsent_bytes;
uint32_t tcpi_min_rtt;
uint32_t tcpi_data_segs_in; /* RFC4898 tcpEStatsDataSegsIn */
uint32_t tcpi_data_segs_out; /* RFC4898 tcpEStatsDataSegsOut */
// Calculated rate at which data was delivered.
uint64_t tcpi_delivery_rate;
// Timers for various states.
uint64_t tcpi_busy_time; /* Time (usec) busy sending data */
uint64_t tcpi_rwnd_limited; /* Time (usec) limited by receive window */
uint64_t tcpi_sndbuf_limited; /* Time (usec) limited by send buffer */
};
} // anonymous namespace
///
/// Connection
///
Connection::Connection(ReactorThread *reactor_thread,
Sockaddr remote,
unique_ptr<Socket> socket,
Direction direction,
CredentialsPolicy policy)
: reactor_thread_(reactor_thread),
remote_(remote),
socket_(std::move(socket)),
direction_(direction),
last_activity_time_(MonoTime::Now()),
is_epoll_registered_(false),
call_id_(std::numeric_limits<int32_t>::max()),
credentials_policy_(policy),
negotiation_complete_(false),
is_confidential_(false),
scheduled_for_shutdown_(false) {
}
Status Connection::SetNonBlocking(bool enabled) {
return socket_->SetNonBlocking(enabled);
}
Status Connection::SetTcpKeepAlive(int idle_time_s, int retry_time_s, int num_retries) {
DCHECK_GT(idle_time_s, 0);
DCHECK_GE(retry_time_s, 0);
DCHECK_GE(num_retries, 0);
return socket_->SetTcpKeepAlive(std::max(1, idle_time_s), std::max(0, retry_time_s),
std::max(0, num_retries));
}
void Connection::EpollRegister(ev::loop_ref& loop) {
DCHECK(reactor_thread_->IsCurrentThread());
DVLOG(4) << "Registering connection for epoll: " << ToString();
write_io_.set(loop);
write_io_.set(socket_->GetFd(), ev::WRITE);
write_io_.set<Connection, &Connection::WriteHandler>(this);
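// If negotiation has already completed, outbound transfers may be waiting in
// the queue, so arm the write watcher immediately; otherwise QueueOutbound()
// arms it on demand once negotiation finishes.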
if (direction_ == CLIENT && negotiation_complete_) {
write_io_.start();
}
read_io_.set(loop);
read_io_.set(socket_->GetFd(), ev::READ);
read_io_.set<Connection, &Connection::ReadHandler>(this);
read_io_.start();
is_epoll_registered_ = true;
}
Connection::~Connection() {
// Must clear the outbound_transfers_ list before deleting.
CHECK(outbound_transfers_.begin() == outbound_transfers_.end());
// It's crucial that the connection is Shutdown first -- otherwise
// our destructor will end up calling read_io_.stop() and write_io_.stop()
// from a possibly non-reactor thread context. This can then make all
// hell break loose with libev.
CHECK(!is_epoll_registered_);
}
bool Connection::Idle() const {
DCHECK(reactor_thread_->IsCurrentThread());
// check if we're in the middle of receiving something
InboundTransfer *transfer = inbound_.get();
if (transfer && (transfer->TransferStarted())) {
return false;
}
// check if we still need to send something
if (!outbound_transfers_.empty()) {
return false;
}
// can't kill a connection if calls are awaiting a response
if (!awaiting_response_.empty()) {
return false;
}
if (!calls_being_handled_.empty()) {
return false;
}
// We are not idle if we are in the middle of connection negotiation.
if (!negotiation_complete_) {
return false;
}
return true;
}
void Connection::Shutdown(const Status &status,
unique_ptr<ErrorStatusPB> rpc_error) {
DCHECK(reactor_thread_->IsCurrentThread());
shutdown_status_ = status.CloneAndPrepend("RPC connection failed");
if (inbound_ && inbound_->TransferStarted()) {
double secs_since_active =
(reactor_thread_->cur_time() - last_activity_time_).ToSeconds();
LOG(WARNING) << "Shutting down " << ToString()
<< " with pending inbound data ("
<< inbound_->StatusAsString() << ", last active "
<< HumanReadableElapsedTime::ToShortString(secs_since_active)
<< " ago, status=" << status.ToString() << ")";
}
// Clear any calls which have been sent and were awaiting a response.
for (const car_map_t::value_type &v : awaiting_response_) {
CallAwaitingResponse *c = v.second;
if (c->call) {
// Make sure every awaiting call receives the error info, if any.
unique_ptr<ErrorStatusPB> error;
if (rpc_error) {
error.reset(new ErrorStatusPB(*rpc_error));
}
c->call->SetFailed(status,
negotiation_complete_ ? Phase::REMOTE_CALL
: Phase::CONNECTION_NEGOTIATION,
std::move(error));
}
// And we must return the CallAwaitingResponse to the pool
car_pool_.Destroy(c);
}
awaiting_response_.clear();
// Clear any outbound transfers.
while (!outbound_transfers_.empty()) {
OutboundTransfer *t = &outbound_transfers_.front();
outbound_transfers_.pop_front();
delete t;
}
read_io_.stop();
write_io_.stop();
is_epoll_registered_ = false;
if (socket_) {
WARN_NOT_OK(socket_->Close(), "Error closing socket");
}
}
void Connection::QueueOutbound(unique_ptr<OutboundTransfer> transfer) {
DCHECK(reactor_thread_->IsCurrentThread());
if (!shutdown_status_.ok()) {
// If we've already shut down, then we just need to abort the
// transfer rather than bothering to queue it.
transfer->Abort(shutdown_status_);
return;
}
DVLOG(3) << "Queueing transfer: " << transfer->HexDump();
outbound_transfers_.push_back(*transfer.release());
if (negotiation_complete_ && !write_io_.is_active()) {
// Optimistically assume that the socket is writable if we didn't already
// have something queued.
if (ProcessOutboundTransfers() == kMoreToSend) {
write_io_.start();
}
}
}
Connection::CallAwaitingResponse::~CallAwaitingResponse() {
DCHECK(conn->reactor_thread_->IsCurrentThread());
}
void Connection::CallAwaitingResponse::HandleTimeout(ev::timer &watcher, int revents) {
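// A positive 'remaining_timeout' means this is the first stage of the two-stage
// timeout described in Connection::QueueOutboundCall(): rather than failing the
// call, re-arm the timer for the remaining time.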
if (remaining_timeout > 0) {
if (watcher.remaining() < -1.0) {
LOG(WARNING) << "RPC call timeout handler was delayed by "
<< -watcher.remaining() << "s! This may be due to a process-wide "
<< "pause such as swapping, logging-related delays, or allocator lock "
<< "contention. Will allow an additional "
<< remaining_timeout << "s for a response.";
}
watcher.set(remaining_timeout, 0);
watcher.start();
remaining_timeout = 0;
return;
}
conn->HandleOutboundCallTimeout(this);
}
void Connection::HandleOutboundCallTimeout(CallAwaitingResponse *car) {
DCHECK(reactor_thread_->IsCurrentThread());
if (!car->call) {
// The RPC may have been cancelled before the timeout was hit.
return;
}
// The timeout timer is stopped by the 'car' destructor when Connection::HandleCallResponse()
// exits.
DCHECK(!car->call->IsFinished());
// Mark the call object as failed.
car->call->SetTimedOut(negotiation_complete_ ? Phase::REMOTE_CALL
: Phase::CONNECTION_NEGOTIATION);
// Test cancellation when 'car->call' is in 'TIMED_OUT' state
MaybeInjectCancellation(car->call);
// Drop the reference to the call. If the original caller has moved on after
// seeing the timeout, we no longer need to hold onto the allocated memory
// from the request.
car->call.reset();
// We still leave the CallAwaitingResponse in the map -- this is because we may still
// receive a response from the server, and we don't want a spurious log message
// when we do finally receive the response. The fact that CallAwaitingResponse::call
// is a NULL pointer indicates to the response processing code that the call
// already timed out.
}
void Connection::CancelOutboundCall(const shared_ptr<OutboundCall> &call) {
CallAwaitingResponse* car = FindPtrOrNull(awaiting_response_, call->call_id());
if (car != nullptr) {
// car->call may be NULL if the call has timed out already.
DCHECK(!car->call || car->call.get() == call.get());
car->call.reset();
}
}
// Inject a cancellation when 'call' is in state 'FLAGS_rpc_inject_cancellation_state'.
void inline Connection::MaybeInjectCancellation(const shared_ptr<OutboundCall> &call) {
if (PREDICT_FALSE(call->ShouldInjectCancellation())) {
reactor_thread_->reactor()->messenger()->QueueCancellation(call);
}
}
// Callbacks after sending a call on the wire.
// This notifies the OutboundCall object to change its state to SENT once it
// has been fully transmitted.
struct CallTransferCallbacks : public TransferCallbacks {
public:
explicit CallTransferCallbacks(shared_ptr<OutboundCall> call,
Connection *conn)
: call_(std::move(call)), conn_(conn) {}
virtual void NotifyTransferFinished() OVERRIDE {
// TODO: would be better to cancel the transfer while it is still on the queue if we
// timed out before the transfer started, but there is still a race in the case of
// a partial send that we have to handle here
if (call_->IsFinished()) {
DCHECK(call_->IsTimedOut() || call_->IsCancelled());
} else {
call_->SetSent();
// Test cancellation when 'call_' is in 'SENT' state.
conn_->MaybeInjectCancellation(call_);
}
delete this;
}
virtual void NotifyTransferAborted(const Status &status) OVERRIDE {
VLOG(1) << "Transfer of RPC call " << call_->ToString() << " aborted: "
<< status.ToString();
delete this;
}
private:
shared_ptr<OutboundCall> call_;
Connection* conn_;
};
void Connection::QueueOutboundCall(shared_ptr<OutboundCall> call) {
DCHECK(call);
DCHECK_EQ(direction_, CLIENT);
DCHECK(reactor_thread_->IsCurrentThread());
if (PREDICT_FALSE(!shutdown_status_.ok())) {
// Already shut down.
call->SetFailed(shutdown_status_,
negotiation_complete_ ? Phase::REMOTE_CALL
: Phase::CONNECTION_NEGOTIATION);
return;
}
// At this point the call has a serialized request, but no call header, since we haven't
// yet assigned a call ID.
DCHECK(!call->call_id_assigned());
// We shouldn't reach this point if 'call' was requested to be cancelled.
DCHECK(!call->cancellation_requested());
// Assign the call ID.
int32_t call_id = GetNextCallId();
call->set_call_id(call_id);
// Serialize the actual bytes to be put on the wire.
TransferPayload tmp_slices;
call->SerializeTo(&tmp_slices);
call->SetQueued();
// Test cancellation when 'call' is in 'ON_OUTBOUND_QUEUE' state.
MaybeInjectCancellation(call);
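// Allocate the CallAwaitingResponse tracking object from the per-connection
// pool; the scoped pointer returns it to the pool automatically unless it is
// released into 'awaiting_response_' below.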
scoped_car car(car_pool_.make_scoped_ptr(car_pool_.Construct()));
car->conn = this;
car->call = call;
// Set up the timeout timer.
const MonoDelta &timeout = call->controller()->timeout();
if (timeout.Initialized()) {
reactor_thread_->RegisterTimeout(&car->timeout_timer);
car->timeout_timer.set<CallAwaitingResponse, // NOLINT(*)
&CallAwaitingResponse::HandleTimeout>(car.get());
// For calls with a timeout of at least 500ms, we actually run the timeout
// handler in two stages. The first timeout fires with a timeout 10% less
// than the user-specified one. It then schedules a second timeout for the
// remaining amount of time.
//
// The purpose of this two-stage timeout is to be more robust when the client
// has some process-wide pause, such as lock contention in tcmalloc, or a
// reactor callback that blocks in glog. Consider the following case:
//
// T = 0s user issues an RPC with 5 second timeout
// T = 0.5s - 6s process is blocked
// T = 6s process unblocks, and the timeout fires (1s late)
//
// Without the two-stage timeout, we would determine that the call had timed out,
// even though it's likely that the response is waiting on our TCP socket.
// With the two-stage timeout, we'll end up with:
//
// T = 0s user issues an RPC with 5 second timeout
// T = 0.5s - 6s process is blocked
// T = 6s process unblocks, and the first-stage timeout fires (1.5s late)
// T = 6s - 6.5s time for the client to read the response which is waiting
// T = 6.5s if the response was not actually available, we'll time out here
//
// We don't bother with this logic for calls with very short timeouts -- presumably
// a user setting such a short RPC timeout is well equipped to handle one.
double time = timeout.ToSeconds();
if (time >= 0.5) {
car->remaining_timeout = time * 0.1;
time -= car->remaining_timeout;
} else {
car->remaining_timeout = 0;
}
car->timeout_timer.set(time, 0);
car->timeout_timer.start();
}
TransferCallbacks *cb = new CallTransferCallbacks(std::move(call), this);
awaiting_response_[call_id] = car.release();
QueueOutbound(unique_ptr<OutboundTransfer>(
OutboundTransfer::CreateForCallRequest(call_id, tmp_slices, cb)));
}
// Callbacks for sending an RPC call response from the server.
// This takes ownership of the InboundCall object so that, once it has
// been responded to, we can free up all of the associated memory.
struct ResponseTransferCallbacks : public TransferCallbacks {
public:
ResponseTransferCallbacks(unique_ptr<InboundCall> call,
Connection *conn) :
call_(std::move(call)),
conn_(conn)
{}
~ResponseTransferCallbacks() {
// Remove the call from the map.
InboundCall *call_from_map = EraseKeyReturnValuePtr(
&conn_->calls_being_handled_, call_->call_id());
DCHECK_EQ(call_from_map, call_.get());
}
virtual void NotifyTransferFinished() OVERRIDE {
delete this;
}
virtual void NotifyTransferAborted(const Status &status) OVERRIDE {
LOG(WARNING) << "Connection torn down before " <<
call_->ToString() << " could send its response";
delete this;
}
private:
unique_ptr<InboundCall> call_;
Connection *conn_;
};
// Reactor task which puts a transfer on the outbound transfer queue.
class QueueTransferTask : public ReactorTask {
public:
QueueTransferTask(unique_ptr<OutboundTransfer> transfer,
Connection *conn)
: transfer_(std::move(transfer)),
conn_(conn)
{}
virtual void Run(ReactorThread *thr) OVERRIDE {
conn_->QueueOutbound(std::move(transfer_));
delete this;
}
virtual void Abort(const Status &status) OVERRIDE {
transfer_->Abort(status);
delete this;
}
private:
unique_ptr<OutboundTransfer> transfer_;
Connection *conn_;
};
void Connection::QueueResponseForCall(unique_ptr<InboundCall> call) {
// This is usually called by the IPC worker thread when the response
// is set, but in some circumstances may also be called by the
// reactor thread (e.g. if the service has shut down)
DCHECK_EQ(direction_, SERVER);
// If the connection is torn down, then the QueueOutbound() call that
// eventually runs in the reactor thread will take care of calling
// ResponseTransferCallbacks::NotifyTransferAborted.
TransferPayload tmp_slices;
call->SerializeResponseTo(&tmp_slices);
TransferCallbacks *cb = new ResponseTransferCallbacks(std::move(call), this);
// After the response is sent, can delete the InboundCall object.
// We set a dummy call ID and required feature set, since these are not needed
// when sending responses.
unique_ptr<OutboundTransfer> t(
OutboundTransfer::CreateForCallResponse(tmp_slices, cb));
QueueTransferTask *task = new QueueTransferTask(std::move(t), this);
reactor_thread_->reactor()->ScheduleReactorTask(task);
}
void Connection::set_confidential(bool is_confidential) {
is_confidential_ = is_confidential;
}
bool Connection::SatisfiesCredentialsPolicy(CredentialsPolicy policy) const {
DCHECK_EQ(direction_, CLIENT);
return (policy == CredentialsPolicy::ANY_CREDENTIALS) ||
(policy == credentials_policy_);
}
RpczStore* Connection::rpcz_store() {
return reactor_thread_->reactor()->messenger()->rpcz_store();
}
void Connection::ReadHandler(ev::io &watcher, int revents) {
DCHECK(reactor_thread_->IsCurrentThread());
DVLOG(3) << ToString() << " ReadHandler(revents=" << revents << ")";
if (revents & EV_ERROR) {
reactor_thread_->DestroyConnection(this, Status::NetworkError(ToString() +
": ReadHandler encountered an error"));
return;
}
last_activity_time_ = reactor_thread_->cur_time();
faststring extra_buf;
while (true) {
if (!inbound_) {
inbound_.reset(new InboundTransfer());
}
Status status = inbound_->ReceiveBuffer(socket_.get(), &extra_buf);
if (PREDICT_FALSE(!status.ok())) {
if (status.posix_code() == ESHUTDOWN) {
VLOG(1) << ToString() << " shut down by remote end.";
} else {
LOG(WARNING) << ToString() << " recv error: " << status.ToString();
}
reactor_thread_->DestroyConnection(this, status);
return;
}
if (!inbound_->TransferFinished()) {
DVLOG(3) << ToString() << ": read is not yet finished yet.";
return;
}
DVLOG(3) << ToString() << ": finished reading " << inbound_->data().size() << " bytes";
if (direction_ == CLIENT) {
HandleCallResponse(std::move(inbound_));
} else if (direction_ == SERVER) {
HandleIncomingCall(std::move(inbound_));
} else {
LOG(FATAL) << "Invalid direction: " << direction_;
}
if (extra_buf.size() > 0) {
inbound_.reset(new InboundTransfer(std::move(extra_buf)));
} else {
break;
}
}
}
void Connection::HandleIncomingCall(unique_ptr<InboundTransfer> transfer) {
DCHECK(reactor_thread_->IsCurrentThread());
unique_ptr<InboundCall> call(new InboundCall(this));
Status s = call->ParseFrom(std::move(transfer));
if (!s.ok()) {
LOG(WARNING) << ToString() << ": received bad data: " << s.ToString();
// TODO: shutdown? probably, since any future stuff on this socket will be
// "unsynchronized"
return;
}
if (!InsertIfNotPresent(&calls_being_handled_, call->call_id(), call.get())) {
LOG(WARNING) << ToString() << ": received call ID " << call->call_id() <<
" but was already processing this ID! Ignoring";
reactor_thread_->DestroyConnection(
this, Status::RuntimeError("Received duplicate call id",
Substitute("$0", call->call_id())));
return;
}
reactor_thread_->reactor()->messenger()->QueueInboundCall(std::move(call));
}
void Connection::HandleCallResponse(unique_ptr<InboundTransfer> transfer) {
DCHECK(reactor_thread_->IsCurrentThread());
unique_ptr<CallResponse> resp(new CallResponse);
CHECK_OK(resp->ParseFrom(std::move(transfer)));
CallAwaitingResponse *car_ptr =
EraseKeyReturnValuePtr(&awaiting_response_, resp->call_id());
if (PREDICT_FALSE(car_ptr == nullptr)) {
LOG(WARNING) << ToString() << ": Got a response for call id " << resp->call_id() << " which "
<< "was not pending! Ignoring.";
return;
}
// The car->timeout_timer ev::timer will be stopped automatically by its destructor.
scoped_car car(car_pool_.make_scoped_ptr(car_ptr));
if (PREDICT_FALSE(!car->call)) {
// The call already failed due to a timeout.
VLOG(1) << "Got response to call id " << resp->call_id() << " after client "
<< "already timed out or cancelled";
return;
}
car->call->SetResponse(std::move(resp));
// Test cancellation when 'car->call' is in 'FINISHED_SUCCESS' or 'FINISHED_ERROR' state.
MaybeInjectCancellation(car->call);
}
void Connection::WriteHandler(ev::io &watcher, int revents) {
DCHECK(reactor_thread_->IsCurrentThread());
if (revents & EV_ERROR) {
reactor_thread_->DestroyConnection(this, Status::NetworkError(ToString() +
": writeHandler encountered an error"));
return;
}
DVLOG(3) << ToString() << ": writeHandler: revents = " << revents;
if (outbound_transfers_.empty()) {
LOG(WARNING) << ToString() << " got a ready-to-write callback, but there is "
"nothing to write.";
write_io_.stop();
return;
}
if (ProcessOutboundTransfers() == kNoMoreToSend) {
write_io_.stop();
}
}
Connection::ProcessOutboundTransfersResult Connection::ProcessOutboundTransfers() {
while (!outbound_transfers_.empty()) {
OutboundTransfer* transfer = &(outbound_transfers_.front());
if (!transfer->TransferStarted()) {
if (transfer->is_for_outbound_call()) {
CallAwaitingResponse* car = FindOrDie(awaiting_response_, transfer->call_id());
if (!car->call) {
// If the call has already timed out or been cancelled, the 'call'
// field will have been set to NULL. In that case, don't bother sending it.
outbound_transfers_.pop_front();
transfer->Abort(Status::Aborted("already timed out or cancelled"));
delete transfer;
continue;
}
// If this is the start of the transfer, then check if the server has the
// required RPC flags. We have to wait until just before the transfer in
// order to ensure that the negotiation has taken place, so that the flags
// are available.
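// std::includes() requires both ranges to be sorted; std::set iterates in
// sorted order, so this checks that 'required_features' is a subset of
// 'remote_features_'.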
const set<RpcFeatureFlag>& required_features = car->call->required_rpc_features();
if (!includes(remote_features_.begin(), remote_features_.end(),
required_features.begin(), required_features.end())) {
outbound_transfers_.pop_front();
Status s = Status::NotSupported("server does not support the required RPC features");
transfer->Abort(s);
Phase phase = negotiation_complete_ ? Phase::REMOTE_CALL : Phase::CONNECTION_NEGOTIATION;
car->call->SetFailed(std::move(s), phase);
// Test cancellation when 'car->call' is in 'FINISHED_ERROR' state.
MaybeInjectCancellation(car->call);
car->call.reset();
delete transfer;
continue;
}
car->call->SetSending();
// Test cancellation when 'car->call' is in 'SENDING' state.
MaybeInjectCancellation(car->call);
}
}
last_activity_time_ = reactor_thread_->cur_time();
Status status = transfer->SendBuffer(socket_.get());
if (PREDICT_FALSE(!status.ok())) {
LOG(WARNING) << ToString() << " send error: " << status.ToString();
reactor_thread_->DestroyConnection(this, status);
return kConnectionDestroyed;
}
if (!transfer->TransferFinished()) {
DVLOG(3) << ToString() << ": writeHandler: xfer not finished.";
return kMoreToSend;
}
outbound_transfers_.pop_front();
delete transfer;
}
return kNoMoreToSend;
}
std::string Connection::ToString() const {
// This may be called from other threads, so we cannot
// include anything in the output about the current state,
// which might concurrently change from another thread.
return strings::Substitute(
"$0 $1",
direction_ == SERVER ? "server connection from" : "client connection to",
remote_.ToString());
}
// Reactor task that transitions this Connection from connection negotiation to
// regular RPC handling. Destroys Connection on negotiation error.
class NegotiationCompletedTask : public ReactorTask {
public:
NegotiationCompletedTask(Connection* conn,
Status negotiation_status,
std::unique_ptr<ErrorStatusPB> rpc_error)
: conn_(conn),
negotiation_status_(std::move(negotiation_status)),
rpc_error_(std::move(rpc_error)) {
}
virtual void Run(ReactorThread *rthread) OVERRIDE {
rthread->CompleteConnectionNegotiation(conn_,
negotiation_status_,
std::move(rpc_error_));
delete this;
}
virtual void Abort(const Status &status) OVERRIDE {
DCHECK(conn_->reactor_thread()->reactor()->closing());
VLOG(1) << "Failed connection negotiation due to shut down reactor thread: "
<< status.ToString();
delete this;
}
private:
scoped_refptr<Connection> conn_;
const Status negotiation_status_;
std::unique_ptr<ErrorStatusPB> rpc_error_;
};
void Connection::CompleteNegotiation(Status negotiation_status,
unique_ptr<ErrorStatusPB> rpc_error) {
auto task = new NegotiationCompletedTask(
this, std::move(negotiation_status), std::move(rpc_error));
reactor_thread_->reactor()->ScheduleReactorTask(task);
}
void Connection::MarkNegotiationComplete() {
DCHECK(reactor_thread_->IsCurrentThread());
negotiation_complete_ = true;
}
Status Connection::DumpPB(const DumpConnectionsRequestPB& req,
RpcConnectionPB* resp) {
DCHECK(reactor_thread_->IsCurrentThread());
resp->set_remote_ip(remote_.ToString());
if (negotiation_complete_) {
resp->set_state(RpcConnectionPB::OPEN);
} else {
resp->set_state(RpcConnectionPB::NEGOTIATING);
}
if (direction_ == CLIENT) {
for (const car_map_t::value_type& entry : awaiting_response_) {
CallAwaitingResponse* c = entry.second;
if (c->call) {
c->call->DumpPB(req, resp->add_calls_in_flight());
}
}
resp->set_outbound_queue_size(num_queued_outbound_transfers());
} else if (direction_ == SERVER) {
if (negotiation_complete_) {
// It's racy to dump credentials while negotiating, since the Connection
// object is owned by the negotiation thread at that point.
resp->set_remote_user_credentials(remote_user_.ToString());
}
for (const inbound_call_map_t::value_type& entry : calls_being_handled_) {
InboundCall* c = entry.second;
c->DumpPB(req, resp->add_calls_in_flight());
}
} else {
LOG(FATAL) << "Invalid direction: " << direction_;
}
#ifdef __linux__
if (negotiation_complete_ && remote_.is_ip()) {
// TODO(todd): it's a little strange to not set socket level stats during
// negotiation, but we don't have access to the socket here until negotiation
// is complete.
WARN_NOT_OK(GetSocketStatsPB(resp->mutable_socket_stats()),
"could not fill in TCP info for RPC connection");
}
#endif // __linux__
if (negotiation_complete_ && remote_.is_ip()) {
WARN_NOT_OK(GetTransportDetailsPB(resp->mutable_transport_details()),
"could not fill in transport info for RPC connection");
}
return Status::OK();
}
#ifdef __linux__
Status Connection::GetSocketStatsPB(SocketStatsPB* pb) const {
DCHECK(reactor_thread_->IsCurrentThread());
int fd = socket_->GetFd();
CHECK_GE(fd, 0);
// Fetch TCP_INFO statistics from the kernel.
tcp_info ti;
memset(&ti, 0, sizeof(ti));
socklen_t len = sizeof(ti);
int rc = getsockopt(fd, IPPROTO_TCP, TCP_INFO, &ti, &len);
if (rc == 0) {
# define HAS_FIELD(field_name) \
(len >= offsetof(tcp_info, field_name) + sizeof(ti.field_name))
if (!HAS_FIELD(tcpi_total_retrans)) {
// All the fields up through tcpi_total_retrans have been present since very old
// kernel versions, older than our minimum supported kernel. So, we can just bail
// if we don't get sufficient data back.
return Status::NotSupported("bad length returned for TCP_INFO");
}
pb->set_rtt(ti.tcpi_rtt);
pb->set_rttvar(ti.tcpi_rttvar);
pb->set_snd_cwnd(ti.tcpi_snd_cwnd);
pb->set_total_retrans(ti.tcpi_total_retrans);
// The following fields were added later in kernel development history.
// In RHEL6 they were backported starting in 6.8. Even though they were
// backported all together as a group, we'll just be safe and check for
// each individually.
if (HAS_FIELD(tcpi_pacing_rate)) {
pb->set_pacing_rate(ti.tcpi_pacing_rate);
}
if (HAS_FIELD(tcpi_max_pacing_rate)) {
pb->set_max_pacing_rate(ti.tcpi_max_pacing_rate);
}
if (HAS_FIELD(tcpi_bytes_acked)) {
pb->set_bytes_acked(ti.tcpi_bytes_acked);
}
if (HAS_FIELD(tcpi_bytes_received)) {
pb->set_bytes_received(ti.tcpi_bytes_received);
}
if (HAS_FIELD(tcpi_segs_out)) {
pb->set_segs_out(ti.tcpi_segs_out);
}
if (HAS_FIELD(tcpi_segs_in)) {
pb->set_segs_in(ti.tcpi_segs_in);
}
// Calculate sender bandwidth based on the same logic used by the 'ss' utility.
if (ti.tcpi_rtt > 0 && ti.tcpi_snd_mss && ti.tcpi_snd_cwnd) {
// Units:
// rtt = usec
// cwnd = number of MSS-size packets
// mss = bytes / packet
//
// Dimensional analysis:
// packets * bytes/packet * usecs/sec / usec -> bytes/sec
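//
// For example (hypothetical values): cwnd = 10 packets, mss = 1448 bytes, and
// rtt = 10000 usec yield 10 * 1448 * 1000000 / 10000 ~= 1.45 MB/sec.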
static constexpr int kUsecsPerSec = 1000000;
pb->set_send_bytes_per_sec(static_cast<int64_t>(ti.tcpi_snd_cwnd) *
ti.tcpi_snd_mss * kUsecsPerSec / ti.tcpi_rtt);
}
}
// Fetch the queue sizes.
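// TIOCOUTQ reports the number of bytes of unsent data in the send queue;
// FIONREAD reports the number of bytes available to read from the receive queue.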
int queue_len = 0;
rc = ioctl(fd, TIOCOUTQ, &queue_len);
if (rc == 0) {
pb->set_send_queue_bytes(queue_len);
}
rc = ioctl(fd, FIONREAD, &queue_len);
if (rc == 0) {
pb->set_receive_queue_bytes(queue_len);
}
return Status::OK();
}
#endif // __linux__
Status Connection::GetTransportDetailsPB(TransportDetailsPB* pb) const {
DCHECK(reactor_thread_->IsCurrentThread());
DCHECK(pb);
// As for the dynamic_cast below: this is not very elegant or performant code,
// but introducing a generic virtual method with vague semantics into the base
// Socket class doesn't look like a good choice either. Also, the
// GetTransportDetailsPB() method isn't supposed to be a part of any hot path.
const TlsSocket* tls_socket = dynamic_cast<TlsSocket*>(socket_.get());
if (tls_socket) {
auto* tls = pb->mutable_tls();
tls->set_protocol(tls_socket->GetProtocolName());
tls->set_cipher_suite(tls_socket->GetCipherDescription());
}
int fd = socket_->GetFd();
CHECK_GE(fd, 0);
int32_t max_seg_size = 0;
socklen_t optlen = sizeof(max_seg_size);
int ret = ::getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &max_seg_size, &optlen);
if (ret) {
int err = errno;
return Status::NetworkError(
"getsockopt(TCP_MAXSEG) failed", ErrnoToString(err), err);
}
pb->mutable_tcp()->set_max_segment_size(max_seg_size);
return Status::OK();
}
} // namespace rpc
} // namespace kudu