// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "kudu/rpc/reactor.h"
#include <openssl/crypto.h>
#include <openssl/err.h> // IWYU pragma: keep
#include <sys/socket.h>
#include <cerrno>
#include <functional>
#include <memory>
#include <mutex>
#include <ostream>
#include <string>
#include <utility>
#include <boost/intrusive/list.hpp>
#include <ev++.h>
#include <ev.h>
#include <gflags/gflags.h>
#include <glog/logging.h>
#include "kudu/gutil/port.h"
#include "kudu/gutil/ref_counted.h"
#include "kudu/gutil/stringprintf.h"
#include "kudu/gutil/strings/substitute.h"
#include "kudu/gutil/sysinfo.h"
#include "kudu/gutil/walltime.h"
#include "kudu/rpc/connection.h"
#include "kudu/rpc/messenger.h"
#include "kudu/rpc/negotiation.h"
#include "kudu/rpc/outbound_call.h"
#include "kudu/rpc/rpc_controller.h"
#include "kudu/rpc/rpc_introspection.pb.h"
#include "kudu/util/countdown_latch.h"
#include "kudu/util/debug/sanitizer_scopes.h"
#include "kudu/util/flag_tags.h"
#include "kudu/util/metrics.h"
#include "kudu/util/monotime.h"
#include "kudu/util/net/sockaddr.h"
#include "kudu/util/net/socket.h"
#include "kudu/util/random_util.h"
#include "kudu/util/status.h"
#include "kudu/util/thread.h"
#include "kudu/util/thread_restrictions.h"
#include "kudu/util/threadpool.h"
#include "kudu/util/trace.h"
// When compiling on Mac OS X, use 'kqueue' instead of the default, 'select', for the event loop.
// Otherwise we run into problems because 'select' can't handle connections when more than 1024
// file descriptors are open by the process.
#if defined(__APPLE__)
static const int kDefaultLibEvFlags = ev::KQUEUE;
#else
static const int kDefaultLibEvFlags = ev::AUTO;
#endif
using std::string;
using std::shared_ptr;
using std::unique_ptr;
using strings::Substitute;
DEFINE_bool(rpc_reopen_outbound_connections, false,
"Open a new connection to the server for every RPC call. "
"If not enabled, an already existing connection to a "
"server is reused upon making another call to the same server. "
"When this flag is enabled, an already existing _idle_ connection "
"to the server is closed upon making another RPC call which would "
"reuse the connection otherwise. "
"Used by tests only.");
TAG_FLAG(rpc_reopen_outbound_connections, unsafe);
TAG_FLAG(rpc_reopen_outbound_connections, runtime);
DEFINE_int32(tcp_keepalive_probe_period_s, 60,
"The duration in seconds after an outbound connection has gone idle "
"before a TCP keepalive probe is sent to the peer. Set to 0 to disable "
"TCP keepalive probes from being sent.");
DEFINE_int32(tcp_keepalive_retry_period_s, 3,
"The duration in seconds between successive keepalive probes from an "
"outbound connection if the previous probes are not acknowledged. "
"Effective only if --tcp_keepalive_probe_period_s is not 0.");
DEFINE_int32(tcp_keepalive_retry_count, 10,
"The maximum number of keepalive probes sent before declaring the remote "
"end as dead. Effective only if --tcp_keepalive_probe_period_s is not 0.");
TAG_FLAG(tcp_keepalive_probe_period_s, advanced);
TAG_FLAG(tcp_keepalive_retry_period_s, advanced);
TAG_FLAG(tcp_keepalive_retry_count, advanced);
METRIC_DEFINE_histogram(server, reactor_load_percent,
"Reactor Thread Load Percentage",
kudu::MetricUnit::kUnits,
"The percentage of time that the reactor is busy "
"(not blocked awaiting network activity). If this metric "
"shows significant samples nears 100%, increasing the "
"number of reactors may be beneficial.",
kudu::MetricLevel::kInfo,
100, 2);
METRIC_DEFINE_histogram(server, reactor_active_latency_us,
"Reactor Thread Active Latency",
kudu::MetricUnit::kMicroseconds,
"Histogram of the wall clock time for reactor thread wake-ups. "
"The reactor thread is responsible for all network I/O and "
"therefore outliers in this latency histogram directly contribute "
"to the latency of both inbound and outbound RPCs.",
kudu::MetricLevel::kInfo,
1000000, 2);
namespace kudu {
namespace rpc {
namespace {
Status ShutdownError(bool aborted) {
const char* msg = "reactor is shutting down";
return aborted ?
Status::Aborted(msg, "", ESHUTDOWN) :
Status::ServiceUnavailable(msg, "", ESHUTDOWN);
}
// Callback for libev fatal errors (e.g. running out of file descriptors).
// Unfortunately libev doesn't plumb these back through to the caller, but
// instead just expects the callback to abort.
//
// This implementation is slightly preferable to the built-in one since
// it uses a FATAL log message instead of printing to stderr, which might
// not end up anywhere useful in a daemonized context.
void LibevSysErr(const char* msg) throw() {
PLOG(FATAL) << "LibEV fatal error: " << msg;
}
void DoInitLibEv() {
ev::set_syserr_cb(LibevSysErr);
}
} // anonymous namespace
ReactorThread::ReactorThread(Reactor* reactor, const MessengerBuilder& bld)
: loop_(kDefaultLibEvFlags),
cur_time_(MonoTime::Now()),
last_unused_tcp_scan_(cur_time_),
reactor_(reactor),
connection_keepalive_time_(bld.connection_keepalive_time_),
coarse_timer_granularity_(bld.coarse_timer_granularity_),
total_client_conns_cnt_(0),
total_server_conns_cnt_(0),
rng_(GetRandomSeed32()) {
if (bld.metric_entity_) {
invoke_us_histogram_ =
METRIC_reactor_active_latency_us.Instantiate(bld.metric_entity_);
load_percent_histogram_ =
METRIC_reactor_load_percent.Instantiate(bld.metric_entity_);
}
}
Status ReactorThread::Init() {
DCHECK(thread_.get() == nullptr) << "Already started";
DVLOG(6) << "Called ReactorThread::Init()";
// Register to get async notifications in our epoll loop.
async_.set(loop_);
async_.set<ReactorThread, &ReactorThread::AsyncHandler>(this); // NOLINT(*)
async_.start();
// Register the timer watcher.
// The timer is used for closing old TCP connections and applying
// backpressure.
timer_.set(loop_);
timer_.set<ReactorThread, &ReactorThread::TimerHandler>(this); // NOLINT(*)
timer_.start(coarse_timer_granularity_.ToSeconds(),
coarse_timer_granularity_.ToSeconds());
// Register our callbacks. ev++ doesn't provide handy wrappers for these.
ev_set_userdata(loop_, this);
ev_set_loop_release_cb(loop_, &ReactorThread::AboutToPollCb, &ReactorThread::PollCompleteCb);
ev_set_invoke_pending_cb(loop_, &ReactorThread::InvokePendingCb);
// Create Reactor thread.
return kudu::Thread::Create("reactor", "rpc reactor",
[this]() { this->RunThread(); }, &thread_);
}
void ReactorThread::InvokePendingCb(struct ev_loop* loop) {
// Calculate the number of cycles spent calling our callbacks.
// This is called quite frequently so we use CycleClock rather than MonoTime
// since it's a bit faster.
int64_t start = CycleClock::Now();
ev_invoke_pending(loop);
int64_t dur_cycles = CycleClock::Now() - start;
// Contribute this to our histogram.
ReactorThread* thr = static_cast<ReactorThread*>(ev_userdata(loop));
if (thr->invoke_us_histogram_) {
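// Convert the duration from CPU cycles to microseconds before recording.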
thr->invoke_us_histogram_->Increment(dur_cycles * 1000000 / base::CyclesPerSecond());
}
}
void ReactorThread::AboutToPollCb(struct ev_loop* loop) noexcept {
// Store the current time in a member variable to be picked up below
// in PollCompleteCb.
ReactorThread* thr = static_cast<ReactorThread*>(ev_userdata(loop));
thr->cycle_clock_before_poll_ = CycleClock::Now();
}
void ReactorThread::PollCompleteCb(struct ev_loop* loop) noexcept {
// First things first, capture the time so that this is as accurate as possible.
int64_t cycle_clock_after_poll = CycleClock::Now();
// Record it in our accounting.
ReactorThread* thr = static_cast<ReactorThread*>(ev_userdata(loop));
DCHECK_NE(thr->cycle_clock_before_poll_, -1)
<< "PollCompleteCb called without corresponding AboutToPollCb";
int64_t poll_cycles = cycle_clock_after_poll - thr->cycle_clock_before_poll_;
thr->cycle_clock_before_poll_ = -1;
thr->total_poll_cycles_ += poll_cycles;
}
void ReactorThread::Shutdown(Messenger::ShutdownMode mode) {
CHECK(reactor_->closing()) << "Should be called after setting closing_ flag";
VLOG(1) << name() << ": shutting down Reactor thread.";
WakeThread();
if (mode == Messenger::ShutdownMode::SYNC) {
// Join() will return a bad status if asked to join on the currently
// running thread.
CHECK_OK(ThreadJoiner(thread_.get()).Join());
}
}
void ReactorThread::ShutdownInternal() {
DCHECK(IsCurrentThread());
// Tear down any outbound TCP connections.
Status service_unavailable = ShutdownError(false);
VLOG(1) << name() << ": tearing down outbound TCP connections...";
for (const auto& elem : client_conns_) {
const auto& conn = elem.second;
VLOG(1) << name() << ": shutting down " << conn->ToString();
conn->Shutdown(service_unavailable);
}
client_conns_.clear();
// Tear down any inbound TCP connections.
VLOG(1) << name() << ": tearing down inbound TCP connections...";
for (const auto& conn : server_conns_) {
VLOG(1) << name() << ": shutting down " << conn->ToString();
conn->Shutdown(service_unavailable);
}
server_conns_.clear();
// Abort any scheduled tasks.
//
// These won't be found in the ReactorThread's list of pending tasks
// because they've been "run" (that is, they've been scheduled).
Status aborted = ShutdownError(true); // aborted
while (!scheduled_tasks_.empty()) {
DelayedTask* t = &scheduled_tasks_.front();
scheduled_tasks_.pop_front();
t->Abort(aborted); // should also free the task.
}
// Remove the OpenSSL thread state.
//
// As of OpenSSL 1.1, this [1] is a no-op and can be ignored.
//
// 1. https://www.openssl.org/docs/man1.1.0/crypto/ERR_remove_thread_state.html
#if OPENSSL_VERSION_NUMBER < 0x10100000L
ERR_remove_thread_state(nullptr);
#endif
}
ReactorTask::ReactorTask() = default;
ReactorTask::~ReactorTask() = default;
Status ReactorThread::GetMetrics(ReactorMetrics* metrics) {
DCHECK(IsCurrentThread());
metrics->num_client_connections_ = client_conns_.size();
metrics->num_server_connections_ = server_conns_.size();
metrics->total_client_connections_ = total_client_conns_cnt_;
metrics->total_server_connections_ = total_server_conns_cnt_;
return Status::OK();
}
Status ReactorThread::DumpConnections(const DumpConnectionsRequestPB& req,
DumpConnectionsResponsePB* resp) {
DCHECK(IsCurrentThread());
for (const scoped_refptr<Connection>& conn : server_conns_) {
RETURN_NOT_OK(conn->DumpPB(req, resp->add_inbound_connections()));
}
for (const conn_multimap_t::value_type& entry : client_conns_) {
Connection* conn = entry.second.get();
RETURN_NOT_OK(conn->DumpPB(req, resp->add_outbound_connections()));
}
return Status::OK();
}
void ReactorThread::WakeThread() {
// libev uses some lock-free synchronization, but doesn't have TSAN annotations.
// See http://lists.schmorp.de/pipermail/libev/2013q2/002178.html or KUDU-366
// for examples.
debug::ScopedTSANIgnoreReadsAndWrites ignore_tsan;
async_.send();
}
// Handle async events. These events are sent to the reactor by other
// threads that want to bring something to our attention, like the fact that
// we're shutting down, or the fact that there is a new outbound Transfer
// ready to send.
void ReactorThread::AsyncHandler(ev::async& /*watcher*/, int /*revents*/) {
DCHECK(IsCurrentThread());
if (PREDICT_FALSE(reactor_->closing())) {
ShutdownInternal();
loop_.break_loop(); // break the epoll loop and terminate the thread
return;
}
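// Drain all pending tasks into a local list (under the reactor lock inside
// DrainTaskQueue()), then run them here without holding the lock.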
boost::intrusive::list<ReactorTask> tasks;
reactor_->DrainTaskQueue(&tasks);
while (!tasks.empty()) {
ReactorTask& task = tasks.front();
tasks.pop_front();
task.Run(this);
}
}
void ReactorThread::RegisterConnection(scoped_refptr<Connection> conn) {
DCHECK(IsCurrentThread());
Status s = StartConnectionNegotiation(conn);
if (PREDICT_FALSE(!s.ok())) {
LOG(ERROR) << "Server connection negotiation failed: " << s.ToString();
DestroyConnection(conn.get(), s);
return;
}
++total_server_conns_cnt_;
server_conns_.emplace_back(std::move(conn));
}
void ReactorThread::AssignOutboundCall(shared_ptr<OutboundCall> call) {
DCHECK(IsCurrentThread());
// Skip if the outbound call has been cancelled already.
if (PREDICT_FALSE(call->IsCancelled())) {
return;
}
scoped_refptr<Connection> conn;
Status s = FindOrStartConnection(call->conn_id(),
call->controller()->credentials_policy(),
&conn);
if (PREDICT_FALSE(!s.ok())) {
call->SetFailed(std::move(s), OutboundCall::Phase::CONNECTION_NEGOTIATION);
return;
}
conn->QueueOutboundCall(std::move(call));
}
void ReactorThread::CancelOutboundCall(const shared_ptr<OutboundCall>& call) {
DCHECK(IsCurrentThread());
// If the callback has been invoked already, the cancellation is a no-op.
// The controller may be gone already if the callback has been invoked.
if (call->IsFinished()) {
return;
}
scoped_refptr<Connection> conn;
if (FindConnection(call->conn_id(),
call->controller()->credentials_policy(),
&conn)) {
conn->CancelOutboundCall(call);
}
call->Cancel();
}
//
// Handles timer events. The periodic timer:
//
// 1. updates Reactor::cur_time_
// 2. closes down server-side connections that have been idle longer than
// connection_keepalive_time_ (see ScanIdleConnections())
//
void ReactorThread::TimerHandler(ev::timer& /*watcher*/, int revents) {
DCHECK(IsCurrentThread());
if (EV_ERROR & revents) {
LOG(WARNING) << "Reactor " << name() << " got an error in "
"the timer handler.";
return;
}
cur_time_ = MonoTime::Now();
// Compute load percentage.
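// The load is the fraction of cycles since the last measurement that the
// thread spent outside of the poll call:
// active_fraction = 1 - (poll cycles delta / total cycles delta).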
int64_t now_cycles = CycleClock::Now();
if (last_load_measurement_.time_cycles != -1) {
int64_t cycles_delta = (now_cycles - last_load_measurement_.time_cycles);
int64_t poll_cycles_delta = total_poll_cycles_ - last_load_measurement_.poll_cycles;
double poll_fraction = static_cast<double>(poll_cycles_delta) / cycles_delta;
double active_fraction = 1 - poll_fraction;
if (load_percent_histogram_) {
load_percent_histogram_->Increment(static_cast<int>(active_fraction * 100));
}
}
last_load_measurement_.time_cycles = now_cycles;
last_load_measurement_.poll_cycles = total_poll_cycles_;
ScanIdleConnections();
}
void ReactorThread::RegisterTimeout(ev::timer* watcher) {
watcher->set(loop_);
}
void ReactorThread::ScanIdleConnections() {
DCHECK(IsCurrentThread());
// Enforce TCP connection timeouts: server-side connections.
const auto server_conns_end = server_conns_.end();
uint64_t timed_out = 0;
// Scan for idle server connections if keepalive is enabled (a negative
// keepalive time disables the scan).
if (connection_keepalive_time_ >= MonoDelta::FromMilliseconds(0)) {
for (auto it = server_conns_.begin(); it != server_conns_end; ) {
Connection* conn = it->get();
if (!conn->Idle()) {
VLOG(10) << "Connection " << conn->ToString() << " not idle";
++it;
continue;
}
const MonoDelta connection_delta(cur_time_ - conn->last_activity_time());
if (connection_delta <= connection_keepalive_time_) {
++it;
continue;
}
conn->Shutdown(Status::NetworkError(
Substitute("connection timed out after $0", connection_keepalive_time_.ToString())));
VLOG(1) << "Timing out connection " << conn->ToString() << " - it has been idle for "
<< connection_delta.ToString();
++timed_out;
it = server_conns_.erase(it);
}
}
// Take care of idle client-side connections marked for shutdown.
uint64_t shutdown = 0;
for (auto it = client_conns_.begin(); it != client_conns_.end();) {
Connection* conn = it->second.get();
if (conn->scheduled_for_shutdown() && conn->Idle()) {
conn->Shutdown(Status::NetworkError(
"connection has been marked for shutdown"));
it = client_conns_.erase(it);
++shutdown;
} else {
++it;
}
}
// TODO(aserbin): clients may want to set their keepalive timeout for idle
// but not scheduled for shutdown connections.
VLOG_IF(1, timed_out > 0) << name() << ": timed out " << timed_out << " TCP connections.";
VLOG_IF(1, shutdown > 0) << name() << ": shut down " << shutdown << " TCP connections.";
}
const string& ReactorThread::name() const {
return reactor_->name();
}
MonoTime ReactorThread::cur_time() const {
return cur_time_;
}
Reactor* ReactorThread::reactor() {
return reactor_;
}
bool ReactorThread::IsCurrentThread() const {
return thread_.get() == kudu::Thread::current_thread();
}
void ReactorThread::RunThread() {
ThreadRestrictions::SetWaitAllowed(false);
ThreadRestrictions::SetIOAllowed(false);
DVLOG(6) << "Calling ReactorThread::RunThread()...";
loop_.run(0);
VLOG(1) << name() << " thread exiting.";
// No longer need the messenger. This causes the messenger to
// get deleted when all the reactors exit.
reactor_->messenger_.reset();
}
bool ReactorThread::FindConnection(const ConnectionId& conn_id,
CredentialsPolicy cred_policy,
scoped_refptr<Connection>* conn) {
DCHECK(IsCurrentThread());
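// 'client_conns_' is a multimap: there may be several connections to the same
// remote, e.g. with different credentials policies or already marked for
// shutdown.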
const auto range = client_conns_.equal_range(conn_id);
scoped_refptr<Connection> found_conn;
for (auto it = range.first; it != range.second;) {
const auto& c = it->second.get();
// * Do not use connections scheduled for shutdown to place new calls.
//
// * Do not use a connection with a non-compliant credentials policy.
// Instead, open a new one, while marking the former as scheduled for
// shutdown. This process converges: any connection that satisfies the
// PRIMARY_CREDENTIALS policy automatically satisfies the ANY_CREDENTIALS
// policy as well. The idea is to keep only one usable connection
// identified by the specified 'conn_id'.
//
// * If the test-only 'one-connection-per-RPC' mode is enabled, connections
// are re-established at every RPC call.
if (c->scheduled_for_shutdown() ||
!c->SatisfiesCredentialsPolicy(cred_policy) ||
PREDICT_FALSE(FLAGS_rpc_reopen_outbound_connections)) {
if (c->Idle()) {
// Shut down idle connections to the target destination. Non-idle ones
// will be taken care of later by the idle connection scanner.
DCHECK_EQ(Connection::CLIENT, c->direction());
c->Shutdown(Status::NetworkError("connection is closed due to non-reuse policy"));
it = client_conns_.erase(it);
continue;
}
c->set_scheduled_for_shutdown();
} else {
DCHECK(!found_conn);
found_conn = c;
// An appropriate connection has been found; continue iterating to mark
// the remaining connections for shutdown if they do not satisfy the
// policy.
}
++it;
}
if (found_conn) {
// Found a matching connection that is not scheduled for shutdown: return it as the result.
conn->swap(found_conn);
return true;
}
return false;
}
Status ReactorThread::FindOrStartConnection(const ConnectionId& conn_id,
CredentialsPolicy cred_policy,
scoped_refptr<Connection>* conn) {
DCHECK(IsCurrentThread());
if (FindConnection(conn_id, cred_policy, conn)) {
return Status::OK();
}
// No connection to this remote. Need to create one.
VLOG(2) << name() << " FindOrStartConnection: creating "
<< "new connection for " << conn_id.remote().ToString();
// Create a new socket and start connecting to the remote.
Socket sock;
RETURN_NOT_OK(CreateClientSocket(conn_id.remote().family(), &sock));
RETURN_NOT_OK(StartConnect(&sock, conn_id.remote()));
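// Transfer ownership of the connected file descriptor to a heap-allocated
// Socket owned by the new connection.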
unique_ptr<Socket> new_socket(new Socket(sock.Release()));
// Register the new connection in our map.
*conn = new Connection(
this, conn_id.remote(), std::move(new_socket), Connection::CLIENT, cred_policy);
(*conn)->set_outbound_connection_id(conn_id);
// Kick off blocking client connection negotiation.
Status s = StartConnectionNegotiation(*conn);
if (s.IsIllegalState()) {
// Return a nicer error message to the user; if we just forwarded the
// status, we'd get something generic like "ThreadPool is closing".
return Status::ServiceUnavailable("Client RPC Messenger shutting down");
}
// Propagate any other errors as-is.
RETURN_NOT_OK_PREPEND(s, "Unable to start connection negotiation thread");
// Insert into the client connection map to avoid duplicate connection requests.
client_conns_.emplace(conn_id, *conn);
++total_client_conns_cnt_;
return Status::OK();
}
Status ReactorThread::StartConnectionNegotiation(const scoped_refptr<Connection>& conn) {
DCHECK(IsCurrentThread());
// Set a limit on how long the server will negotiate with a new client.
MonoTime deadline = MonoTime::Now() +
MonoDelta::FromMilliseconds(reactor()->messenger()->rpc_negotiation_timeout_ms());
scoped_refptr<Trace> trace(new Trace());
ADOPT_TRACE(trace.get());
TRACE("Submitting negotiation task for $0", conn->ToString());
auto authentication = reactor()->messenger()->authentication();
auto encryption = reactor()->messenger()->encryption();
auto loopback_encryption = reactor()->messenger()->loopback_encryption();
ThreadPool* negotiation_pool =
reactor()->messenger()->negotiation_pool(conn->direction());
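// Negotiation may block on network I/O, so it runs on a separate thread
// pool; blocking I/O is not allowed on the reactor thread itself (see
// RunThread(), which calls ThreadRestrictions::SetIOAllowed(false)).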
RETURN_NOT_OK(negotiation_pool->Submit([conn, authentication, encryption, loopback_encryption,
deadline]() {
Negotiation::RunNegotiation(conn, authentication, encryption, loopback_encryption,
deadline);
}));
return Status::OK();
}
void ReactorThread::CompleteConnectionNegotiation(
const scoped_refptr<Connection>& conn,
const Status& status,
unique_ptr<ErrorStatusPB> rpc_error) {
DCHECK(IsCurrentThread());
if (PREDICT_FALSE(!status.ok())) {
DestroyConnection(conn.get(), status, std::move(rpc_error));
return;
}
// Switch the socket back to non-blocking mode after negotiation.
Status s = conn->SetNonBlocking(true);
if (PREDICT_FALSE(!s.ok())) {
LOG(DFATAL) << "Unable to set connection to non-blocking mode: " << s.ToString();
DestroyConnection(conn.get(), s, std::move(rpc_error));
return;
}
if (conn->remote().is_ip() &&
FLAGS_tcp_keepalive_probe_period_s > 0) {
// Try spreading out the idle poll period to avoid a thundering herd in case
// connections are all created at the same time (e.g. after a cluster is restarted).
Status keepalive_status = conn->SetTcpKeepAlive(
FLAGS_tcp_keepalive_probe_period_s + rng_.Uniform32(4),
FLAGS_tcp_keepalive_retry_period_s, FLAGS_tcp_keepalive_retry_count);
if (PREDICT_FALSE(!keepalive_status.ok())) {
LOG(DFATAL) << "Unable to set TCP keepalive for connection: "
<< keepalive_status.ToString();
DestroyConnection(conn.get(), keepalive_status, std::move(rpc_error));
return;
}
}
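// Negotiation succeeded: mark the connection as ready and register it with
// the libev event loop so it starts receiving read/write events.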
conn->MarkNegotiationComplete();
conn->EpollRegister(loop_);
}
Status ReactorThread::CreateClientSocket(int family, Socket* sock) {
Status ret = sock->Init(family, Socket::FLAG_NONBLOCKING);
if (ret.ok() && family == AF_INET) {
ret = sock->SetNoDelay(true);
}
LOG_IF(WARNING, !ret.ok())
<< "failed to create an outbound connection because a new socket could not be created: "
<< ret.ToString();
return ret;
}
Status ReactorThread::StartConnect(Socket* sock, const Sockaddr& remote) {
const Status ret = sock->Connect(remote);
if (ret.ok()) {
VLOG(3) << "StartConnect: connect finished immediately for " << remote.ToString();
return Status::OK();
}
int posix_code = ret.posix_code();
if (Socket::IsTemporarySocketError(posix_code) || posix_code == EINPROGRESS) {
VLOG(3) << "StartConnect: connect in progress for " << remote.ToString();
return Status::OK();
}
LOG(WARNING) << "Failed to create an outbound connection to " << remote.ToString()
<< " because connect() failed: " << ret.ToString();
return ret;
}
void ReactorThread::DestroyConnection(Connection* conn,
const Status& conn_status,
unique_ptr<ErrorStatusPB> rpc_error) {
DCHECK(IsCurrentThread());
conn->Shutdown(conn_status, std::move(rpc_error));
// Unlink connection from lists.
if (conn->direction() == Connection::CLIENT) {
const auto range = client_conns_.equal_range(conn->outbound_connection_id());
CHECK(range.first != range.second) << "Couldn't find connection " << conn->ToString();
// The client_conns_ container is a multi-map.
for (auto it = range.first; it != range.second;) {
if (it->second.get() == conn) {
it = client_conns_.erase(it);
break;
}
++it;
}
} else if (conn->direction() == Connection::SERVER) {
auto it = server_conns_.begin();
while (it != server_conns_.end()) {
if ((*it).get() == conn) {
server_conns_.erase(it);
break;
}
++it;
}
}
}
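// A minimal usage sketch (hypothetical, for illustration only): a DelayedTask
// is an ordinary ReactorTask, so scheduling it on a reactor runs Run() on the
// reactor thread, which arms the timer; the task frees itself after its
// function runs or the task is aborted. For example:
//
//   reactor->ScheduleReactorTask(new DelayedTask(
//       [](const Status& s) { LOG(INFO) << "timer fired: " << s.ToString(); },
//       MonoDelta::FromSeconds(1)));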
DelayedTask::DelayedTask(std::function<void(const Status&)> func,
MonoDelta when)
: func_(std::move(func)),
when_(when),
thread_(nullptr) {
}
void DelayedTask::Run(ReactorThread* thread) {
DCHECK(thread_ == nullptr) << "Task has already been scheduled";
DCHECK(thread->IsCurrentThread());
DCHECK(!is_linked()) << "Should not be linked on pending_tasks_ anymore";
// Schedule the task to run later.
thread_ = thread;
timer_.set(thread->loop_);
timer_.set<DelayedTask, &DelayedTask::TimerHandler>(this); // NOLINT(*)
timer_.start(when_.ToSeconds(), // after
0); // repeat
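// Track this task in 'scheduled_tasks_' so it can be aborted if the reactor
// shuts down before the timer fires.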
thread_->scheduled_tasks_.push_back(*this);
}
void DelayedTask::Abort(const Status& abort_status) {
func_(abort_status);
delete this;
}
void DelayedTask::TimerHandler(ev::timer& /*watcher*/, int revents) {
DCHECK(is_linked()) << "should be linked on scheduled_tasks_";
// We will free this task's memory.
thread_->scheduled_tasks_.erase(thread_->scheduled_tasks_.iterator_to(*this));
if (EV_ERROR & revents) {
string msg = "Delayed task got an error in its timer handler";
LOG(WARNING) << msg;
Abort(Status::Aborted(msg)); // Will delete 'this'.
} else {
func_(Status::OK());
delete this;
}
}
Reactor::Reactor(shared_ptr<Messenger> messenger,
int index, const MessengerBuilder& bld)
: messenger_(std::move(messenger)),
name_(StringPrintf("%s_R%03d", messenger_->name().c_str(), index)),
closing_(false),
thread_(this, bld) {
static std::once_flag libev_once;
std::call_once(libev_once, DoInitLibEv);
}
Status Reactor::Init() {
DVLOG(6) << "Called Reactor::Init()";
return thread_.Init();
}
void Reactor::Shutdown(Messenger::ShutdownMode mode) {
{
std::lock_guard<LockType> l(lock_);
if (closing_) {
return;
}
closing_ = true;
}
thread_.Shutdown(mode);
// Abort all pending tasks. No new tasks can get scheduled after this
// because ScheduleReactorTask() tests the closing_ flag set above.
Status aborted = ShutdownError(true);
while (!pending_tasks_.empty()) {
ReactorTask& task = pending_tasks_.front();
pending_tasks_.pop_front();
task.Abort(aborted);
}
}
Reactor::~Reactor() {
Shutdown(Messenger::ShutdownMode::ASYNC);
}
const string& Reactor::name() const {
return name_;
}
bool Reactor::closing() const {
std::lock_guard<LockType> l(lock_);
return closing_;
}
// Task to call an arbitrary function within the reactor thread.
class RunFunctionTask : public ReactorTask {
public:
explicit RunFunctionTask(std::function<Status()> f)
: function_(std::move(f)), latch_(1) {}
void Run(ReactorThread* /*reactor*/) override {
status_ = function_();
latch_.CountDown();
}
void Abort(const Status& status) override {
status_ = status;
latch_.CountDown();
}
// Wait until the function has completed, and return the Status
// returned by the function.
Status Wait() {
latch_.Wait();
return status_;
}
private:
const std::function<Status()> function_;
Status status_;
CountDownLatch latch_;
};
Status Reactor::GetMetrics(ReactorMetrics* metrics) {
return RunOnReactorThread([&]() { return this->thread_.GetMetrics(metrics); });
}
Status Reactor::RunOnReactorThread(std::function<Status()> f) {
RunFunctionTask task(std::move(f));
ScheduleReactorTask(&task);
return task.Wait();
}
Status Reactor::DumpConnections(const DumpConnectionsRequestPB& req,
DumpConnectionsResponsePB* resp) {
return RunOnReactorThread([&]() { return this->thread_.DumpConnections(req, resp); });
}
class RegisterConnectionTask : public ReactorTask {
public:
explicit RegisterConnectionTask(scoped_refptr<Connection> conn)
: conn_(std::move(conn)) {
}
void Run(ReactorThread* reactor) override {
reactor->RegisterConnection(std::move(conn_));
delete this;
}
void Abort(const Status& /*status*/) override {
// We don't need to Shutdown the connection since it was never registered.
// This is only used for inbound connections, and inbound connections will
// never have any calls added to them until they've been registered.
delete this;
}
private:
const scoped_refptr<Connection> conn_;
};
void Reactor::RegisterInboundSocket(Socket* socket, const Sockaddr& remote) {
VLOG(3) << name_ << ": new inbound connection to " << remote.ToString();
unique_ptr<Socket> new_socket(new Socket(socket->Release()));
auto task = new RegisterConnectionTask(
new Connection(&thread_, remote, std::move(new_socket), Connection::SERVER));
ScheduleReactorTask(task);
}
// Task which runs in the reactor thread to assign an outbound call
// to a connection.
class AssignOutboundCallTask : public ReactorTask {
public:
explicit AssignOutboundCallTask(shared_ptr<OutboundCall> call)
: call_(std::move(call)) {}
void Run(ReactorThread* reactor) override {
reactor->AssignOutboundCall(std::move(call_));
delete this;
}
void Abort(const Status& status) override {
// It doesn't matter what the actual phase of the OutboundCall is: just set
// it to Phase::REMOTE_CALL to finalize the state of the call.
call_->SetFailed(status, OutboundCall::Phase::REMOTE_CALL);
delete this;
}
private:
const shared_ptr<OutboundCall> call_;
};
void Reactor::QueueOutboundCall(shared_ptr<OutboundCall> call) {
DVLOG(3) << name_ << ": queueing outbound call "
<< call->ToString() << " to remote " << call->conn_id().remote().ToString();
// Test cancellation while the call is still in the 'READY' state.
if (PREDICT_FALSE(call->ShouldInjectCancellation())) {
QueueCancellation(call);
}
ScheduleReactorTask(new AssignOutboundCallTask(std::move(call)));
}
class CancellationTask : public ReactorTask {
public:
explicit CancellationTask(shared_ptr<OutboundCall> call)
: call_(std::move(call)) {}
void Run(ReactorThread* reactor) override {
reactor->CancelOutboundCall(call_);
delete this;
}
void Abort(const Status& /*status*/) override {
delete this;
}
private:
const shared_ptr<OutboundCall> call_;
};
void Reactor::QueueCancellation(shared_ptr<OutboundCall> call) {
ScheduleReactorTask(new CancellationTask(std::move(call)));
}
void Reactor::ScheduleReactorTask(ReactorTask* task) {
bool was_empty;
{
std::unique_lock<LockType> l(lock_);
if (closing_) {
// We guarantee the reactor lock is not taken when calling Abort().
l.unlock();
task->Abort(ShutdownError(false));
return;
}
was_empty = pending_tasks_.empty();
pending_tasks_.push_back(*task);
}
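// Wake the reactor thread only on the empty->non-empty transition; if the
// queue was already non-empty, a wake-up is already pending.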
if (was_empty) {
thread_.WakeThread();
}
}
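// Moves all pending tasks into 'tasks'. Returns false, leaving 'tasks'
// untouched, if the reactor is shutting down.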
bool Reactor::DrainTaskQueue(boost::intrusive::list<ReactorTask>* tasks) { // NOLINT(*)
std::lock_guard<LockType> l(lock_);
if (closing_) {
return false;
}
tasks->swap(pending_tasks_);
return true;
}
} // namespace rpc
} // namespace kudu