ksck: print tablet server states

This adds a table of tserver states to the ksck output:

...
Tablet Server States
              Server              |      State
----------------------------------+------------------
 767c081ea5564c9a8ed8615c00658e1b | MAINTENANCE_MODE

Tablet Server Summary
...

When there are no special tablet server states, the section doesn't get
printed.

I wanted to reuse some of the client code to list tablet servers but didn't
want to change the public interface, so this patch also rejiggers the
ListTabletServers client logic into the client data class.

Change-Id: I4cec8c85a08bcb4f7be9e98a34d4b5ae2f05f3be
Reviewed-on: http://gerrit.cloudera.org:8080/14436
Reviewed-by: Adar Dembo <adar@cloudera.com>
Tested-by: Kudu Jenkins
diff --git a/src/kudu/client/client-internal.cc b/src/kudu/client/client-internal.cc
index befa03c..75c7200 100644
--- a/src/kudu/client/client-internal.cc
+++ b/src/kudu/client/client-internal.cc
@@ -99,6 +99,8 @@
 using master::IsAlterTableDoneResponsePB;
 using master::IsCreateTableDoneRequestPB;
 using master::IsCreateTableDoneResponsePB;
+using master::ListTabletServersResponsePB;
+using master::ListTabletServersRequestPB;
 using master::MasterFeatures;
 using master::MasterServiceProxy;
 using master::TableIdentifierPB;
@@ -446,6 +448,23 @@
   return false;
 }
 
+Status KuduClient::Data::ListTabletServers(KuduClient* client,
+                                           const MonoTime& deadline,
+                                           const ListTabletServersRequestPB& req,
+                                           ListTabletServersResponsePB* resp) const {
+  Synchronizer sync;
+  AsyncLeaderMasterRpc<ListTabletServersRequestPB, ListTabletServersResponsePB> rpc(
+      deadline, client, BackoffType::EXPONENTIAL, req, resp,
+      &MasterServiceProxy::ListTabletServersAsync, "ListTabletServers",
+      sync.AsStatusCallback(), {});
+  rpc.SendRpc();
+  RETURN_NOT_OK(sync.Wait());
+  if (resp->has_error()) {
+    return StatusFromPB(resp->error().status());
+  }
+  return Status::OK();
+}
+
 void KuduClient::Data::StoreAuthzToken(const string& table_id,
                                        const SignedTokenPB& token) {
   authz_token_cache_.Put(table_id, token);
diff --git a/src/kudu/client/client-internal.h b/src/kudu/client/client-internal.h
index be9d0ed..4d5a533 100644
--- a/src/kudu/client/client-internal.h
+++ b/src/kudu/client/client-internal.h
@@ -64,6 +64,8 @@
 class CreateTableRequestPB;
 class CreateTableResponsePB;
 class GetTableSchemaResponsePB;
+class ListTabletServersRequestPB;
+class ListTabletServersResponsePB;
 class MasterServiceProxy;
 class TableIdentifierPB;
 } // namespace master
@@ -169,6 +171,11 @@
 
   bool IsTabletServerLocal(const internal::RemoteTabletServer& rts) const;
 
+  Status ListTabletServers(KuduClient* client,
+                           const MonoTime& deadline,
+                           const master::ListTabletServersRequestPB& req,
+                           master::ListTabletServersResponsePB* resp) const;
+
   // Returns a non-failed replica of the specified tablet based on the provided
   // selection criteria and tablet server blacklist.
   //
diff --git a/src/kudu/client/client.cc b/src/kudu/client/client.cc
index 28e67f9..93930d2 100644
--- a/src/kudu/client/client.cc
+++ b/src/kudu/client/client.cc
@@ -449,18 +449,8 @@
 Status KuduClient::ListTabletServers(vector<KuduTabletServer*>* tablet_servers) {
   ListTabletServersRequestPB req;
   ListTabletServersResponsePB resp;
-
   MonoTime deadline = MonoTime::Now() + default_admin_operation_timeout();
-  Synchronizer sync;
-  AsyncLeaderMasterRpc<ListTabletServersRequestPB, ListTabletServersResponsePB> rpc(
-      deadline, this, BackoffType::EXPONENTIAL, req, &resp,
-      &MasterServiceProxy::ListTabletServersAsync, "ListTabletServers",
-      sync.AsStatusCallback(), {});
-  rpc.SendRpc();
-  RETURN_NOT_OK(sync.Wait());
-  if (resp.has_error()) {
-    return StatusFromPB(resp.error().status());
-  }
+  RETURN_NOT_OK(data_->ListTabletServers(this, deadline, req, &resp));
   for (int i = 0; i < resp.servers_size(); i++) {
     const ListTabletServersResponsePB_Entry& e = resp.servers(i);
     HostPort hp;
diff --git a/src/kudu/client/client.h b/src/kudu/client/client.h
index 2412f8b..f4dc618 100644
--- a/src/kudu/client/client.h
+++ b/src/kudu/client/client.h
@@ -65,6 +65,7 @@
 
 namespace tools {
 class LeaderMasterProxy;
+class RemoteKsckCluster;
 } // namespace tools
 
 namespace client {
@@ -612,6 +613,7 @@
   friend class kudu::AuthzTokenTest;
   friend class kudu::SecurityUnknownTskTest;
   friend class tools::LeaderMasterProxy;
+  friend class tools::RemoteKsckCluster;
 
   FRIEND_TEST(kudu::ClientStressTest, TestUniqueClientIds);
   FRIEND_TEST(ClientTest, TestCacheAuthzTokens);
diff --git a/src/kudu/tools/ksck.cc b/src/kudu/tools/ksck.cc
index 6c39b7f..e8b61bc 100644
--- a/src/kudu/tools/ksck.cc
+++ b/src/kudu/tools/ksck.cc
@@ -420,6 +420,9 @@
     if (section_upper == "MASTER_SUMMARIES") {
       print_sections_flags_ |= PrintSections::MASTER_SUMMARIES;
     }
+    if (section_upper == "TSERVER_STATES") {
+      print_sections_flags_ |= PrintSections::TSERVER_STATES;
+    }
     if (section_upper == "TSERVER_SUMMARIES") {
       print_sections_flags_ |= PrintSections::TSERVER_SUMMARIES;
     }
@@ -461,6 +464,10 @@
   PUSH_PREPEND_NOT_OK(s, results_.error_messages, fetch_prefix);
   RETURN_NOT_OK_PREPEND(s, fetch_prefix);
 
+  // In getting table and tablet info, we should have also received info about
+  // the tablet servers, including any special states they might be in.
+  results_.ts_states = cluster_->ts_states();
+
   PUSH_PREPEND_NOT_OK(FetchInfoFromTabletServers(), results_.error_messages,
                       "error fetching info from tablet servers");
   PUSH_PREPEND_NOT_OK(CheckTabletServerUnusualFlags(), results_.warning_messages,
diff --git a/src/kudu/tools/ksck.h b/src/kudu/tools/ksck.h
index 71aa2b7..b631d1a 100644
--- a/src/kudu/tools/ksck.h
+++ b/src/kudu/tools/ksck.h
@@ -437,6 +437,10 @@
     return tablet_servers_;
   }
 
+  const KsckTServerStateMap& ts_states() const {
+    return ts_states_;
+  }
+
   const std::vector<std::shared_ptr<KsckTable>>& tables() const {
     return tables_;
   }
@@ -478,6 +482,7 @@
   KsckCluster() : filtered_tables_count_(0), filtered_tablets_count_(0) {}
   MasterList masters_;
   TSMap tablet_servers_;
+  KsckTServerStateMap ts_states_;
   std::vector<std::shared_ptr<KsckTable>> tables_;
   std::unique_ptr<ThreadPool> pool_;
 
diff --git a/src/kudu/tools/ksck_remote.cc b/src/kudu/tools/ksck_remote.cc
index b5b2912..d9a4c46 100644
--- a/src/kudu/tools/ksck_remote.cc
+++ b/src/kudu/tools/ksck_remote.cc
@@ -31,6 +31,7 @@
 #include <gflags/gflags_declare.h>
 #include <glog/logging.h>
 
+#include "kudu/client/client-internal.h"
 #include "kudu/client/client.h"
 #include "kudu/client/replica_controller-internal.h"
 #include "kudu/client/schema.h"
@@ -47,6 +48,7 @@
 #include "kudu/gutil/stl_util.h"
 #include "kudu/gutil/strings/substitute.h"
 #include "kudu/master/master.h"
+#include "kudu/master/master.pb.h"
 #include "kudu/master/sys_catalog.h"
 #include "kudu/rebalance/cluster_status.h"
 #include "kudu/rpc/messenger.h"
@@ -57,6 +59,7 @@
 #include "kudu/tablet/tablet.pb.h"
 #include "kudu/tools/ksck.h"
 #include "kudu/tools/ksck_checksum.h"
+#include "kudu/tools/ksck_results.h"
 #include "kudu/tools/tool_action_common.h"
 #include "kudu/tserver/tablet_server.h"
 #include "kudu/tserver/tserver.pb.h"
@@ -79,9 +82,11 @@
 using kudu::client::KuduScanTokenBuilder;
 using kudu::client::KuduSchema;
 using kudu::client::KuduTable;
-using kudu::client::KuduTabletServer;
 using kudu::client::internal::ReplicaController;
 using kudu::cluster_summary::ServerHealth;
+using kudu::master::ListTabletServersRequestPB;
+using kudu::master::ListTabletServersResponsePB;
+using kudu::master::TServerStatePB;
 using kudu::rpc::Messenger;
 using kudu::rpc::MessengerBuilder;
 using kudu::rpc::RpcController;
@@ -494,21 +499,38 @@
 }
 
 Status RemoteKsckCluster::RetrieveTabletServers() {
-  vector<KuduTabletServer*> servers;
-  ElementDeleter deleter(&servers);
-  RETURN_NOT_OK(client_->ListTabletServers(&servers));
+  // Request that the tablet server states also be reported. This may include
+  // information about tablet servers that have not yet registered with the
+  // Masters.
+  ListTabletServersRequestPB req;
+  req.set_include_states(true);
+  ListTabletServersResponsePB resp;
+  RETURN_NOT_OK(client_->data_->ListTabletServers(
+      client_.get(), MonoTime::Now() + client_->default_admin_operation_timeout(), req, &resp));
 
   TSMap tablet_servers;
-  for (const auto* s : servers) {
-    auto ts = std::make_shared<RemoteKsckTabletServer>(
-        s->uuid(),
-        HostPort(s->hostname(), s->port()),
-        messenger_,
-        s->location());
+  KsckTServerStateMap ts_states;
+  for (int i = 0; i < resp.servers_size(); i++) {
+    const auto& ts_pb = resp.servers(i);
+    const auto& uuid = ts_pb.instance_id().permanent_uuid();
+    if (ts_pb.has_state() && ts_pb.state() != TServerStatePB::NONE) {
+      // We don't expect updates, but it's straightforward to handle, so let's
+      // update instead of die just in case.
+      EmplaceOrUpdate(&ts_states, uuid, ts_pb.state());
+    }
+    // If there's no registration for the tablet server, all we can really get
+    // is the state, so move on.
+    if (!ts_pb.has_registration()) {
+      continue;
+    }
+    HostPort hp;
+    RETURN_NOT_OK(HostPortFromPB(ts_pb.registration().rpc_addresses(0), &hp));
+    auto ts = std::make_shared<RemoteKsckTabletServer>(uuid, hp, messenger_, ts_pb.location());
     RETURN_NOT_OK(ts->Init());
-    InsertOrDie(&tablet_servers, ts->uuid(), ts);
+    EmplaceOrUpdate(&tablet_servers, uuid, std::move(ts));
   }
-  tablet_servers_.swap(tablet_servers);
+  tablet_servers_ = std::move(tablet_servers);
+  ts_states_ = std::move(ts_states);
   return Status::OK();
 }
 
diff --git a/src/kudu/tools/ksck_results.cc b/src/kudu/tools/ksck_results.cc
index 1ceea49..029b5f6 100644
--- a/src/kudu/tools/ksck_results.cc
+++ b/src/kudu/tools/ksck_results.cc
@@ -202,6 +202,13 @@
     }
   }
 
+  // Then, on any special tablet server states.
+  if (sections & PrintSections::TSERVER_STATES &&
+      !ts_states.empty()) {
+    RETURN_NOT_OK(PrintTServerStatesTable(ts_states, out));
+    out << endl;
+  }
+
   // Then, on the health of the tablet servers.
   if (sections & PrintSections::TSERVER_SUMMARIES) {
     std::sort(cluster_status.tserver_summaries.begin(),
@@ -407,6 +414,17 @@
   return flags_table.PrintTo(out);
 }
 
+Status PrintTServerStatesTable(const KsckTServerStateMap& ts_states, ostream& out) {
+  out << "Tablet Server States" << endl;
+  // We expect tserver states to be relatively uncommon, so print one per row.
+  DataTable table({"Server", "State"});
+  for (const auto& id_and_state : ts_states) {
+    table.AddRow({ id_and_state.first,
+                   TServerStatePB_Name(id_and_state.second) });
+  }
+  return table.PrintTo(out);
+}
+
 Status PrintVersionTable(const KsckVersionToServersMap& version_summaries,
                          int num_servers,
                          ostream& out) {
@@ -682,7 +700,7 @@
 }
 
 void ServerHealthSummaryToPb(const ServerHealthSummary& summary,
-                                 ServerHealthSummaryPB* pb) {
+                             ServerHealthSummaryPB* pb) {
   switch (summary.health) {
     case ServerHealth::HEALTHY:
       pb->set_health(ServerHealthSummaryPB_ServerHealth_HEALTHY);
diff --git a/src/kudu/tools/ksck_results.h b/src/kudu/tools/ksck_results.h
index 34675db..ec69b6b 100644
--- a/src/kudu/tools/ksck_results.h
+++ b/src/kudu/tools/ksck_results.h
@@ -26,6 +26,7 @@
 
 #include <boost/optional/optional.hpp>
 
+#include "kudu/master/master.pb.h"
 #include "kudu/rebalance/cluster_status.h"
 #include "kudu/tablet/tablet.pb.h"  // IWYU pragma: keep
 #include "kudu/util/status.h"
@@ -79,15 +80,16 @@
   enum Values {
     NONE = 0,
     MASTER_SUMMARIES = 1 << 0,
-    TSERVER_SUMMARIES = 1 << 1,
-    VERSION_SUMMARIES = 1 << 2,
-    TABLET_SUMMARIES = 1 << 3,
-    TABLE_SUMMARIES = 1 << 4,
-    CHECKSUM_RESULTS = 1 << 5,
-    TOTAL_COUNT = 1 << 6,
+    TSERVER_STATES = 1 << 1,
+    TSERVER_SUMMARIES = 1 << 2,
+    VERSION_SUMMARIES = 1 << 3,
+    TABLET_SUMMARIES = 1 << 4,
+    TABLE_SUMMARIES = 1 << 5,
+    CHECKSUM_RESULTS = 1 << 6,
+    TOTAL_COUNT = 1 << 7,
 
     // Print all sections above.
-    ALL_SECTIONS = 0b01111111
+    ALL_SECTIONS = 0b011111111
   };
 };
 
@@ -103,6 +105,8 @@
 // Convenience map version -> servers.
 typedef std::map<std::string, std::vector<std::string>> KsckVersionToServersMap;
 
+typedef std::map<std::string, master::TServerStatePB> KsckTServerStateMap;
+
 // Container for all the results of a series of ksck checks.
 struct KsckResults {
 
@@ -127,6 +131,9 @@
   KsckFlagToServersMap tserver_flag_to_servers_map;
   KsckFlagTagsMap tserver_flag_tags_map;
 
+  // Any special states that the tablet servers may be in.
+  KsckTServerStateMap ts_states;
+
   // Collected results of the checksum scan.
   KsckChecksumResults checksum_results;
 
@@ -158,6 +165,9 @@
                       const KsckFlagTagsMap& flag_tags_map,
                       std::ostream& out);
 
+Status PrintTServerStatesTable(const KsckTServerStateMap& ts_states,
+                               std::ostream& out);
+
 // Print a summary of the Kudu versions running across all servers from which
 // information could be fetched. Servers are grouped by version to make the
 // table compact.
diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index a5066dc..d7db521 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -2935,6 +2935,18 @@
         Substitute("tserver list $0 --columns=uuid,state --format=csv", master_addr), &out));
   ASSERT_STR_CONTAINS(out, Substitute("$0,$1", ts_uuid, "MAINTENANCE_MODE"));
 
+  // Ksck should show a table showing the state.
+  {
+    string out;
+    NO_FATALS(RunActionStdoutString(
+        Substitute("cluster ksck $0", master_addr), &out));
+    ASSERT_STR_CONTAINS(out, Substitute(
+         "Tablet Server States\n"
+         "              Server              |      State\n"
+         "----------------------------------+------------------\n"
+         " $0 | MAINTENANCE_MODE", ts_uuid));
+  }
+
   // We should still see the state if the tserver hasn't been registered.
   // "Unregister" the server by restarting the cluster and only bringing up the
   // master, and verify that we still see the tserver in maintenance mode.
@@ -2944,6 +2956,16 @@
   NO_FATALS(RunActionStdoutString(
         Substitute("tserver list $0 --columns=uuid,state --format=csv", master_addr), &out));
   ASSERT_STR_CONTAINS(out, Substitute("$0,$1", ts_uuid, "MAINTENANCE_MODE"));
+  {
+    string out;
+    NO_FATALS(RunActionStdoutString(
+        Substitute("cluster ksck $0", master_addr), &out));
+    ASSERT_STR_CONTAINS(out, Substitute(
+         "Tablet Server States\n"
+         "              Server              |      State\n"
+         "----------------------------------+------------------\n"
+         " $0 | MAINTENANCE_MODE", ts_uuid));
+  }
 
   NO_FATALS(RunActionStdoutNone(Substitute("tserver state exit_maintenance $0 $1",
                                            master_addr, ts_uuid)));
@@ -2962,6 +2984,10 @@
           Substitute("tserver list $0 --columns=uuid,state --format=csv", master_addr), &out));
     ASSERT_STR_CONTAINS(out, Substitute("$0,$1", ts_uuid, "NONE"));
   });
+  // Ksck should also report that there aren't special tablet server states.
+  NO_FATALS(RunActionStdoutString(
+      Substitute("cluster ksck $0", master_addr), &out));
+  ASSERT_STR_NOT_CONTAINS(out, "Tablet Server States");
 }
 
 TEST_F(ToolTest, TestMasterList) {