// blob: b9cad6c8d2d4e41ef7e522aa5424e88ada838c69 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <iosfwd>
#include <map>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <boost/optional/optional.hpp>
#include <glog/logging.h>
#include "kudu/tablet/metadata.pb.h"
#include "kudu/tablet/tablet.pb.h" // IWYU pragma: keep
#include "kudu/util/status.h"
namespace kudu {
namespace tools {
// Forward declaration: within this header KsckResultsPB is only referenced
// through a pointer (see KsckResults::ToPb()), so its full definition is not
// needed here.
class KsckResultsPB;
// Outcome of a health check on a tablet.
// The same values describe the health of a table, because a table is exactly
// as healthy as its least healthy tablet.
enum class KsckCheckResult {
  // No problems were found with the tablet.
  HEALTHY,

  // Tablet copies are currently in progress for the tablet.
  RECOVERING,

  // The tablet has fewer replicas than its table's replication factor, and
  // no tablet copy is in progress.
  UNDER_REPLICATED,

  // A majority of the tablet's replicas is missing, so the tablet is
  // unavailable for writes. Manual intervention is required to recover it if
  // a majority cannot be brought back online.
  UNAVAILABLE,

  // The tablets' consensus configs and the master's consensus config did not
  // all agree.
  CONSENSUS_MISMATCH,
};
// Return a string representation of 'cr'.
const char* const KsckCheckResultToString(KsckCheckResult cr);
// The kinds of consensus config that a KsckConsensusState can describe.
enum class KsckConsensusConfigType {
  // The config as reported by the master.
  MASTER,

  // A config that has already been committed.
  COMMITTED,

  // A config that has not been committed yet.
  PENDING,
};
// Representation of a consensus state: the config type, the optional term,
// opid index and leader, and the sets of voter and non-voter UUIDs.
struct KsckConsensusState {
  // Creates an empty state of type MASTER: no term, no opid index, no leader
  // and no peers. MASTER is the only type that is allowed to lack a term, so
  // it is the only default that satisfies the invariant checked by the other
  // constructor.
  KsckConsensusState() = default;

  // Creates a consensus state from its parts.
  //
  // 'term', 'opid_index' and 'leader_uuid' may be unset. 'voters' and
  // 'non_voters' are copied into sets, so duplicates collapse and the
  // original ordering is not retained.
  //
  // CHECK-fails if 'type' is not MASTER and 'term' is unset.
  KsckConsensusState(KsckConsensusConfigType type,
                     boost::optional<int64_t> term,
                     boost::optional<int64_t> opid_index,
                     boost::optional<std::string> leader_uuid,
                     const std::vector<std::string>& voters,
                     const std::vector<std::string>& non_voters)
      : type(type),
        term(std::move(term)),
        opid_index(std::move(opid_index)),
        leader_uuid(std::move(leader_uuid)),
        voter_uuids(voters.cbegin(), voters.cend()),
        non_voter_uuids(non_voters.cbegin(), non_voters.cend()) {
    // A consensus state must have a term unless it's one sourced from the
    // master. The members are inspected (via 'this->') rather than the
    // same-named constructor parameters, because the 'term' parameter has
    // already been moved from at this point.
    CHECK(this->type == KsckConsensusConfigType::MASTER || this->term);
  }

  // Two KsckConsensusState structs match if they have the same
  // leader_uuid, the same set of peers, and one of the following holds:
  // - at least one of them is of type MASTER
  // - they are configs of the same type and they have the same term
  //
  // Note that 'opid_index' does not take part in the comparison.
  bool Matches(const KsckConsensusState& other) const {
    const bool same_leader_and_peers =
        leader_uuid == other.leader_uuid &&
        voter_uuids == other.voter_uuids &&
        non_voter_uuids == other.non_voter_uuids;
    if (type == KsckConsensusConfigType::MASTER ||
        other.type == KsckConsensusConfigType::MASTER) {
      // Type and term are ignored as soon as either side is of type MASTER.
      return same_leader_and_peers;
    }
    return type == other.type && term == other.term && same_leader_and_peers;
  }

  // Defaults to MASTER so that a default-constructed state never carries an
  // indeterminate type (Matches() reads it unconditionally).
  KsckConsensusConfigType type = KsckConsensusConfigType::MASTER;
  boost::optional<int64_t> term;
  boost::optional<int64_t> opid_index;
  boost::optional<std::string> leader_uuid;
  std::set<std::string> voter_uuids;
  std::set<std::string> non_voter_uuids;
};
// The health of a single server, as determined by ksck.
enum class KsckServerHealth {
  // No problem was found with the server.
  HEALTHY,

  // Attempts to communicate with the server were rejected as unauthorized.
  UNAUTHORIZED,

  // The server could not be contacted.
  UNAVAILABLE,

  // The UUID reported by the server was not the expected one.
  WRONG_SERVER_UUID,
};
// Return a string representation of the server health value 'sh'.
const char* const ServerHealthToString(KsckServerHealth sh);
// Returns an int signifying the "unhealthiness level" of 'sh'.
// 0 means healthy; higher values are unhealthier.
// Useful for sorting or comparing server health values.
// NOTE(review): the exact score assigned to each non-healthy value is decided
// by the implementation, which is not visible in this header.
int ServerHealthScore(KsckServerHealth sh);
// A summary of a server health check.
struct KsckServerHealthSummary {
// Identity of the server the summary is about.
std::string uuid;
std::string address;
// Optional, so it may be unset.
// NOTE(review): presumably the server's Kudu version, left unset when it
// could not be fetched -- confirm against the code that fills this in.
boost::optional<std::string> version;
// Result of the health check; HEALTHY unless overwritten.
KsckServerHealth health = KsckServerHealth::HEALTHY;
// Status associated with the health check; OK unless overwritten.
Status status = Status::OK();
};
// Per-table roll-up of tablet health: identifying information for the table
// plus one counter of tablets per KsckCheckResult state.
struct KsckTableSummary {
  std::string id;
  std::string name;
  int replication_factor = 0;

  // How many of the table's tablets are in each health state.
  int healthy_tablets = 0;
  int recovering_tablets = 0;
  int underreplicated_tablets = 0;
  int consensus_mismatch_tablets = 0;
  int unavailable_tablets = 0;

  // The number of tablets counted in this summary, across every state.
  int TotalTablets() const {
    const int total = healthy_tablets + recovering_tablets + underreplicated_tablets +
                      consensus_mismatch_tablets + unavailable_tablets;
    return total;
  }

  // The number of tablets counted in this summary that are in any state
  // other than healthy.
  int UnhealthyTablets() const {
    const int total = TotalTablets();
    return total - healthy_tablets;
  }

  // Condense the per-state counters into a single KsckCheckResult for the
  // table. The table takes on the state of its least healthy tablet.
  KsckCheckResult TableStatus() const {
    // The counters are probed from most to least severe; the first non-zero
    // one decides the result.
    KsckCheckResult worst = KsckCheckResult::HEALTHY;
    if (unavailable_tablets > 0) {
      worst = KsckCheckResult::UNAVAILABLE;
    } else if (consensus_mismatch_tablets > 0) {
      worst = KsckCheckResult::CONSENSUS_MISMATCH;
    } else if (underreplicated_tablets > 0) {
      worst = KsckCheckResult::UNDER_REPLICATED;
    } else if (recovering_tablets > 0) {
      worst = KsckCheckResult::RECOVERING;
    }
    return worst;
  }
};
// The kinds of Kudu server that ksck distinguishes between.
enum class KsckServerType {
  MASTER,
  TABLET_SERVER,
};
// Return a string representation of the server type 'type'.
const char* const ServerTypeToString(KsckServerType type);
// A summary of the state of a tablet replica.
// Every flag defaults to false, 'state' defaults to tablet::UNKNOWN and the
// optional members start out unset.
struct KsckReplicaSummary {
// NOTE(review): the 'ts_' prefix presumably stands for the tablet server
// hosting the replica -- confirm against the code that fills this in.
std::string ts_uuid;
boost::optional<std::string> ts_address;
bool ts_healthy = false;
bool is_leader = false;
bool is_voter = false;
tablet::TabletStatePB state = tablet::UNKNOWN;
// Optional details; unset unless explicitly provided.
boost::optional<tablet::TabletStatusPB> status_pb;
boost::optional<KsckConsensusState> consensus_state;
};
// A summary of the state of a tablet.
struct KsckTabletSummary {
std::string id;
std::string table_id;
std::string table_name;
// Health check result for the tablet.
// NOTE(review): this member has no default initializer, so its value is
// indeterminate in a default-initialized summary until it is assigned.
KsckCheckResult result;
// Note that this is a plain string, not a Status object.
std::string status;
// Default-constructed unless assigned.
// NOTE(review): presumably the tablet's consensus state as reported by the
// master -- confirm against the code that fills this in.
KsckConsensusState master_cstate;
// One summary per replica of the tablet.
std::vector<KsckReplicaSummary> replicas;
};
// The result of a checksum on a tablet replica.
struct KsckReplicaChecksum {
// NOTE(review): the 'ts_' prefix presumably refers to the tablet server
// hosting the replica -- confirm.
std::string ts_address;
std::string ts_uuid;
// Status associated with the checksum of this replica.
Status status;
// The checksum value; 0 unless overwritten.
uint64_t checksum = 0;
};
// The result of a tablet checksum scan.
struct KsckTabletChecksum {
// False unless overwritten.
// NOTE(review): presumably set when the replicas' checksums disagree --
// confirm against the code that fills this in.
bool mismatch = false;
std::string tablet_id;
// Per-replica checksum results, keyed by a string.
// NOTE(review): the key is presumably the replica's tablet server UUID --
// confirm.
std::map<std::string, KsckReplicaChecksum> replica_checksums;
};
// The results of a checksum operation on a whole table: a map from a string
// key to the checksum result of one tablet.
// NOTE(review): the key is presumably the tablet id -- confirm.
typedef std::map<std::string, KsckTabletChecksum> KsckTableChecksum;
// The checksum results of several tables: a map from a string key to the
// checksum results of one table.
// NOTE(review): the key is presumably the table name -- confirm.
typedef std::map<std::string, KsckTableChecksum> KsckTableChecksumMap;
// The collected results of a checksum scan: the per-table checksum results
// together with an optional snapshot timestamp.
struct KsckChecksumResults {
// Optional, so it may be unset.
// NOTE(review): presumably the timestamp of the snapshot the checksum scan
// ran at -- confirm.
boost::optional<uint64_t> snapshot_timestamp;
KsckTableChecksumMap tables;
};
// The output formats in which results can be printed.
enum class PrintMode {
  // Pretty-printed JSON.
  JSON_PRETTY,

  // Compact JSON. The content is the same as for JSON_PRETTY; only the
  // formatting differs.
  JSON_COMPACT,

  // Plain text that focuses on errors and leaves out most information about
  // healthy tablets.
  PLAIN_CONCISE,

  // Plain text.
  PLAIN_FULL,
};
// Map from a string key to a consensus state.
// NOTE(review): the key is presumably the UUID of the server that reported
// the consensus state -- confirm.
typedef std::map<std::string, KsckConsensusState> KsckConsensusStateMap;
// A flag and its value, as a (flag name, flag value) pair.
typedef std::pair<std::string, std::string> KsckFlag;
// Map (flag name, flag value) -> server uuids with --flag=value.
typedef std::map<KsckFlag, std::vector<std::string>> KsckFlagToServersMap;
// Convenience map flag name -> flag tags.
// Note that the tags of a flag are stored as a single string.
typedef std::unordered_map<std::string, std::string> KsckFlagTagsMap;
// Container for all the results of a series of ksck checks.
struct KsckResults {
// Collection of error status for failed checks. Used to print out a final
// summary of all failed checks.
// All checks passed if and only if this vector is empty.
std::vector<Status> error_messages;
// Collection of warnings from checks.
// These warnings are not considered to indicate an unhealthy cluster,
// so they do not cause ksck to report an error.
std::vector<Status> warning_messages;
// Health summaries for master and tablet servers.
std::vector<KsckServerHealthSummary> master_summaries;
std::vector<KsckServerHealthSummary> tserver_summaries;
// Information about the flags of masters and tablet servers.
KsckFlagToServersMap master_flag_to_servers_map;
KsckFlagTagsMap master_flag_tags_map;
KsckFlagToServersMap tserver_flag_to_servers_map;
KsckFlagTagsMap tserver_flag_tags_map;
// Information about the master consensus configuration.
std::vector<std::string> master_uuids;
// False unless overwritten.
bool master_consensus_conflict = false;
KsckConsensusStateMap master_consensus_state_map;
// Detailed information about each table and tablet.
// Tablet information includes consensus state.
std::vector<KsckTabletSummary> tablet_summaries;
std::vector<KsckTableSummary> table_summaries;
// Collected results of the checksum scan.
KsckChecksumResults checksum_results;
// Print this KsckResults to 'out', according to the PrintMode 'mode'.
// Unlike PrintJsonTo() and ToPb(), this method is not declared const.
Status PrintTo(PrintMode mode, std::ostream& out);
// Print this KsckResults to 'out' in JSON format.
// 'mode' must be PrintMode::JSON_PRETTY or PrintMode::JSON_COMPACT.
Status PrintJsonTo(PrintMode mode, std::ostream& out) const;
// NOTE(review): presumably fills the message pointed to by 'pb' with these
// results; the implementation is not visible in this header -- confirm.
void ToPb(KsckResultsPB* pb) const;
};
// Print a formatted health summary to 'out', given a list 'summaries'
// describing the health of servers of type 'type'.
// Returns a Status.
Status PrintServerHealthSummaries(KsckServerType type,
const std::vector<KsckServerHealthSummary>& summaries,
std::ostream& out);
// Print a formatted summary of the flags in 'flag_to_servers_map', indicating
// which servers have which (flag, value) pairs set.
// Flag tag information is sourced from 'flag_tags_map'.
// The servers are of type 'type' and the summary is written to 'out'.
// NOTE(review): 'num_servers' is presumably the total number of servers of
// that type -- confirm against the implementation.
Status PrintFlagTable(KsckServerType type,
int num_servers,
const KsckFlagToServersMap& flag_to_servers_map,
const KsckFlagTagsMap& flag_tags_map,
std::ostream& out);
// Print to 'out' a summary of the Kudu versions running across all servers
// from which information could be fetched, given the health summaries of the
// masters in 'masters' and of the tablet servers in 'tservers'. Servers are
// grouped by version to make the table compact.
Status PrintVersionTable(const std::vector<KsckServerHealthSummary>& masters,
const std::vector<KsckServerHealthSummary>& tservers,
std::ostream& out);
// Print a formatted summary of the tables in 'table_summaries' to 'out'.
// Returns a Status.
Status PrintTableSummaries(const std::vector<KsckTableSummary>& table_summaries,
std::ostream& out);
// Print a formatted summary of the tablets in 'tablet_summaries' to 'out'.
// NOTE(review): 'mode' presumably selects how much detail is printed (see
// PrintMode) -- confirm against the implementation.
Status PrintTabletSummaries(const std::vector<KsckTabletSummary>& tablet_summaries,
PrintMode mode,
std::ostream& out);
// Print to 'out' a "consensus matrix" that compares the consensus states of the
// replicas on servers with ids in 'server_uuids', given the set of consensus
// states in 'consensus_states'. If given, 'ref_cstate' will be used as the
// master's point of view of the consensus state of the tablet.
// Note that 'ref_cstate' is taken by value, so the optional is copied on
// every call.
Status PrintConsensusMatrix(const std::vector<std::string>& server_uuids,
const boost::optional<KsckConsensusState> ref_cstate,
const KsckConsensusStateMap& consensus_states,
std::ostream& out);
// Print the checksum results in 'checksum_results' to 'out'.
// Returns a Status.
Status PrintChecksumResults(const KsckChecksumResults& checksum_results,
std::ostream& out);
// Print total counts derived from 'results' to 'out'.
// NOTE(review): which entities are counted is decided by the implementation,
// which is not visible in this header.
Status PrintTotalCounts(const KsckResults& results, std::ostream& out);
} // namespace tools
} // namespace kudu