| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include <algorithm> |
| #include <functional> |
| #include <map> |
| #include <memory> |
| #include <ostream> |
| #include <set> |
| #include <string> |
| #include <unordered_map> |
| #include <utility> |
| #include <vector> |
| |
| #include <gflags/gflags_declare.h> |
| #include <glog/logging.h> |
| #include <gtest/gtest.h> |
| |
| #include "kudu/common/row_operations.pb.h" |
| #include "kudu/common/schema.h" |
| #include "kudu/common/wire_protocol-test-util.h" |
| #include "kudu/common/wire_protocol.h" |
| #include "kudu/consensus/consensus.pb.h" |
| #include "kudu/consensus/metadata.pb.h" |
| #include "kudu/gutil/strings/substitute.h" |
| #include "kudu/integration-tests/cluster_itest_util.h" |
| #include "kudu/integration-tests/cluster_verifier.h" |
| #include "kudu/integration-tests/external_mini_cluster-itest-base.h" |
| #include "kudu/integration-tests/mini_cluster_fs_inspector.h" |
| #include "kudu/integration-tests/test_workload.h" |
| #include "kudu/mini-cluster/external_mini_cluster.h" |
| #include "kudu/rpc/rpc_controller.h" |
| #include "kudu/tablet/metadata.pb.h" |
| #include "kudu/tablet/tablet.pb.h" |
| #include "kudu/tserver/tserver.pb.h" |
| #include "kudu/tserver/tserver_service.proxy.h" |
| #include "kudu/util/countdown_latch.h" |
| #include "kudu/util/monotime.h" |
| #include "kudu/util/status.h" |
| #include "kudu/util/test_macros.h" |
| #include "kudu/util/test_util.h" |
| |
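| // When true, replicas are managed using the 3-4-3 scheme (add a |
| // replacement replica before evicting a failed one); when false, the |
| // legacy 3-2-3 scheme (evict first, then add a replacement) is used. |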
| DECLARE_bool(raft_prepare_replacement_before_eviction); |
| |
| using kudu::consensus::EXCLUDE_HEALTH_REPORT; |
| using kudu::consensus::RaftPeerPB; |
| using kudu::itest::AddServer; |
| using kudu::itest::RemoveServer; |
| using kudu::itest::StartElection; |
| using kudu::itest::TServerDetails; |
| using kudu::itest::WaitForNumTabletsOnTS; |
| using kudu::itest::WaitForServersToAgree; |
| using kudu::itest::WaitUntilCommittedOpIdIndexIs; |
| using kudu::itest::WaitUntilTabletRunning; |
| using kudu::tablet::TABLET_DATA_READY; |
| using kudu::tablet::TABLET_DATA_TOMBSTONED; |
| using kudu::tserver::ListTabletsResponsePB; |
| using std::map; |
| using std::set; |
| using std::string; |
| using std::unordered_map; |
| using std::vector; |
| using strings::Substitute; |
| |
| namespace kudu { |
| |
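| // How a node is made unresponsive in the scenarios below: NODE_DOWN shuts |
| // the tablet server process down entirely, while NODE_STOPPED pauses the |
| // process, leaving it alive but unresponsive. |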
| enum InstabilityType { |
| NODE_DOWN, |
| NODE_STOPPED |
| }; |
| |
| class TabletReplacementITest : public ExternalMiniClusterITestBase { |
| protected: |
| // Maps a tablet identifier (UUID) to the set of tablet server UUIDs that |
| // host replicas of the tablet. |
| typedef map<string, vector<string>> TabletToReplicaUUIDs; |
| |
| Status GetTabletToReplicaUUIDsMapping(const MonoDelta& timeout, |
| TabletToReplicaUUIDs* mappings) const; |
| |
| // Depending on the replica management mode the test is running in, not |
| // all elements of ts_map_ are relevant, so construct a ts_map containing |
| // only the tablet servers that host replicas of the tablet. |
| void GetTsMapForReplicas(const vector<string>& replica_uuids, |
| unordered_map<string, TServerDetails*>* ts_map) const; |
| |
| void TestDontEvictIfRemainingConfigIsUnstable(InstabilityType type, |
| bool is_3_4_3_mode); |
| }; |
| |
| Status TabletReplacementITest::GetTabletToReplicaUUIDsMapping( |
| const MonoDelta& timeout, |
| TabletToReplicaUUIDs* mappings) const { |
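| // Collect replica UUIDs into sets first so each tablet's replica list |
| // comes out de-duplicated and sorted. |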
| map<string, set<string>> tablet_to_replicas; |
| for (const auto& e : ts_map_) { |
| vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
| RETURN_NOT_OK(itest::ListTablets(e.second, timeout, &tablets)); |
| for (const auto& tablet : tablets) { |
| const auto& tablet_id = tablet.tablet_status().tablet_id(); |
| tablet_to_replicas[tablet_id].insert(e.first); |
| } |
| } |
| TabletToReplicaUUIDs ret; |
| for (const auto& e : tablet_to_replicas) { |
| ret.emplace(e.first, vector<string>(e.second.begin(), e.second.end())); |
| } |
| mappings->swap(ret); |
| return Status::OK(); |
| } |
| |
| void TabletReplacementITest::GetTsMapForReplicas( |
| const vector<string>& replica_uuids, |
| unordered_map<string, TServerDetails*>* ts_map) const { |
| decltype(ts_map_) ret; |
| for (const auto& uuid : replica_uuids) { |
| const auto it = ts_map_.find(uuid); |
| ASSERT_NE(ts_map_.end(), it); |
| ret[uuid] = it->second; |
| } |
| ts_map->swap(ret); |
| } |
| |
| void TabletReplacementITest::TestDontEvictIfRemainingConfigIsUnstable( |
| InstabilityType type, bool is_3_4_3_mode) { |
| SKIP_IF_SLOW_NOT_ALLOWED(); |
| |
| // The configuration is tuned to minimize the chance of the leader |
| // reporting the failed tablet replicas one-by-one. In this scenario, |
| // 2 replicas out of 3 become unresponsive, and the test assumes the |
| // decision on whether to replace the failed replicas is made with |
| // knowledge of both failures. |
| const MonoDelta kTimeout = MonoDelta::FromSeconds(60); |
| constexpr auto kUnavailableSec = 3; |
| constexpr auto kTsToMasterHbIntervalSec = 2 * kUnavailableSec; |
| constexpr auto kConsensusRpcTimeoutSec = 2; |
| constexpr auto kNumReplicas = 3; |
| const vector<string> ts_flags = { |
| Substitute("--raft_prepare_replacement_before_eviction=$0", is_3_4_3_mode), |
| |
| Substitute("--follower_unavailable_considered_failed_sec=$0", kUnavailableSec), |
| Substitute("--consensus_rpc_timeout_ms=$0", kConsensusRpcTimeoutSec * 1000), |
| Substitute("--heartbeat_interval_ms=$0", kTsToMasterHbIntervalSec * 1000), |
| // 'update_tablet_stats_interval_ms' should be larger than 'heartbeat_interval_ms'. |
| Substitute("--update_tablet_stats_interval_ms=$0", (kTsToMasterHbIntervalSec + 1) * 1000), |
| "--raft_heartbeat_interval_ms=50", |
| "--enable_leader_failure_detection=false", |
| }; |
| const vector<string> master_flags = { |
| Substitute("--raft_prepare_replacement_before_eviction=$0", is_3_4_3_mode), |
| |
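| // Since leader failure detection is disabled on the tablet servers, |
| // don't make the master wait for new tablets to elect a leader on |
| // their own. |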
| "--catalog_manager_wait_for_new_tablets_to_elect_leader=false", |
| }; |
| // Additional tablet server is needed when running in 3-4-3 replica management |
| // scheme to allow for eviction of failed tablet replicas. |
| const auto kNumTservers = is_3_4_3_mode ? kNumReplicas + 1 : kNumReplicas; |
| NO_FATALS(StartCluster(ts_flags, master_flags, kNumTservers)); |
| |
| TestWorkload workload(cluster_.get()); |
| workload.set_num_replicas(kNumReplicas); |
| workload.Setup(); // Easy way to create a new tablet. |
| |
| TabletToReplicaUUIDs tablet_to_replicas; |
| ASSERT_EVENTUALLY([&] { |
| ASSERT_OK(GetTabletToReplicaUUIDsMapping(kTimeout, &tablet_to_replicas)); |
| // There should be only one tablet. |
| ASSERT_EQ(1, tablet_to_replicas.size()); |
| // It takes some time to bootstrap all replicas across all tablet servers. |
| ASSERT_EQ(kNumReplicas, tablet_to_replicas.cbegin()->second.size()); |
| }); |
| const string tablet_id = tablet_to_replicas.cbegin()->first; |
| const auto& replica_uuids = tablet_to_replicas.cbegin()->second; |
| |
| // Wait until all replicas are up and running. |
| for (const auto& uuid : replica_uuids) { |
| ASSERT_OK(WaitUntilTabletRunning(ts_map_[uuid], tablet_id, kTimeout)); |
| } |
| |
| // Elect a leader. |
| const auto& kLeaderId = replica_uuids[0]; |
| TServerDetails* leader_ts = ts_map_[kLeaderId]; |
| ASSERT_OK(StartElection(leader_ts, tablet_id, kTimeout)); |
| { |
| decltype(ts_map_) ts_map; |
| NO_FATALS(GetTsMapForReplicas(replica_uuids, &ts_map)); |
| // Wait for NO_OP. |
| ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map, tablet_id, 1)); |
| } |
| |
| consensus::ConsensusStatePB cstate_initial; |
| ASSERT_OK(GetConsensusState(leader_ts, tablet_id, kTimeout, EXCLUDE_HEALTH_REPORT, |
| &cstate_initial)); |
| |
| const auto& kFollower1Id = replica_uuids[1]; |
| const auto& kFollower2Id = replica_uuids[2]; |
| |
| // Make both followers unresponsive (shut down or paused, depending on the |
| // scenario) and wait long enough for the leader to consider them failed. |
| // This should not trigger a config change to evict either one. |
| switch (type) { |
| case NODE_DOWN: |
| cluster_->tablet_server_by_uuid(kFollower1Id)->Shutdown(); |
| cluster_->tablet_server_by_uuid(kFollower2Id)->Shutdown(); |
| break; |
| case NODE_STOPPED: |
| ASSERT_OK(cluster_->tablet_server_by_uuid(kFollower1Id)->Pause()); |
| ASSERT_OK(cluster_->tablet_server_by_uuid(kFollower2Id)->Pause()); |
| break; |
| } |
| |
| // Sleep to make sure the leader replica recognized the stopped/shutdown |
| // followers as unresponsive according to |
| // --follower_unavailable_considered_failed_sec. Since unreachable peers |
| // are not considered viable per PeerMessageQueue::SafeToEvictUnlocked(), |
| // which makes that calculation based on --consensus_rpc_timeout_ms, we also |
| // wait until that timeout expires to proceed. This ensures that later, when |
| // we resume a follower, the leader does not consider itself unreachable, |
| // which was a bug (KUDU-2230) that this test also serves as a regression |
| // test for. |
| const auto min_sleep_required_sec = std::max( |
| {kUnavailableSec, kConsensusRpcTimeoutSec, kTsToMasterHbIntervalSec}); |
| SleepFor(MonoDelta::FromSeconds(2 * min_sleep_required_sec)); |
| |
| { |
| consensus::ConsensusStatePB cstate; |
| ASSERT_OK(GetConsensusState(leader_ts, tablet_id, kTimeout, EXCLUDE_HEALTH_REPORT, &cstate)); |
| SCOPED_TRACE(cstate.DebugString()); |
| // It's possible the leader registered only one replica as failed when |
| // sending its report to the master, so the master may have requested a |
| // config change to add a non-voter. Regardless, there should be no new |
| // committed config since a majority is down. |
| ASSERT_EQ(cstate_initial.committed_config().opid_index(), |
| cstate.committed_config().opid_index()) |
| << "Leader should not have issued any config change"; |
| } |
| |
| switch (type) { |
| case NODE_DOWN: |
| ASSERT_OK(cluster_->tablet_server_by_uuid(kFollower1Id)->Restart()); |
| break; |
| case NODE_STOPPED: |
| ASSERT_OK(cluster_->tablet_server_by_uuid(kFollower1Id)->Resume()); |
| break; |
| } |
| |
| // At this point a majority of voters is back online, so the leader should |
| // evict the failed replica, resulting in a Raft configuration update. |
| ASSERT_EVENTUALLY([&] { |
| consensus::ConsensusStatePB cstate; |
| ASSERT_OK(GetConsensusState(leader_ts, tablet_id, kTimeout, EXCLUDE_HEALTH_REPORT, &cstate)); |
| ASSERT_GT(cstate.committed_config().opid_index(), |
| cstate_initial.committed_config().opid_index() + |
| (is_3_4_3_mode ? 1 : 0)) |
| << "Leader should have issued config change to evict failed follower;" |
| << " the consensus state is: " << cstate.DebugString(); |
| }); |
| } |
| |
| // Test that the Master will tombstone a newly-evicted replica. |
| // Then, test that the Master will NOT tombstone a newly-added replica that is |
| // not part of the committed config yet (only the pending config). |
| TEST_F(TabletReplacementITest, TestMasterTombstoneEvictedReplica) { |
| MonoDelta timeout = MonoDelta::FromSeconds(30); |
| vector<string> ts_flags = { "--enable_leader_failure_detection=false" }; |
| int num_tservers = 5; |
| vector<string> master_flags = { "--master_add_server_when_underreplicated=false" }; |
| master_flags.emplace_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); |
| NO_FATALS(StartCluster(ts_flags, master_flags, num_tservers)); |
| |
| TestWorkload workload(cluster_.get()); |
| workload.set_num_replicas(num_tservers); |
| workload.Setup(); // Easy way to create a new tablet. |
| |
| const int kLeaderIndex = 0; |
| TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()]; |
| const int kFollowerIndex = 4; |
| TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()]; |
| |
| // Figure out the tablet id of the created tablet. |
| vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
| ASSERT_OK(WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); |
| string tablet_id = tablets[0].tablet_status().tablet_id(); |
| |
| // Wait until all replicas are up and running. |
| for (int i = 0; i < cluster_->num_tablet_servers(); i++) { |
| ASSERT_OK(WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], |
| tablet_id, timeout)); |
| } |
| |
| // Elect a leader (TS 0) |
| ASSERT_OK(StartElection(leader_ts, tablet_id, timeout)); |
| ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. |
| |
| // Wait until it has committed its NO_OP, so that we can perform a config change. |
| ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id, timeout)); |
| |
| // Remove a follower from the config. |
| ASSERT_OK(RemoveServer(leader_ts, tablet_id, follower_ts, timeout)); |
| |
| // Wait for the Master to tombstone the replica. |
| ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, |
| { TABLET_DATA_TOMBSTONED }, |
| timeout)); |
| |
| if (!AllowSlowTests()) { |
| // The rest of this test has multi-second waits, so we do it in slow test mode. |
| LOG(WARNING) << "not verifying that a newly-added replica won't be tombstoned; " |
| "run with KUDU_ALLOW_SLOW_TESTS=1 to verify"; |
| GTEST_SKIP(); |
| } |
| |
| // Shut down a majority of followers (3 servers) and then try to add the |
| // follower back to the config. This will cause the config change to end up |
| // in a pending state. |
| unordered_map<string, TServerDetails*> active_ts_map = ts_map_; |
| for (int i = 1; i <= 3; i++) { |
| cluster_->tablet_server(i)->Shutdown(); |
| ASSERT_EQ(1, active_ts_map.erase(cluster_->tablet_server(i)->uuid())); |
| } |
| // This will time out, but should take effect. |
| Status s = AddServer(leader_ts, tablet_id, follower_ts, RaftPeerPB::VOTER, |
| MonoDelta::FromSeconds(5)); |
| ASSERT_TRUE(s.IsTimedOut()) << s.ToString(); |
| ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, { TABLET_DATA_READY }, |
| timeout)); |
| ASSERT_OK(WaitForServersToAgree(timeout, active_ts_map, tablet_id, 3)); |
| |
| // Sleep for a few more seconds and check again to ensure that the Master |
| // didn't end up tombstoning the replica. |
| SleepFor(MonoDelta::FromSeconds(3)); |
| ASSERT_OK(inspect_->CheckTabletDataStateOnTS(kFollowerIndex, tablet_id, { TABLET_DATA_READY })); |
| } |
| |
| // Test for KUDU-2138: ensure that the master will tombstone failed tablets |
| // that have previously been evicted. |
| TEST_F(TabletReplacementITest, TestMasterTombstoneFailedEvictedReplicaOnReport) { |
| const MonoDelta kTimeout = MonoDelta::FromSeconds(30); |
| const int kNumServers = 4; |
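| // Tombstoning of evicted replicas is disabled at startup so the failed |
| // replica survives its eviction; the flag is re-enabled and the master |
| // restarted later to exercise tombstoning upon a tablet report. |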
| NO_FATALS(StartCluster({"--follower_unavailable_considered_failed_sec=5"}, |
| {"--master_tombstone_evicted_tablet_replicas=false"}, kNumServers)); |
| |
| TestWorkload workload(cluster_.get()); |
| workload.Setup(); // Easy way to create a new tablet. |
| |
| // Determine the tablet id. |
| string tablet_id; |
| ASSERT_EVENTUALLY([&] { |
| vector<string> tablets = inspect_->ListTablets(); |
| ASSERT_FALSE(tablets.empty()); |
| tablet_id = tablets[0]; |
| }); |
| |
| // Determine which tablet servers have data. One should be empty. |
| unordered_map<string, TServerDetails*> active_ts_map = ts_map_; |
| int empty_server_idx = -1; |
| string empty_ts_uuid; |
| consensus::ConsensusMetadataPB cmeta_pb; |
| for (int i = 0; i < kNumServers; i++) { |
| if (inspect_->ReadConsensusMetadataOnTS(i, tablet_id, &cmeta_pb).IsNotFound()) { |
| empty_ts_uuid = cluster_->tablet_server(i)->uuid(); |
| ASSERT_EQ(1, active_ts_map.erase(empty_ts_uuid)); |
| empty_server_idx = i; |
| break; |
| } |
| } |
| ASSERT_NE(empty_server_idx, -1); |
| |
| // Wait until all replicas are up and running. |
| for (const auto& e : active_ts_map) { |
| ASSERT_OK(WaitUntilTabletRunning(e.second, tablet_id, kTimeout)); |
| } |
| |
| // Select a replica to fail by shutting it down and mucking with its |
| // metadata. When it restarts, it will fail to open. |
| int idx_to_fail = (empty_server_idx + 1) % kNumServers; |
| auto* ts = cluster_->tablet_server(idx_to_fail); |
| ts->Shutdown(); |
| ASSERT_OK(inspect_->ReadConsensusMetadataOnTS(idx_to_fail, tablet_id, &cmeta_pb)); |
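| // A valid Raft term is never negative, so writing -1 corrupts the |
| // consensus metadata; the replica will fail to open on restart. |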
| cmeta_pb.set_current_term(-1); |
| ASSERT_OK(inspect_->WriteConsensusMetadataOnTS(idx_to_fail, tablet_id, cmeta_pb)); |
| |
| // Wait until the replica is evicted and replicated to the empty server. |
| ASSERT_OK(WaitUntilTabletInState(ts_map_[empty_ts_uuid], |
| tablet_id, |
| tablet::RUNNING, |
| kTimeout)); |
| |
| // Restart the tserver and ensure the tablet is failed. |
| ASSERT_OK(ts->Restart()); |
| ASSERT_OK(WaitUntilTabletInState(ts_map_[ts->uuid()], |
| tablet_id, |
| tablet::FAILED, |
| kTimeout)); |
| |
| // Upon restarting, the master will request a report and notice the failed |
| // replica. Wait for the master to tombstone the failed follower. |
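| // Re-enable tombstoning of evicted replicas, which was disabled at |
| // cluster startup, before restarting the master. |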
| cluster_->master()->Shutdown(); |
| cluster_->master()->mutable_flags()->emplace_back( |
| "--master_tombstone_evicted_tablet_replicas=true"); |
| ASSERT_OK(cluster_->master()->Restart()); |
| ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(idx_to_fail, tablet_id, |
| { TABLET_DATA_TOMBSTONED }, |
| kTimeout)); |
| } |
| |
| // Ensure that the Master will tombstone a replica if it reports in with an old |
| // config. This tests a slightly different code path in the catalog manager |
| // than TestMasterTombstoneEvictedReplica does. |
| TEST_F(TabletReplacementITest, TestMasterTombstoneOldReplicaOnReport) { |
| MonoDelta timeout = MonoDelta::FromSeconds(30); |
| vector<string> ts_flags = { "--enable_leader_failure_detection=false" }; |
| vector<string> master_flags = { "--master_add_server_when_underreplicated=false" }; |
| master_flags.emplace_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); |
| NO_FATALS(StartCluster(ts_flags, master_flags)); |
| |
| TestWorkload workload(cluster_.get()); |
| workload.Setup(); // Easy way to create a new tablet. |
| |
| const int kLeaderIndex = 0; |
| TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()]; |
| const int kFollowerIndex = 2; |
| TServerDetails* follower_ts = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()]; |
| |
| // Figure out the tablet id of the created tablet. |
| vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
| ASSERT_OK(WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); |
| string tablet_id = tablets[0].tablet_status().tablet_id(); |
| |
| // Wait until all replicas are up and running. |
| for (int i = 0; i < cluster_->num_tablet_servers(); i++) { |
| ASSERT_OK(WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], |
| tablet_id, timeout)); |
| } |
| |
| // Elect a leader (TS 0) |
| ASSERT_OK(StartElection(leader_ts, tablet_id, timeout)); |
| ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. |
| |
| // Wait until it has committed its NO_OP, so that we can perform a config change. |
| ASSERT_OK(WaitUntilCommittedOpIdIndexIs(1, leader_ts, tablet_id, timeout)); |
| |
| // Shut down the follower to be removed, then remove it from the config. |
| // We will wait for the Master to be notified of the config change, then shut |
| // down the rest of the cluster and bring the follower back up. The follower |
| // will heartbeat to the Master and then be tombstoned. |
| cluster_->tablet_server(kFollowerIndex)->Shutdown(); |
| |
| // Remove the follower from the config and wait for the Master to notice the |
| // config change. |
| ASSERT_OK(RemoveServer(leader_ts, tablet_id, follower_ts, timeout)); |
| ASSERT_OK(itest::WaitForNumVotersInConfigOnMaster(cluster_->master_proxy(), tablet_id, 2, |
| timeout)); |
| |
| // Shut down the remaining tablet servers and restart the dead one. |
| cluster_->tablet_server(0)->Shutdown(); |
| cluster_->tablet_server(1)->Shutdown(); |
| ASSERT_OK(cluster_->tablet_server(kFollowerIndex)->Restart()); |
| |
| // Wait for the Master to tombstone the revived follower. |
| ASSERT_OK(inspect_->WaitForTabletDataStateOnTS(kFollowerIndex, tablet_id, |
| { TABLET_DATA_TOMBSTONED }, |
| timeout)); |
| } |
| |
| ///////////////////////////////////////////////////////////////////////////// |
| |
| class EvictAndReplaceDeadFollowerITest : |
| public TabletReplacementITest, |
| public ::testing::WithParamInterface<bool> { |
| }; |
| |
| // Test that unreachable followers are evicted and replaced. |
| TEST_P(EvictAndReplaceDeadFollowerITest, UnreachableFollower) { |
| SKIP_IF_SLOW_NOT_ALLOWED(); |
| |
| const bool is_3_4_3_mode = GetParam(); |
| MonoDelta kTimeout = MonoDelta::FromSeconds(30); |
| const vector<string> ts_flags = { |
| "--enable_leader_failure_detection=false", |
| "--follower_unavailable_considered_failed_sec=5", |
| Substitute("--raft_prepare_replacement_before_eviction=$0", is_3_4_3_mode), |
| }; |
| const vector<string> master_flags = { |
| "--catalog_manager_wait_for_new_tablets_to_elect_leader=false", |
| Substitute("--raft_prepare_replacement_before_eviction=$0", is_3_4_3_mode), |
| }; |
| constexpr auto kNumReplicas = 3; |
| |
| // Additional tablet server is needed when running in 3-4-3 replica management |
| // scheme to allow for eviction of failed tablet replicas. |
| const auto kNumTservers = is_3_4_3_mode ? kNumReplicas + 1 : kNumReplicas; |
| |
| NO_FATALS(StartCluster(ts_flags, master_flags, kNumTservers)); |
| |
| TestWorkload workload(cluster_.get()); |
| workload.set_num_replicas(kNumReplicas); |
| workload.Setup(); // Easy way to create a new tablet. |
| |
| TabletToReplicaUUIDs tablet_to_replicas; |
| ASSERT_EVENTUALLY([&] { |
| ASSERT_OK(GetTabletToReplicaUUIDsMapping(kTimeout, &tablet_to_replicas)); |
| // There should be only one tablet. |
| ASSERT_EQ(1, tablet_to_replicas.size()); |
| // It takes some time to bootstrap all replicas across all tablet servers. |
| ASSERT_EQ(kNumReplicas, tablet_to_replicas.cbegin()->second.size()); |
| }); |
| const string tablet_id = tablet_to_replicas.cbegin()->first; |
| const auto& replica_uuids = tablet_to_replicas.cbegin()->second; |
| |
| // Wait until all replicas are up and running. |
| for (const auto& uuid : replica_uuids) { |
| ASSERT_OK(WaitUntilTabletRunning(ts_map_[uuid], tablet_id, kTimeout)); |
| } |
| |
| // Elect a leader. |
| const auto& kLeaderId = replica_uuids.front(); |
| TServerDetails* leader_ts = ts_map_[kLeaderId]; |
| ASSERT_OK(StartElection(leader_ts, tablet_id, kTimeout)); |
| { |
| decltype(ts_map_) ts_map; |
| NO_FATALS(GetTsMapForReplicas(replica_uuids, &ts_map)); |
| // Wait for NO_OP. |
| ASSERT_OK(WaitForServersToAgree(kTimeout, ts_map, tablet_id, 1)); |
| } |
| |
| // Shut down the follower to be removed. It should be evicted. |
| const auto& kFollowerId = replica_uuids.back(); |
| cluster_->tablet_server_by_uuid(kFollowerId)->Shutdown(); |
| |
| // Expected OpId index of the committed config: |
| // * with AddServer, Promote and RemoveServer, the opid_index will be 4. |
| // * with RemoveServer and AddServer, the opid_index will be 3. |
| const auto expected_opid_index = is_3_4_3_mode ? 4 : 3; |
| ASSERT_OK(itest::WaitUntilCommittedConfigOpIdIndexIs( |
| expected_opid_index, leader_ts, tablet_id, kTimeout)); |
| ASSERT_OK(cluster_->tablet_server_by_uuid(kFollowerId)->Restart()); |
| } |
| |
| INSTANTIATE_TEST_SUITE_P(, |
| EvictAndReplaceDeadFollowerITest, |
| ::testing::Bool()); |
| |
| ///////////////////////////////////////////////////////////////////////////// |
| |
| class DontEvictIfRemainingConfigIsUnstableITest : |
| public TabletReplacementITest, |
| public ::testing::WithParamInterface<bool> { |
| }; |
| |
| // Regression test for KUDU-2048 and KUDU-2230. If enough followers are |
| // unresponsive that no majority would remain after eviction, the leader |
| // should not evict any of them. |
| TEST_P(DontEvictIfRemainingConfigIsUnstableITest, NodesDown) { |
| TestDontEvictIfRemainingConfigIsUnstable(NODE_DOWN, GetParam()); |
| } |
| |
| TEST_P(DontEvictIfRemainingConfigIsUnstableITest, NodesStopped) { |
| TestDontEvictIfRemainingConfigIsUnstable(NODE_STOPPED, GetParam()); |
| } |
| |
| INSTANTIATE_TEST_SUITE_P(, |
| DontEvictIfRemainingConfigIsUnstableITest, |
| ::testing::Bool()); |
| |
| ///////////////////////////////////////////////////////////////////////////// |
| |
| // Regression test for KUDU-1233. This test creates a situation in which tablet |
| // bootstrap will attempt to replay committed (and applied) config change |
| // operations. This is achieved by delaying application of a write at the |
| // tablet level that precedes the config change operations in the WAL, then |
| // initiating a tablet copy to a follower. The follower will not have the |
| // COMMIT for the write operation, so will ignore COMMIT messages for the |
| // applied config change operations. At startup time, the newly |
| // copied tablet should detect that these config change |
| // operations have already been applied and skip them. |
| TEST_F(TabletReplacementITest, TestRemoteBootstrapWithPendingConfigChangeCommits) { |
| SKIP_IF_SLOW_NOT_ALLOWED(); |
| |
| const MonoDelta timeout = MonoDelta::FromSeconds(30); |
| vector<string> ts_flags; |
| ts_flags.emplace_back("--enable_leader_failure_detection=false"); |
| vector<string> master_flags; |
| // We will perform the AddServer() call manually, in order to make this |
| // test more deterministic. |
| master_flags.emplace_back("--master_add_server_when_underreplicated=false"); |
| master_flags.emplace_back("--master_tombstone_evicted_tablet_replicas=false"); |
| master_flags.emplace_back("--catalog_manager_wait_for_new_tablets_to_elect_leader=false"); |
| NO_FATALS(StartCluster(ts_flags, master_flags)); |
| |
| TestWorkload workload(cluster_.get()); |
| workload.Setup(); // Convenient way to create a table. |
| |
| const int kLeaderIndex = 0; |
| TServerDetails* leader_ts = ts_map_[cluster_->tablet_server(kLeaderIndex)->uuid()]; |
| const int kFollowerIndex = 2; |
| TServerDetails* ts_to_remove = ts_map_[cluster_->tablet_server(kFollowerIndex)->uuid()]; |
| |
| // Wait for tablet creation and then identify the tablet id. |
| vector<ListTabletsResponsePB::StatusAndSchemaPB> tablets; |
| ASSERT_OK(WaitForNumTabletsOnTS(leader_ts, 1, timeout, &tablets)); |
| string tablet_id = tablets[0].tablet_status().tablet_id(); |
| |
| // Wait until all replicas are up and running. |
| for (int i = 0; i < cluster_->num_tablet_servers(); i++) { |
| ASSERT_OK(WaitUntilTabletRunning(ts_map_[cluster_->tablet_server(i)->uuid()], |
| tablet_id, timeout)); |
| } |
| |
| // Elect a leader (TS 0) |
| ASSERT_OK(StartElection(leader_ts, tablet_id, timeout)); |
| ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 1)); // Wait for NO_OP. |
| |
| // Write a single row. |
| ASSERT_OK(WriteSimpleTestRow(leader_ts, tablet_id, RowOperationsPB::INSERT, 0, 0, "", timeout)); |
| |
| // Delay tablet applies in order to delay COMMIT messages to trigger KUDU-1233. |
| // Then insert another row. |
| ASSERT_OK(cluster_->SetFlag(cluster_->tablet_server_by_uuid(leader_ts->uuid()), |
| "tablet_inject_latency_on_apply_write_op_ms", "5000")); |
| |
| // Kick off an async insert, which will be delayed for 5 seconds. This is |
| // normally enough time to evict a replica, tombstone it, add it back, and |
| // Tablet Copy a new replica to it when the log contains only a few entries. |
| tserver::WriteRequestPB req; |
| tserver::WriteResponsePB resp; |
| CountDownLatch latch(1); |
| rpc::RpcController rpc; |
| rpc.set_timeout(timeout); |
| req.set_tablet_id(tablet_id); |
| Schema schema = GetSimpleTestSchema(); |
| ASSERT_OK(SchemaToPB(schema, req.mutable_schema())); |
| AddTestRowToPB(RowOperationsPB::INSERT, schema, 1, 1, "", req.mutable_row_operations()); |
| leader_ts->tserver_proxy->WriteAsync(req, &resp, &rpc, |
| [&latch]() { latch.CountDown(); }); |
| |
| // Wait for the replicate to show up (this doesn't wait for COMMIT messages). |
| ASSERT_OK(WaitForServersToAgree(timeout, ts_map_, tablet_id, 3)); |
| |
| // Manually evict the server from the cluster, tombstone the replica, then |
| // add the replica back to the cluster. Without the fix for KUDU-1233, this |
| // will cause the replica to fail to start up. |
| ASSERT_OK(RemoveServer(leader_ts, tablet_id, ts_to_remove, timeout)); |
| ASSERT_OK(itest::DeleteTablet(ts_to_remove, tablet_id, TABLET_DATA_TOMBSTONED, |
| timeout)); |
| ASSERT_OK(AddServer(leader_ts, tablet_id, ts_to_remove, RaftPeerPB::VOTER, timeout)); |
| ASSERT_OK(WaitUntilTabletRunning(ts_to_remove, tablet_id, timeout)); |
| |
| ClusterVerifier v(cluster_.get()); |
| NO_FATALS(v.CheckCluster()); |
| NO_FATALS(v.CheckRowCount(workload.table_name(), |
| ClusterVerifier::EXACTLY, 2)); |
| |
| latch.Wait(); // Avoid use-after-free on the response from the delayed RPC callback. |
| } |
| |
| } // namespace kudu |