blob: 13fe68f4e1418e5f8f216b3d87ab2e2cab395ba2 [file] [log] [blame]
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tester
import (
"context"
"fmt"
"sort"
"strings"
"time"
"github.com/coreos/etcd/clientv3"
"github.com/coreos/etcd/functional/rpcpb"
"go.uber.org/zap"
)
// inject_SIGQUIT_ETCD_AND_REMOVE_DATA injects a "disastrous machine failure"
// into the member at index idx1 and then removes that member from the
// cluster.
//
// Steps, in order:
//  1. log the member list as seen by the target member,
//  2. read the target's member ID from its Status response header,
//  3. send Operation_SIGQUIT_ETCD_AND_REMOVE_DATA to the target,
//  4. ask the next member (idx1+1, wrapping around) to remove the target,
//  5. log the member list as seen by that next member.
//
// Every client RPC is bounded by a timeout so that an unresponsive member
// makes the case fail with an error instead of blocking the tester forever.
// The first error encountered is returned; nil means all steps succeeded.
func inject_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error {
	// Upper bound for each individual client RPC issued below.
	const rpcTimeout = time.Minute

	cli1, err := clus.Members[idx1].CreateEtcdClient()
	if err != nil {
		return err
	}
	defer cli1.Close()

	var mresp *clientv3.MemberListResponse
	ctx, cancel := context.WithTimeout(context.Background(), rpcTimeout)
	mresp, err = cli1.MemberList(ctx)
	cancel()
	mss := []string{}
	if err == nil && mresp != nil {
		mss = describeMembers(mresp)
	}
	// Logged even on failure so the error shows up next to the endpoint.
	clus.lg.Info(
		"member list before disastrous machine failure",
		zap.String("request-to", clus.Members[idx1].EtcdClientEndpoint),
		zap.Strings("members", mss),
		zap.Error(err),
	)
	if err != nil {
		return err
	}

	// The member ID must be captured while the target is still running; it is
	// needed later for MemberRemove, after the target has been killed.
	ctx, cancel = context.WithTimeout(context.Background(), rpcTimeout)
	sresp, serr := cli1.Status(ctx, clus.Members[idx1].EtcdClientEndpoint)
	cancel()
	if serr != nil {
		return serr
	}
	id1 := sresp.Header.MemberId
	is1 := fmt.Sprintf("%016x", id1)

	// No error field here: err is known to be nil at this point.
	clus.lg.Info(
		"disastrous machine failure START",
		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
		zap.String("target-member-id", is1),
	)
	err = clus.sendOp(idx1, rpcpb.Operation_SIGQUIT_ETCD_AND_REMOVE_DATA)
	clus.lg.Info(
		"disastrous machine failure END",
		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
		zap.String("target-member-id", is1),
		zap.Error(err),
	)
	if err != nil {
		return err
	}

	time.Sleep(2 * time.Second)

	// The removal request goes to the neighbouring member, since the target
	// itself has just been killed.
	// NOTE(review): with a single-member cluster idx2 equals idx1 — confirm
	// callers never run this case against such a cluster.
	idx2 := (idx1 + 1) % len(clus.Members)
	var cli2 *clientv3.Client
	cli2, err = clus.Members[idx2].CreateEtcdClient()
	if err != nil {
		return err
	}
	defer cli2.Close()

	// FIXME(bug): this may block forever during
	// "SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT"
	// is the new leader too busy with snapshotting?
	// is raft proposal dropped?
	// enable client keepalive for failover?
	clus.lg.Info(
		"member remove after disaster START",
		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
		zap.String("target-member-id", is1),
		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
	)
	ctx, cancel = context.WithTimeout(context.Background(), rpcTimeout)
	_, err = cli2.MemberRemove(ctx, id1)
	cancel()
	clus.lg.Info(
		"member remove after disaster END",
		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
		zap.String("target-member-id", is1),
		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
		zap.Error(err),
	)
	if err != nil {
		return err
	}

	time.Sleep(2 * time.Second)

	ctx, cancel = context.WithTimeout(context.Background(), rpcTimeout)
	mresp, err = cli2.MemberList(ctx)
	cancel()
	mss = []string{}
	if err == nil && mresp != nil {
		mss = describeMembers(mresp)
	}
	clus.lg.Info(
		"member list after member remove",
		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
		zap.Strings("members", mss),
		zap.Error(err),
	)
	return err
}
// recover_SIGQUIT_ETCD_AND_REMOVE_DATA brings the member at index idx1 back
// into the cluster after its data has been removed.
//
// Steps, in order:
//  1. ask the next member (idx1+1, wrapping around) to add the target back
//     using the target's advertised peer URLs,
//  2. set the target's initial cluster state to "existing" and send it
//     Operation_RESTART_ETCD,
//  3. log the member list as seen by the next member.
//
// Every client RPC is bounded by a timeout so that an unresponsive member
// makes the recovery fail with an error instead of blocking the tester
// forever. The first error encountered is returned; nil means all steps
// succeeded.
func recover_SIGQUIT_ETCD_AND_REMOVE_DATA(clus *Cluster, idx1 int) error {
	// Upper bound for each individual client RPC issued below.
	const rpcTimeout = time.Minute

	idx2 := (idx1 + 1) % len(clus.Members)
	cli2, err := clus.Members[idx2].CreateEtcdClient()
	if err != nil {
		return err
	}
	defer cli2.Close()

	ctx, cancel := context.WithTimeout(context.Background(), rpcTimeout)
	_, err = cli2.MemberAdd(ctx, clus.Members[idx1].Etcd.AdvertisePeerURLs)
	cancel()
	clus.lg.Info(
		"member add before fresh restart",
		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
		zap.Error(err),
	)
	if err != nil {
		return err
	}

	time.Sleep(2 * time.Second)

	// The restarted member joins the cluster it was just added to rather
	// than bootstrapping a new one, hence the "existing" state.
	clus.Members[idx1].Etcd.InitialClusterState = "existing"
	err = clus.sendOp(idx1, rpcpb.Operation_RESTART_ETCD)
	clus.lg.Info(
		"fresh restart after member add",
		zap.String("target-endpoint", clus.Members[idx1].EtcdClientEndpoint),
		zap.Error(err),
	)
	if err != nil {
		return err
	}

	time.Sleep(2 * time.Second)

	var mresp *clientv3.MemberListResponse
	ctx, cancel = context.WithTimeout(context.Background(), rpcTimeout)
	mresp, err = cli2.MemberList(ctx)
	cancel()
	mss := []string{}
	if err == nil && mresp != nil {
		mss = describeMembers(mresp)
	}
	clus.lg.Info(
		"member list after member add",
		zap.String("request-to", clus.Members[idx2].EtcdClientEndpoint),
		zap.Strings("members", mss),
		zap.Error(err),
	)
	return err
}
// new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER builds the
// Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER case: a caseFollower that injects with
// inject_SIGQUIT_ETCD_AND_REMOVE_DATA and recovers with
// recover_SIGQUIT_ETCD_AND_REMOVE_DATA, wrapped in a caseDelay that carries
// the cluster's case delay duration.
func new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus *Cluster) Case {
	follower := &caseFollower{
		caseByFunc{
			rpcpbCase:     rpcpb.Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER,
			injectMember:  inject_SIGQUIT_ETCD_AND_REMOVE_DATA,
			recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA,
		},
		// NOTE(review): the two -1 values fill positional fields of
		// caseFollower declared elsewhere; presumably "not yet selected"
		// member indices — confirm against the type definition.
		-1,
		-1,
	}
	return &caseDelay{
		Case:          follower,
		delayDuration: clus.GetCaseDelayDuration(),
	}
}
// new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT wraps the
// case built by new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER in a
// caseUntilSnapshot tagged with
// Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT.
func new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case {
	inner := new_Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER(clus)
	return &caseUntilSnapshot{
		rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_ONE_FOLLOWER_UNTIL_TRIGGER_SNAPSHOT,
		Case:      inner,
	}
}
// new_Case_SIGQUIT_AND_REMOVE_LEADER builds the
// Case_SIGQUIT_AND_REMOVE_LEADER case: a caseLeader that injects with
// inject_SIGQUIT_ETCD_AND_REMOVE_DATA and recovers with
// recover_SIGQUIT_ETCD_AND_REMOVE_DATA, wrapped in a caseDelay that carries
// the cluster's case delay duration.
func new_Case_SIGQUIT_AND_REMOVE_LEADER(clus *Cluster) Case {
	leader := &caseLeader{
		caseByFunc{
			rpcpbCase:     rpcpb.Case_SIGQUIT_AND_REMOVE_LEADER,
			injectMember:  inject_SIGQUIT_ETCD_AND_REMOVE_DATA,
			recoverMember: recover_SIGQUIT_ETCD_AND_REMOVE_DATA,
		},
		// NOTE(review): the two -1 values fill positional fields of
		// caseLeader declared elsewhere; presumably "not yet selected"
		// member indices — confirm against the type definition.
		-1,
		-1,
	}
	return &caseDelay{
		Case:          leader,
		delayDuration: clus.GetCaseDelayDuration(),
	}
}
// new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT wraps the case
// built by new_Case_SIGQUIT_AND_REMOVE_LEADER in a caseUntilSnapshot tagged
// with Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT.
func new_Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT(clus *Cluster) Case {
	inner := new_Case_SIGQUIT_AND_REMOVE_LEADER(clus)
	return &caseUntilSnapshot{
		rpcpbCase: rpcpb.Case_SIGQUIT_AND_REMOVE_LEADER_UNTIL_TRIGGER_SNAPSHOT,
		Case:      inner,
	}
}
// describeMembers renders every member in mresp as one line containing its
// name, zero-padded hexadecimal ID, comma-joined client URLs and comma-joined
// peer URLs, and returns those lines sorted in increasing string order.
// mresp must be non-nil; the result is non-nil even when there are no members.
func describeMembers(mresp *clientv3.MemberListResponse) (ss []string) {
	const layout = "Name %s / ID %016x / ClientURLs %s / PeerURLs %s"

	ss = make([]string, 0, len(mresp.Members))
	for _, member := range mresp.Members {
		clientURLs := strings.Join(member.ClientURLs, ",")
		peerURLs := strings.Join(member.PeerURLs, ",")
		ss = append(ss, fmt.Sprintf(layout, member.Name, member.ID, clientURLs, peerURLs))
	}
	sort.Strings(ss)
	return ss
}