blob: f42c4619aa3d37efd8689ebd426d14122adeb515 [file] [log] [blame]
//go:build integ
// +build integ
// Copyright Istio Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package pilot
import (
import (
v1 ""
metav1 ""
import (
istioKube ""
// TestCNIRaceRepair checks that a pod broken by the CNI race condition is
// repaired once the CNI DaemonSet is available again.
//
// The visible flow is:
//  1. skip unless CNI is enabled in the suite settings;
//  2. create an injected namespace with the "cni-race" prefix and deploy an
//     echo instance into it;
//  3. save the CNI DaemonSet, then delete it;
//  4. rollout-restart the deployments in the namespace and wait for a pod
//     whose init container exited with the validation error code;
//  5. re-create the saved DaemonSet and wait until no pod is in that state.
//
// NOTE(review): this view of the function appears to have lost lines (the
// receivers of `Run(` and `New(t, c).`, the echo config wrapper, and all
// closing braces) — confirm against the real file before editing.
func TestCNIRaceRepair(t *testing.T) {
Run(func(t framework.TestContext) {
// The scenario only makes sense when the CNI plugin is installed.
if !i.Settings().EnableCNI {
t.Skip("CNI race condition mitigation is only tested when CNI is enabled.")
c := t.Clusters().Default()
ns := namespace.NewOrFail(t, t, namespace.Config{
Prefix: "cni-race",
Inject: true,
// Create an echo deployment in the cni-race namespace.
t.Logf("Deploy an echo instance in namespace %v...", ns.Name())
New(t, c).
Namespace: ns,
Ports: ports.All(),
Subsets: []echo.SubsetConfig{{}},
// To begin with, delete CNI Daemonset to simulate a CNI race condition.
// Temporarily store CNI DaemonSet, which will be deployed again later.
// NOTE(review): no cleanup hook is visible here; if a later step fails
// fatally the DaemonSet does not appear to be restored — TODO confirm.
t.Log("Delete CNI Daemonset temporarily to simulate race condition")
cniDaemonSet := getCNIDaemonSet(t, c)
deleteCNIDaemonset(t, c)
// Rollout restart instances in the echo namespace, and wait for a broken instance.
// NOTE(review): the kubectl command carries no explicit cluster/kubeconfig
// selection, so it targets whatever context the shell resolves — verify that
// this is the same cluster as `c`.
t.Log("Rollout restart echo instance to get a broken instance")
rolloutCmd := fmt.Sprintf("kubectl rollout restart deployment -n %s", ns.Name())
if _, err := shell.Execute(true, rolloutCmd); err != nil {
t.Fatalf("failed to rollout restart deployments %v", err)
waitForBrokenPodOrFail(t, c, ns)
t.Log("Redeploy CNI and verify repair takes effect by evicting the broken pod")
// Now bring back CNI Daemonset, and pod in the echo namespace should be repaired.
deployCNIDaemonset(t, c, cniDaemonSet)
waitForRepairOrFail(t, c, ns)
// getCNIDaemonSet fetches the DaemonSet named "istio-cni-node" and returns it.
// The test is failed through ctx if the lookup returns an error or a nil
// object.
//
// c is type-asserted to istioKube.ExtendedClient with the single-value form,
// so a cluster that does not satisfy that type panics rather than failing the
// test cleanly.
//
// NOTE(review): the call chain between the type assertion and `Get(` looks
// truncated in this view (no namespace selection is visible) — confirm against
// the real file.
func getCNIDaemonSet(ctx framework.TestContext, c cluster.Cluster) *v1.DaemonSet {
cniDaemonSet, err := c.(istioKube.ExtendedClient).
Get(context.Background(), "istio-cni-node", metav1.GetOptions{})
if err != nil {
ctx.Fatalf("failed to get CNI Daemonset %v", err)
// Guard against a nil object coming back without an error.
if cniDaemonSet == nil {
ctx.Fatal("cannot find CNI Daemonset")
return cniDaemonSet
// deleteCNIDaemonset deletes the DaemonSet named "istio-cni-node" and then
// polls until no pod labelled "k8s-app=istio-cni-node" is listed in the
// "kube-system" namespace. A delete error fails the test through ctx.
//
// The poll is configured with a 1 second delay and an 80 second timeout;
// presumably retry.UntilSuccessOrFail fails ctx when the timeout elapses —
// TODO confirm against the retry package.
//
// NOTE(review): the call chain between the type assertion and `Delete(` looks
// truncated in this view — confirm against the real file.
func deleteCNIDaemonset(ctx framework.TestContext, c cluster.Cluster) {
if err := c.(istioKube.ExtendedClient).
Delete(context.Background(), "istio-cni-node", metav1.DeleteOptions{}); err != nil {
ctx.Fatalf("failed to delete CNI Daemonset %v", err)
// Wait until the CNI Daemonset pod cannot be fetched anymore
retry.UntilSuccessOrFail(ctx, func() error {
scopes.Framework.Infof("Checking if CNI Daemonset pods are deleted...")
pods, err := c.PodsForSelector(context.TODO(), "kube-system", "k8s-app=istio-cni-node")
if err != nil {
return err
// Any remaining pod means the deletion has not finished yet; keep polling.
if len(pods.Items) > 0 {
return errors.New("CNI Daemonset pod still exists after deletion")
return nil
}, retry.Delay(1*time.Second), retry.Timeout(80*time.Second))
// deployCNIDaemonset re-creates a DaemonSet from a previously saved copy.
//
// A fresh object is built that carries over the saved Spec and only the Name,
// Namespace, Labels and Annotations metadata fields; all other metadata of the
// saved object is left out (presumably so that server-populated fields such as
// the resource version are not sent on create — TODO confirm). A create error
// fails the test through ctx.
//
// NOTE(review): the create call targets the literal "kube-system" namespace,
// while the object's Namespace is copied from cniDaemonSet; the two are assumed
// to agree.
func deployCNIDaemonset(ctx framework.TestContext, c cluster.Cluster, cniDaemonSet *v1.DaemonSet) {
deployDaemonSet := v1.DaemonSet{}
deployDaemonSet.Spec = cniDaemonSet.Spec
// Copy only the identifying and descriptive metadata.
deployDaemonSet.ObjectMeta = metav1.ObjectMeta{
Name: cniDaemonSet.ObjectMeta.Name,
Namespace: cniDaemonSet.ObjectMeta.Namespace,
Labels: cniDaemonSet.ObjectMeta.Labels,
Annotations: cniDaemonSet.ObjectMeta.Annotations,
_, err := c.(istioKube.ExtendedClient).Kube().AppsV1().DaemonSets("kube-system").
Create(context.Background(), &deployDaemonSet, metav1.CreateOptions{})
if err != nil {
ctx.Fatalf("failed to deploy CNI Daemonset %v", err)
// waitForBrokenPodOrFail polls the pods in namespace ns until at least one of
// them has an init container whose last termination state records an exit
// code equal to constants.ValidationErrorCode.
//
// Each attempt returns an error (and is therefore retried) when listing fails,
// when the namespace has no pods yet, or when no pod matches. The poll uses a
// 1 second delay and an 80 second timeout.
//
// Note that the parameter named cluster shadows the cluster package inside
// this function body.
func waitForBrokenPodOrFail(t framework.TestContext, cluster cluster.Cluster, ns namespace.Instance) {
retry.UntilSuccessOrFail(t, func() error {
pods, err := cluster.CoreV1().Pods(ns.Name()).List(context.TODO(), metav1.ListOptions{})
if err != nil {
return err
// An empty list means the restarted pods have not been created yet.
if len(pods.Items) == 0 {
return fmt.Errorf("still waiting the pod in namespace %v to start", ns.Name())
// Verify that at least one pod is in broken state due to the race condition.
for _, p := range pods.Items {
for _, container := range p.Status.InitContainerStatuses {
// The first matching init container is enough to report success.
if state := container.LastTerminationState.Terminated; state != nil && state.ExitCode ==
constants.ValidationErrorCode {
return nil
return fmt.Errorf("cannot find any pod with wanted exit code %v", constants.ValidationErrorCode)
}, retry.Delay(1*time.Second), retry.Timeout(80*time.Second))
// waitForRepairOrFail polls the pods in namespace ns until at least one pod
// exists and none of them has an init container whose last termination state
// records an exit code equal to constants.ValidationErrorCode.
//
// Each attempt returns an error (and is therefore retried) when listing fails,
// when the namespace has no pods, or when any pod still matches the broken
// condition. The poll uses a 1 second delay and an 80 second timeout.
//
// Note that the parameter named cluster shadows the cluster package inside
// this function body.
func waitForRepairOrFail(t framework.TestContext, cluster cluster.Cluster, ns namespace.Instance) {
retry.UntilSuccessOrFail(t, func() error {
pods, err := cluster.CoreV1().Pods(ns.Name()).List(context.TODO(), metav1.ListOptions{})
if err != nil {
return err
// An empty namespace is not treated as repaired; keep polling until a pod exists.
if len(pods.Items) == 0 {
return errors.New("no pod found")
// Verify that no pod is broken by the race condition now.
for _, p := range pods.Items {
for _, container := range p.Status.InitContainerStatuses {
// A single still-broken init container fails this attempt.
if state := container.LastTerminationState.Terminated; state != nil && state.ExitCode ==
constants.ValidationErrorCode {
return errors.New("there are still pods in broken state due to CNI race condition")
return nil
}, retry.Delay(1*time.Second), retry.Timeout(80*time.Second))