From 99d09535c84d6c153680f03f808ca4f4c49ce40e Mon Sep 17 00:00:00 2001 From: Andrew Lavery Date: Tue, 28 May 2024 19:11:33 +0800 Subject: [PATCH 1/5] run many reset tests --- .github/workflows/pull-request.yaml | 51 +++++++++++++++++------------ e2e/reset_test.go | 36 ++++++++++++++++++++ 2 files changed, 66 insertions(+), 21 deletions(-) diff --git a/.github/workflows/pull-request.yaml b/.github/workflows/pull-request.yaml index d5f58484e0..62c47cd319 100644 --- a/.github/workflows/pull-request.yaml +++ b/.github/workflows/pull-request.yaml @@ -165,30 +165,39 @@ jobs: fail-fast: false matrix: tests: - - TestSingleNodeInstallation - - TestSingleNodeInstallationAlmaLinux8 - - TestSingleNodeInstallationDebian12 - - TestSingleNodeInstallationCentos8Stream - - TestVersion - - TestHostPreflight - - TestUnsupportedOverrides - - TestMultiNodeInstallation +# - TestSingleNodeInstallation +# - TestSingleNodeInstallationAlmaLinux8 +# - TestSingleNodeInstallationDebian12 +# - TestSingleNodeInstallationCentos8Stream +# - TestVersion +# - TestHostPreflight +# - TestUnsupportedOverrides +# - TestMultiNodeInstallation - TestMultiNodeReset - - TestCommandsRequireSudo - - TestInstallWithoutEmbed - - TestInstallFromReplicatedApp + - TestMultiNodeReset1 + - TestMultiNodeReset2 + - TestMultiNodeReset3 + - TestMultiNodeReset4 + - TestMultiNodeReset5 + - TestMultiNodeReset6 + - TestMultiNodeReset7 + - TestMultiNodeReset8 + - TestMultiNodeReset9 +# - TestCommandsRequireSudo +# - TestInstallWithoutEmbed +# - TestInstallFromReplicatedApp - TestResetAndReinstall - TestResetAndReinstallAirgap - - TestCollectSupportBundle - - TestOldVersionUpgrade - - TestMaterialize - - TestLocalArtifactMirror - - TestSingleNodeAirgapUpgradeUbuntuJammy - - TestInstallSnapshotFromReplicatedApp - - TestMultiNodeAirgapUpgradeUbuntuJammy - - TestSingleNodeDisasterRecovery - - TestSingleNodeResumeDisasterRecovery - - TestSingleNodeAirgapDisasterRecovery +# - TestCollectSupportBundle +# - TestOldVersionUpgrade +# - TestMaterialize +# - TestLocalArtifactMirror +# - TestSingleNodeAirgapUpgradeUbuntuJammy +# - TestInstallSnapshotFromReplicatedApp +# - TestMultiNodeAirgapUpgradeUbuntuJammy +# - TestSingleNodeDisasterRecovery +# - TestSingleNodeResumeDisasterRecovery +# - TestSingleNodeAirgapDisasterRecovery steps: - name: Checkout uses: actions/checkout@v4 diff --git a/e2e/reset_test.go b/e2e/reset_test.go index 54d15eb43f..4b61a37b25 100644 --- a/e2e/reset_test.go +++ b/e2e/reset_test.go @@ -116,3 +116,39 @@ func TestMultiNodeReset(t *testing.T) { t.Logf("%s: test complete", time.Now().Format(time.RFC3339)) } + +func TestMultiNodeReset1(t *testing.T) { + TestMultiNodeReset(t) +} + +func TestMultiNodeReset2(t *testing.T) { + TestMultiNodeReset(t) +} + +func TestMultiNodeReset3(t *testing.T) { + TestMultiNodeReset(t) +} + +func TestMultiNodeReset4(t *testing.T) { + TestMultiNodeReset(t) +} + +func TestMultiNodeReset5(t *testing.T) { + TestMultiNodeReset(t) +} + +func TestMultiNodeReset6(t *testing.T) { + TestMultiNodeReset(t) +} + +func TestMultiNodeReset7(t *testing.T) { + TestMultiNodeReset(t) +} + +func TestMultiNodeReset8(t *testing.T) { + TestMultiNodeReset(t) +} + +func TestMultiNodeReset9(t *testing.T) { + TestMultiNodeReset(t) +} From 95ef24b916946d8595a1b314778dccc4bde27528 Mon Sep 17 00:00:00 2001 From: Andrew Lavery Date: Tue, 28 May 2024 20:05:39 +0800 Subject: [PATCH 2/5] retry leaving etcd, only drain pods from node if it's not the last etcd member --- cmd/embedded-cluster/uninstall.go | 60 ++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 13 deletions(-) diff --git a/cmd/embedded-cluster/uninstall.go b/cmd/embedded-cluster/uninstall.go index bc30fb772e..034aaaaf6c 100644 --- a/cmd/embedded-cluster/uninstall.go +++ b/cmd/embedded-cluster/uninstall.go @@ -14,6 +14,7 @@ import ( "github.com/sirupsen/logrus" "github.com/urfave/cli/v2" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/wait" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -200,26 +201,51 @@ func (h *hostInfo) checkResetSafety(c *cli.Context) (bool, string, error) { return true, "", nil } -// leaveEtcdcluster uses k0s to attempt to leave the etcd cluster -func (h *hostInfo) leaveEtcdcluster() error { - - // if we're the only etcd member we don't need to leave the cluster +func (h *hostInfo) isLastEtcdMember() (bool, error) { out, err := exec.Command(k0s, "etcd", "member-list").Output() if err != nil { - return err + return false, err } memberlist := etcdMembers{} err = json.Unmarshal(out, &memberlist) if err != nil { - return err + return false, err + } + + if memberlist.Members[h.Hostname] == "" { + return false, nil // this is not a etcd member + } + if len(memberlist.Members) == 1 { + return true, nil // we are an etcd member and the only one + } + return false, nil // there is more than one etcd member remaining +} + +// leaveEtcdcluster uses k0s to attempt to leave the etcd cluster +func (h *hostInfo) leaveEtcdcluster() error { + // if we're the only etcd member we don't need to leave the cluster + lastMember, err := h.isLastEtcdMember() + if err != nil { + return fmt.Errorf("unable to check if last etcd member: %w", err) } - if len(memberlist.Members) == 1 && memberlist.Members[h.Hostname] != "" { + if lastMember { return nil } - out, err = exec.Command(k0s, "etcd", "leave").CombinedOutput() + var lasterr error + var out []byte + backoff := wait.Backoff{Steps: 5, Duration: time.Second, Factor: 2, Jitter: 0.1} + err = wait.ExponentialBackoff(backoff, func() (bool, error) { + out, err = exec.Command(k0s, "etcd", "leave").CombinedOutput() + if err != nil { + lasterr = fmt.Errorf("%w: %s", err, string(out)) + return false, nil + } + return true, nil + + }) if err != nil { - return fmt.Errorf("unable to leave etcd cluster: %w, %s", err, string(out)) + return fmt.Errorf("unable to leave etcd cluster: %w", lasterr) } return nil } @@ -342,12 +368,20 @@ var resetCommand = &cli.Command{ } } - // drain node - logrus.Info("Draining node...") - err = currentHost.drainNode() - if !checkErrPrompt(c, err) { + isLastEtcdHost, err := currentHost.isLastEtcdMember() + if err != nil { return err } + if isLastEtcdHost { + logrus.Info("Not draining node as it is the last etcd member, and doing so would be redundant...") + } else { + // drain node + logrus.Info("Draining node...") + err = currentHost.drainNode() + if !checkErrPrompt(c, err) { + return err + } + } // remove node from cluster logrus.Info("Removing node from cluster...") From ec32a750cff9dc453417e05059f8ac48ed5ed06d Mon Sep 17 00:00:00 2001 From: Andrew Lavery Date: Tue, 28 May 2024 21:35:42 +0800 Subject: [PATCH 3/5] handle worker nodes --- cmd/embedded-cluster/uninstall.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/embedded-cluster/uninstall.go b/cmd/embedded-cluster/uninstall.go index 034aaaaf6c..c5db14a7dc 100644 --- a/cmd/embedded-cluster/uninstall.go +++ b/cmd/embedded-cluster/uninstall.go @@ -370,7 +370,8 @@ var resetCommand = &cli.Command{ isLastEtcdHost, err := currentHost.isLastEtcdMember() if err != nil { - return err + // ignore this error because this will always error on worker nodes - this just means we have to drain the node + isLastEtcdHost = false } if isLastEtcdHost { logrus.Info("Not draining node as it is the last etcd member, and doing so would be redundant...") From e7f12fb981fab284b54f719f066dc98e9c77b273 Mon Sep 17 00:00:00 2001 From: Andrew Lavery Date: Tue, 28 May 2024 22:25:21 +0800 Subject: [PATCH 4/5] drain every node again --- cmd/embedded-cluster/uninstall.go | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/cmd/embedded-cluster/uninstall.go b/cmd/embedded-cluster/uninstall.go index c5db14a7dc..ad3539fec7 100644 --- a/cmd/embedded-cluster/uninstall.go +++ b/cmd/embedded-cluster/uninstall.go @@ -368,21 +368,21 @@ var resetCommand = &cli.Command{ } } - isLastEtcdHost, err := currentHost.isLastEtcdMember() - if err != nil { - // ignore this error because this will always error on worker nodes - this just means we have to drain the node - isLastEtcdHost = false - } - if isLastEtcdHost { - logrus.Info("Not draining node as it is the last etcd member, and doing so would be redundant...") - } else { - // drain node - logrus.Info("Draining node...") - err = currentHost.drainNode() - if !checkErrPrompt(c, err) { - return err - } + //isLastEtcdHost, err := currentHost.isLastEtcdMember() + //if err != nil { + // // ignore this error because this will always error on worker nodes - this just means we have to drain the node + // isLastEtcdHost = false + //} + //if isLastEtcdHost { + // logrus.Info("Not draining node as it is the last etcd member, and doing so would be redundant...") + //} else { + // drain node + logrus.Info("Draining node...") + err = currentHost.drainNode() + if !checkErrPrompt(c, err) { + return err } + //} // remove node from cluster logrus.Info("Removing node from cluster...") From 50b59d7d2f31921f79207cbd20e1bbe1919c3da5 Mon Sep 17 00:00:00 2001 From: Andrew Lavery Date: Tue, 28 May 2024 23:01:55 +0800 Subject: [PATCH 5/5] cleanup --- cmd/embedded-cluster/uninstall.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/cmd/embedded-cluster/uninstall.go b/cmd/embedded-cluster/uninstall.go index ad3539fec7..38dda2cb70 100644 --- a/cmd/embedded-cluster/uninstall.go +++ b/cmd/embedded-cluster/uninstall.go @@ -368,21 +368,12 @@ var resetCommand = &cli.Command{ } } - //isLastEtcdHost, err := currentHost.isLastEtcdMember() - //if err != nil { - // // ignore this error because this will always error on worker nodes - this just means we have to drain the node - // isLastEtcdHost = false - //} - //if isLastEtcdHost { - // logrus.Info("Not draining node as it is the last etcd member, and doing so would be redundant...") - //} else { // drain node logrus.Info("Draining node...") err = currentHost.drainNode() if !checkErrPrompt(c, err) { return err } - //} // remove node from cluster logrus.Info("Removing node from cluster...")