From 380a50a3a0403e29fdd3663ed17ac84d5e1e34f2 Mon Sep 17 00:00:00 2001 From: huangmingxia Date: Fri, 30 Jan 2026 18:27:45 +0800 Subject: [PATCH] HIVE-3067 infra_test: match machine pool e2e by Hive labels --- hack/e2e-test.sh | 2 +- test/e2e/common/client.go | 6 ++ .../e2e/postinstall/machinesets/infra_test.go | 88 +++++++++---------- 3 files changed, 50 insertions(+), 46 deletions(-) diff --git a/hack/e2e-test.sh b/hack/e2e-test.sh index 654e4781ac0..f18e00ad585 100755 --- a/hack/e2e-test.sh +++ b/hack/e2e-test.sh @@ -177,7 +177,7 @@ INSTALL_RESULT="" i=1 while [ $i -le ${max_cluster_deployment_status_checks} ]; do CD_JSON=$(oc get cd ${CLUSTER_NAME} -n ${CLUSTER_NAMESPACE} -o json) - if [[ $(jq .spec.installed <<<"${CD_JSON}") == "true" ]] ; then + if [[ $(jq .spec.installed <<<"${CD_JSON}") == "true" ]] && [[ $(jq -r .status.powerState <<<"${CD_JSON}") == "Running" ]] ; then INSTALL_RESULT="success" break fi diff --git a/test/e2e/common/client.go b/test/e2e/common/client.go index c0ef40a804b..f93e275b9a5 100644 --- a/test/e2e/common/client.go +++ b/test/e2e/common/client.go @@ -5,15 +5,21 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" + ctrl "sigs.k8s.io/controller-runtime/pkg/log" "k8s.io/client-go/dynamic" kclient "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" apiregv1client "k8s.io/kube-aggregator/pkg/client/clientset_generated/clientset/typed/apiregistration/v1" + "github.com/openshift/hive/pkg/util/logrus" "github.com/openshift/hive/pkg/util/scheme" ) +func init() { + ctrl.SetLogger(logrus.NewLogr(log.StandardLogger())) +} + func MustGetClient() client.WithWatch { return MustGetClientFromConfig(MustGetConfig()) } diff --git a/test/e2e/postinstall/machinesets/infra_test.go b/test/e2e/postinstall/machinesets/infra_test.go index ec79091ae0d..d92d951498e 100644 --- a/test/e2e/postinstall/machinesets/infra_test.go +++ b/test/e2e/postinstall/machinesets/infra_test.go @@ -15,6 +15,7 @@ import ( apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/rest" "k8s.io/client-go/util/retry" "k8s.io/utils/ptr" @@ -35,6 +36,10 @@ import ( const ( workerMachinePoolName = "worker" infraMachinePoolName = "infra" + + capiMachineTypeKey = "machine.openshift.io/cluster-api-machine-type" + hiveMachinePoolKey = "hive.openshift.io/machine-pool" + openshiftMachineAPI = "openshift-machine-api" ) func TestScaleMachinePool(t *testing.T) { @@ -52,27 +57,23 @@ func TestScaleMachinePool(t *testing.T) { } c := common.MustGetClient() - machinePrefix, err := machineNamePrefix(cd, workerMachinePoolName) - require.NoError(t, err, "cannot determine machine name prefix") // Scale down - err = retry.RetryOnConflict(retry.DefaultRetry, func() error { + err := retry.RetryOnConflict(retry.DefaultRetry, func() error { pool := common.GetMachinePool(c, cd, workerMachinePoolName) require.NotNilf(t, pool, "worker machine pool does not exist: %s", workerMachinePoolName) logger = logger.WithField("pool", pool.Name) - logger.Infof("expected Machine name prefix: %s", machinePrefix) - logger.Info("scaling pool to 1 replicas") pool.Spec.Replicas = ptr.To(int64(1)) return c.Update(context.TODO(), pool) }) require.NoError(t, err, "cannot update worker machine pool to reduce replicas") - err = waitForMachines(logger, cfg, cd, machinePrefix, 1) + err = waitForMachines(logger, cfg, cd, workerMachinePoolName, 1) require.NoError(t, err, "timed 
out waiting for machines to be scaled down") - err = waitForNodes(logger, cfg, cd, machinePrefix, 1) + err = waitForNodes(logger, cfg, cd, workerMachinePoolName, 1) require.NoError(t, err, "timed out waiting for nodes to be scaled down") // Scale up @@ -86,10 +87,10 @@ func TestScaleMachinePool(t *testing.T) { }) require.NoError(t, err, "cannot update worker machine pool to increase replicas") - err = waitForMachines(logger, cfg, cd, machinePrefix, 3) + err = waitForMachines(logger, cfg, cd, workerMachinePoolName, 3) require.NoError(t, err, "timed out waiting for machines to be scaled up") - err = waitForNodes(logger, cfg, cd, machinePrefix, 3) + err = waitForNodes(logger, cfg, cd, workerMachinePoolName, 3) require.NoError(t, err, "timed out waiting for nodes to be scaled up") } @@ -162,17 +163,11 @@ func TestNewMachinePool(t *testing.T) { err := c.Create(context.TODO(), infraMachinePool) require.NoError(t, err, "cannot create infra machine pool") - machinePrefix, err := machineNamePrefix(cd, infraMachinePoolName) - require.NoError(t, err, "cannot find/calculate machine name prefix") - logger.Infof("expected Machine name prefix: %s", machinePrefix) - - // Wait for machines to be created t.Logf("Waiting for 3 infra machines to be created") - err = waitForMachines(logger, cfg, cd, machinePrefix, 3) + err = waitForMachines(logger, cfg, cd, infraMachinePoolName, 3) require.NoError(t, err, "timed out waiting for machines to be created") - err = waitForNodes(logger, cfg, cd, machinePrefix, 3, - // Ensure that labels were applied to the nodes + err = waitForNodes(logger, cfg, cd, infraMachinePoolName, 3, func(node *corev1.Node) bool { if machineType := node.Labels["openshift.io/machine-type"]; machineType != infraMachinePoolName { t.Logf("Did not find expected label in node") @@ -205,7 +200,7 @@ func TestNewMachinePool(t *testing.T) { cfg, func(machineSets []*machinev1.MachineSet) bool { for _, ms := range machineSets { - if strings.HasPrefix(ms.Name, machinePrefix) { + if ms.Labels[hiveMachinePoolKey] == infraMachinePoolName { return false } } @@ -277,9 +272,6 @@ func TestAutoscalingMachinePool(t *testing.T) { require.NoError(t, err, "cannot update worker machine pool to reduce replicas") logger = logger.WithField("pool", pool.Name) - machinePrefix, err := machineNamePrefix(cd, workerMachinePoolName) - require.NoError(t, err, "cannot find/calculate machine name prefix") - logger.Info("lowering autoscaler delay so scaling down happens faster") clusterAutoscaler := &autoscalingv1.ClusterAutoscaler{} poll: @@ -298,14 +290,12 @@ poll: machineSetList := &machinev1.MachineSetList{} rc.List(context.Background(), machineSetList) for _, machineSet := range machineSetList.Items { - // Only check machinesets that belong to this worker pool - poolLabel, hasPoolLabel := machineSet.Labels["hive.openshift.io/machine-pool"] + poolLabel, hasPoolLabel := machineSet.Labels[hiveMachinePoolKey] if !hasPoolLabel || poolLabel != pool.Spec.Name { continue } - // Check labels require.Equal(t, "true", machineSet.Labels[constants.HiveManagedLabel], "Incorrect hive managed label on machineset") - require.Equal(t, pool.Spec.Name, machineSet.Labels["hive.openshift.io/machine-pool"], "Incorrect machine pool label on machineset") + require.Equal(t, pool.Spec.Name, machineSet.Labels[hiveMachinePoolKey], "Incorrect machine pool label on machineset") } if clusterAutoscaler.Name == "" { @@ -380,9 +370,9 @@ poll: err = rc.Create(context.TODO(), busyboxDeployment) require.NoError(t, err, "cannot create busybox deployment") - err = 
waitForMachines(logger, cfg, cd, machinePrefix, maxReplicas) + err = waitForMachines(logger, cfg, cd, workerMachinePoolName, maxReplicas) require.NoError(t, err, "timed out waiting for machines to be created") - err = waitForNodes(logger, cfg, cd, machinePrefix, maxReplicas) + err = waitForNodes(logger, cfg, cd, workerMachinePoolName, maxReplicas) require.NoError(t, err, "timed out waiting for nodes to be created") // Scale down @@ -392,9 +382,9 @@ poll: logger.Info("deleting busybox deployment to relieve cpu pressure and scale down machines") err = rc.Delete(context.TODO(), busyboxDeployment, client.PropagationPolicy(metav1.DeletePropagationForeground)) require.NoError(t, err, "could not delete busybox deployment") - err = waitForMachines(logger, cfg, cd, machinePrefix, minReplicas) + err = waitForMachines(logger, cfg, cd, workerMachinePoolName, minReplicas) require.NoError(t, err, "timed out waiting for machine count") - err = waitForNodes(logger, cfg, cd, machinePrefix, minReplicas) + err = waitForNodes(logger, cfg, cd, workerMachinePoolName, minReplicas) require.NoError(t, err, "timed out waiting for nodes to be created") logger.Info("disabling autoscaling") @@ -407,45 +397,57 @@ poll: return c.Update(context.TODO(), pool) }) require.NoError(t, err, "cannot update worker machine pool to turn off auto-scaling") - err = waitForMachines(logger, cfg, cd, machinePrefix, 3) + err = waitForMachines(logger, cfg, cd, workerMachinePoolName, 3) require.NoError(t, err, "timed out waiting for machines to be created") - err = waitForNodes(logger, cfg, cd, machinePrefix, 3) + err = waitForNodes(logger, cfg, cd, workerMachinePoolName, 3) require.NoError(t, err, "timed out waiting for nodes to be created") } -func waitForMachines(logger log.FieldLogger, cfg *rest.Config, cd *hivev1.ClusterDeployment, machinePrefix string, expectedReplicas int) error { - logger.Infof("waiting for %d machines with prefix '%s'", expectedReplicas, machinePrefix) +func waitForMachines(logger log.FieldLogger, cfg *rest.Config, cd *hivev1.ClusterDeployment, poolName string, expectedReplicas int) error { + infraID := cd.Spec.ClusterMetadata.InfraID + logger.Infof("waiting for %d machines (pool %s, infraID %s)", expectedReplicas, poolName, infraID) lastCount := 0 return common.WaitForMachines(cfg, func(machines []*machinev1.Machine) bool { count := 0 for _, m := range machines { - if strings.HasPrefix(m.Name, machinePrefix) { + if m.Labels[machinev1.MachineClusterIDLabel] == infraID && m.Labels[capiMachineTypeKey] == poolName { count++ } } if count != lastCount { - logger.Infof("found %d machines with prefix '%s'", count, machinePrefix) + logger.Infof("found %d machines for pool %s", count, poolName) lastCount = count } return count == expectedReplicas }, 20*time.Minute) } -func waitForNodes(logger log.FieldLogger, cfg *rest.Config, cd *hivev1.ClusterDeployment, machinePrefix string, expectedReplicas int, extraChecks ...func(node *corev1.Node) bool) error { - logger.Infof("waiting for %d nodes with machine annotation prefix '%s'", expectedReplicas, machinePrefix) +func waitForNodes(logger log.FieldLogger, cfg *rest.Config, cd *hivev1.ClusterDeployment, poolName string, expectedReplicas int, extraChecks ...func(node *corev1.Node) bool) error { + infraID := cd.Spec.ClusterMetadata.InfraID + logger.Infof("waiting for %d nodes (pool %s)", expectedReplicas, poolName) + rc := common.MustGetClientFromConfig(cfg) return common.WaitForNodes(cfg, func(nodes []*corev1.Node) bool { + machineList := &machinev1.MachineList{} + err := 
rc.List(context.TODO(), machineList, client.InNamespace(openshiftMachineAPI), + client.MatchingLabels{machinev1.MachineClusterIDLabel: infraID, capiMachineTypeKey: poolName}) + if err != nil { + return false + } + poolMachineNames := sets.New[string]() + for i := range machineList.Items { + poolMachineNames.Insert(machineList.Items[i].Name) + } poolNodes := []*corev1.Node{} for _, n := range nodes { if n.Annotations == nil { continue } machineAnnotation := n.Annotations["machine.openshift.io/machine"] - name := strings.Split(machineAnnotation, "/") - if len(name) < 2 { + parts := strings.Split(machineAnnotation, "/") + if len(parts) < 2 { continue } - machineName := name[1] - if strings.HasPrefix(machineName, machinePrefix) { + if poolMachineNames.Has(parts[1]) { poolNodes = append(poolNodes, n) } } @@ -465,7 +467,3 @@ func waitForNodes(logger log.FieldLogger, cfg *rest.Config, cd *hivev1.ClusterDe return true }, 15*time.Minute) } - -func machineNamePrefix(cd *hivev1.ClusterDeployment, poolName string) (string, error) { - return fmt.Sprintf("%s-%s-", cd.Spec.ClusterMetadata.InfraID, poolName), nil -}
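
Reviewer sketch (not part of the patch): the machine-matching predicate that replaces the old name-prefix check can be exercised standalone. Below is a minimal Go sketch of the same label filter, using plain maps instead of the machine-api types; the two label keys come straight from the patch, while the infraID, pool, and machine names are hypothetical.

package main

import "fmt"

// Label keys as defined in the patch; every other value below is hypothetical.
const (
	machineClusterIDLabel = "machine.openshift.io/cluster-api-cluster"
	capiMachineTypeKey    = "machine.openshift.io/cluster-api-machine-type"
)

// countPoolMachines mirrors the filter in waitForMachines: a machine is
// counted for a pool when both its cluster-ID and machine-type labels match,
// so the result no longer depends on the machine's name.
func countPoolMachines(labelsByMachine map[string]map[string]string, infraID, poolName string) int {
	count := 0
	for _, labels := range labelsByMachine {
		if labels[machineClusterIDLabel] == infraID && labels[capiMachineTypeKey] == poolName {
			count++
		}
	}
	return count
}

func main() {
	machines := map[string]map[string]string{
		"mycluster-x7k2p-infra-us-east-1a-abcde": {
			machineClusterIDLabel: "mycluster-x7k2p",
			capiMachineTypeKey:    "infra",
		},
		"mycluster-x7k2p-worker-us-east-1a-fghij": {
			machineClusterIDLabel: "mycluster-x7k2p",
			capiMachineTypeKey:    "worker",
		},
	}
	fmt.Println(countPoolMachines(machines, "mycluster-x7k2p", "infra")) // prints 1
}

The MachineSet ownership check in TestNewMachinePool follows the same pattern, keying on the hive.openshift.io/machine-pool label instead of a name prefix.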
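The node side of the change works differently: waitForNodes maps nodes back to pool machines through the machine.openshift.io/machine annotation, whose value has the form <namespace>/<machine-name>, and checks the machine-name half against the set built from the label query above. A sketch of that lookup against a precomputed name set, again with hypothetical node and machine names:

package main

import (
	"fmt"
	"strings"
)

// Annotation key as used in the patch; node and machine names are hypothetical.
const machineAnnotation = "machine.openshift.io/machine"

// nodesInPool mirrors waitForNodes: a node belongs to the pool when the
// machine-name half of its machine annotation appears in the set of machine
// names already selected by the label query.
func nodesInPool(nodeAnnotations map[string]map[string]string, poolMachineNames map[string]bool) []string {
	var matched []string
	for node, annotations := range nodeAnnotations {
		// Annotation value is "<namespace>/<machine-name>".
		parts := strings.Split(annotations[machineAnnotation], "/")
		if len(parts) < 2 {
			continue
		}
		if poolMachineNames[parts[1]] {
			matched = append(matched, node)
		}
	}
	return matched
}

func main() {
	poolMachines := map[string]bool{"mycluster-x7k2p-infra-us-east-1a-abcde": true}
	nodes := map[string]map[string]string{
		"ip-10-0-1-23.ec2.internal": {machineAnnotation: "openshift-machine-api/mycluster-x7k2p-infra-us-east-1a-abcde"},
		"ip-10-0-2-45.ec2.internal": {machineAnnotation: "openshift-machine-api/mycluster-x7k2p-worker-us-east-1a-fghij"},
	}
	fmt.Println(nodesInPool(nodes, poolMachines)) // [ip-10-0-1-23.ec2.internal]
}

Listing machines inside the poll callback, as the patch does, means each poll observes the current set of pool machines, so nodes backed by machines created or deleted mid-wait are classified correctly.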