From d92994f3ba1f26b2c54fa898592a778fb75d81bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Svensson?= Date: Fri, 29 May 2026 09:26:20 +0200 Subject: [PATCH] fix: check primary node first when resolving shard index MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During scale-in, shardIndexFromState could match a stale replica from a drained shard that temporarily appears in a remaining shard via gossip before CLUSTER FORGET propagates. This returns the wrong shard index, causing the controller to drain the wrong shards — e.g. draining 3 of 4 instead of 1, leading to an unrecoverable Reconciling state. Fix by checking the primary node first. The primary is the authoritative slot owner and its ValkeyNode CR always has the correct shard-index label. Stale replicas from drained shards are never primaries of remaining shards. Signed-off-by: Björn Svensson --- internal/controller/valkeycluster_controller.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/internal/controller/valkeycluster_controller.go b/internal/controller/valkeycluster_controller.go index 0939ed26..8f0c9d86 100644 --- a/internal/controller/valkeycluster_controller.go +++ b/internal/controller/valkeycluster_controller.go @@ -1207,9 +1207,16 @@ func (r *ValkeyClusterReconciler) deleteExcessValkeyNodes(ctx context.Context, c return deleted, nil } -// shardIndexFromState determines the pod shard-index for a given Valkey shard -// by matching any of its nodes' addresses to ValkeyNode labels. +// shardIndexFromState determines the shard index for a given Valkey Cluster +// shard by matching its node addresses to ValkeyNode shard-index labels. It checks +// the primary first, falling back to any node if unavailable. func shardIndexFromState(shard *valkey.ShardState, nodes *valkeyiov1alpha1.ValkeyNodeList) int { + if primary := shard.GetPrimaryNode(); primary != nil { + _, shardIndex := nodeRoleAndShard(primary.Address, nodes) + if shardIndex >= 0 { + return shardIndex + } + } for _, node := range shard.Nodes { _, shardIndex := nodeRoleAndShard(node.Address, nodes) if shardIndex >= 0 {