Skip to content

Commit 5282a64

Browse files
committed
migrate autoscaler v2 to use spec field then fallback to env var
Signed-off-by: alimaazamat <[email protected]>
1 parent ad365ba commit 5282a64

File tree

5 files changed

+116
-31
lines changed

5 files changed

+116
-31
lines changed

ray-operator/controllers/ray/utils/validation.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -609,10 +609,17 @@ func validateWorkerGroupIdleTimeout(workerGroup rayv1.WorkerGroupSpec, spec *ray
609609
}
610610

611611
// idleTimeoutSeconds only allowed on autoscaler v2
612+
// Check spec field first (>= 1.4.0), then fall back to env var (< 1.4.0)
613+
if IsAutoscalingV2Enabled(spec) {
614+
return nil
615+
}
616+
612617
envVar, exists := EnvVarByName(RAY_ENABLE_AUTOSCALER_V2, spec.HeadGroupSpec.Template.Spec.Containers[RayContainerIndex].Env)
613-
if !exists || (envVar.Value != "1" && envVar.Value != "true") {
614-
return fmt.Errorf("worker group %s has idleTimeoutSeconds set, but %s environment variable is not set to 'true' in the head pod", workerGroup.GroupName, RAY_ENABLE_AUTOSCALER_V2)
618+
if exists && (envVar.Value == "1" || envVar.Value == "true") {
619+
return nil
615620
}
621+
622+
return fmt.Errorf("worker group %s has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' or set %s environment variable to 'true' in the head pod", workerGroup.GroupName, RAY_ENABLE_AUTOSCALER_V2)
616623
}
617624

618625
return nil

ray-operator/controllers/ray/utils/validation_test.go

Lines changed: 45 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1891,10 +1891,11 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) {
18911891
"should accept worker group with valid idleTimeoutSeconds": {
18921892
spec: rayv1.RayClusterSpec{
18931893
EnableInTreeAutoscaling: ptr.To(true),
1894+
AutoscalerOptions: &rayv1.AutoscalerOptions{
1895+
Version: ptr.To(rayv1.AutoscalerVersionV2),
1896+
},
18941897
HeadGroupSpec: rayv1.HeadGroupSpec{
1895-
Template: podTemplateSpec([]corev1.EnvVar{
1896-
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"},
1897-
}, nil),
1898+
Template: podTemplateSpec(nil, nil),
18981899
},
18991900
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
19001901
{
@@ -1911,10 +1912,11 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) {
19111912
"should reject negative idleTimeoutSeconds": {
19121913
spec: rayv1.RayClusterSpec{
19131914
EnableInTreeAutoscaling: ptr.To(true),
1915+
AutoscalerOptions: &rayv1.AutoscalerOptions{
1916+
Version: ptr.To(rayv1.AutoscalerVersionV2),
1917+
},
19141918
HeadGroupSpec: rayv1.HeadGroupSpec{
1915-
Template: podTemplateSpec([]corev1.EnvVar{
1916-
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"},
1917-
}, nil),
1919+
Template: podTemplateSpec(nil, nil),
19181920
},
19191921
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
19201922
{
@@ -1931,10 +1933,11 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) {
19311933
"should accept zero idleTimeoutSeconds": {
19321934
spec: rayv1.RayClusterSpec{
19331935
EnableInTreeAutoscaling: ptr.To(true),
1936+
AutoscalerOptions: &rayv1.AutoscalerOptions{
1937+
Version: ptr.To(rayv1.AutoscalerVersionV2),
1938+
},
19341939
HeadGroupSpec: rayv1.HeadGroupSpec{
1935-
Template: podTemplateSpec([]corev1.EnvVar{
1936-
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"},
1937-
}, nil),
1940+
Template: podTemplateSpec(nil, nil),
19381941
},
19391942
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
19401943
{
@@ -1951,6 +1954,9 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) {
19511954
"should reject idleTimeoutSeconds when autoscaler version is not v2": {
19521955
spec: rayv1.RayClusterSpec{
19531956
EnableInTreeAutoscaling: ptr.To(true),
1957+
AutoscalerOptions: &rayv1.AutoscalerOptions{
1958+
Version: ptr.To(rayv1.AutoscalerVersionV1),
1959+
},
19541960
HeadGroupSpec: rayv1.HeadGroupSpec{
19551961
Template: podTemplateSpec(nil, nil),
19561962
},
@@ -1964,11 +1970,14 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) {
19641970
},
19651971
},
19661972
},
1967-
expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod",
1973+
expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' or set %s environment variable to 'true' in the head pod", RAY_ENABLE_AUTOSCALER_V2),
19681974
},
19691975
"should reject idleTimeoutSeconds when autoscaler version is not set": {
19701976
spec: rayv1.RayClusterSpec{
19711977
EnableInTreeAutoscaling: ptr.To(true),
1978+
AutoscalerOptions: &rayv1.AutoscalerOptions{
1979+
Version: nil,
1980+
},
19721981
HeadGroupSpec: rayv1.HeadGroupSpec{
19731982
Template: podTemplateSpec(nil, nil),
19741983
},
@@ -1982,7 +1991,7 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) {
19821991
},
19831992
},
19841993
},
1985-
expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod",
1994+
expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' or set %s environment variable to 'true' in the head pod", RAY_ENABLE_AUTOSCALER_V2),
19861995
},
19871996
"should reject idleTimeoutSeconds when AutoscalerOptions is nil": {
19881997
spec: rayv1.RayClusterSpec{
@@ -2000,14 +2009,14 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) {
20002009
},
20012010
},
20022011
},
2003-
expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod",
2012+
expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' or set %s environment variable to 'true' in the head pod", RAY_ENABLE_AUTOSCALER_V2),
20042013
},
2005-
"should reject idleTimeoutSeconds when env var is set to invalid value": {
2014+
"should accept worker group with idleTimeoutSeconds when env var is set to '1' (legacy < 1.4.0)": {
20062015
spec: rayv1.RayClusterSpec{
20072016
EnableInTreeAutoscaling: ptr.To(true),
20082017
HeadGroupSpec: rayv1.HeadGroupSpec{
20092018
Template: podTemplateSpec([]corev1.EnvVar{
2010-
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "false"},
2019+
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "1"},
20112020
}, nil),
20122021
},
20132022
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
@@ -2020,9 +2029,9 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) {
20202029
},
20212030
},
20222031
},
2023-
expectedErr: "worker group worker-group-1 has idleTimeoutSeconds set, but RAY_enable_autoscaler_v2 environment variable is not set to 'true' in the head pod",
2032+
expectedErr: "",
20242033
},
2025-
"should accept worker group with idleTimeoutSeconds when env var is set to true": {
2034+
"should accept worker group with idleTimeoutSeconds when env var is set to 'true' (legacy < 1.4.0)": {
20262035
spec: rayv1.RayClusterSpec{
20272036
EnableInTreeAutoscaling: ptr.To(true),
20282037
HeadGroupSpec: rayv1.HeadGroupSpec{
@@ -2042,6 +2051,26 @@ func TestValidateWorkerGroupIdleTimeout(t *testing.T) {
20422051
},
20432052
expectedErr: "",
20442053
},
2054+
"should reject idleTimeoutSeconds when env var is set to 'false'": {
2055+
spec: rayv1.RayClusterSpec{
2056+
EnableInTreeAutoscaling: ptr.To(true),
2057+
HeadGroupSpec: rayv1.HeadGroupSpec{
2058+
Template: podTemplateSpec([]corev1.EnvVar{
2059+
{Name: RAY_ENABLE_AUTOSCALER_V2, Value: "false"},
2060+
}, nil),
2061+
},
2062+
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
2063+
{
2064+
GroupName: "worker-group-1",
2065+
Template: podTemplateSpec(nil, nil),
2066+
IdleTimeoutSeconds: ptr.To(int32(60)),
2067+
MinReplicas: ptr.To(int32(0)),
2068+
MaxReplicas: ptr.To(int32(10)),
2069+
},
2070+
},
2071+
},
2072+
expectedErr: fmt.Sprintf("worker group worker-group-1 has idleTimeoutSeconds set, but autoscaler v2 is not enabled. Please set .spec.autoscalerOptions.version to 'v2' or set %s environment variable to 'true' in the head pod", RAY_ENABLE_AUTOSCALER_V2),
2073+
},
20452074
"should accept worker group without idleTimeoutSeconds and without autoscaler v2": {
20462075
spec: rayv1.RayClusterSpec{
20472076
EnableInTreeAutoscaling: ptr.To(true),

ray-operator/test/e2eautoscaler/raycluster_autoscaler_part2_test.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ func TestRayClusterAutoscalerV2IdleTimeout(t *testing.T) {
4242
rayClusterSpecAC := rayv1ac.RayClusterSpec().
4343
WithEnableInTreeAutoscaling(true).
4444
WithRayVersion(GetRayVersion()).
45+
WithAutoscalerOptions(rayv1ac.AutoscalerOptions().
46+
WithVersion(rayv1.AutoscalerVersionV2)).
4547
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
4648
WithRayStartParams(map[string]string{"num-cpus": "0"}).
4749
WithTemplate(tc.HeadPodTemplateGetter())).

ray-operator/test/e2eautoscaler/raycluster_autoscaler_test.go

Lines changed: 60 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ import (
1515
)
1616

1717
func TestRayClusterAutoscaler(t *testing.T) {
18-
for _, tc := range tests {
18+
for i, tc := range tests {
1919
t.Run(tc.name, func(t *testing.T) {
2020
test := With(t)
2121
g := gomega.NewWithT(t)
@@ -31,7 +31,15 @@ func TestRayClusterAutoscaler(t *testing.T) {
3131

3232
rayClusterSpecAC := rayv1ac.RayClusterSpec().
3333
WithEnableInTreeAutoscaling(true).
34-
WithRayVersion(GetRayVersion()).
34+
WithRayVersion(GetRayVersion())
35+
36+
// Add autoscaler version for V2 test (tests[1])
37+
if i == 1 {
38+
rayClusterSpecAC = rayClusterSpecAC.WithAutoscalerOptions(
39+
rayv1ac.AutoscalerOptions().WithVersion(rayv1.AutoscalerVersionV2))
40+
}
41+
42+
rayClusterSpecAC = rayClusterSpecAC.
3543
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
3644
WithRayStartParams(map[string]string{"num-cpus": "0"}).
3745
WithTemplate(tc.HeadPodTemplateGetter())).
@@ -82,7 +90,7 @@ func TestRayClusterAutoscaler(t *testing.T) {
8290
}
8391

8492
func TestRayClusterAutoscalerWithFakeGPU(t *testing.T) {
85-
for _, tc := range tests {
93+
for i, tc := range tests {
8694
t.Run(tc.name, func(t *testing.T) {
8795
test := With(t)
8896
g := gomega.NewWithT(t)
@@ -98,7 +106,15 @@ func TestRayClusterAutoscalerWithFakeGPU(t *testing.T) {
98106

99107
rayClusterSpecAC := rayv1ac.RayClusterSpec().
100108
WithEnableInTreeAutoscaling(true).
101-
WithRayVersion(GetRayVersion()).
109+
WithRayVersion(GetRayVersion())
110+
111+
// Add autoscaler version for V2 test (tests[1])
112+
if i == 1 {
113+
rayClusterSpecAC = rayClusterSpecAC.WithAutoscalerOptions(
114+
rayv1ac.AutoscalerOptions().WithVersion(rayv1.AutoscalerVersionV2))
115+
}
116+
117+
rayClusterSpecAC = rayClusterSpecAC.
102118
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
103119
WithRayStartParams(map[string]string{"num-cpus": "0"}).
104120
WithTemplate(tc.HeadPodTemplateGetter())).
@@ -142,7 +158,7 @@ func TestRayClusterAutoscalerWithFakeGPU(t *testing.T) {
142158
}
143159

144160
func TestRayClusterAutoscalerWithCustomResource(t *testing.T) {
145-
for _, tc := range tests {
161+
for i, tc := range tests {
146162
t.Run(tc.name, func(t *testing.T) {
147163
test := With(t)
148164
g := gomega.NewWithT(t)
@@ -160,7 +176,15 @@ func TestRayClusterAutoscalerWithCustomResource(t *testing.T) {
160176

161177
rayClusterSpecAC := rayv1ac.RayClusterSpec().
162178
WithEnableInTreeAutoscaling(true).
163-
WithRayVersion(GetRayVersion()).
179+
WithRayVersion(GetRayVersion())
180+
181+
// Add autoscaler version for V2 test (tests[1])
182+
if i == 1 {
183+
rayClusterSpecAC = rayClusterSpecAC.WithAutoscalerOptions(
184+
rayv1ac.AutoscalerOptions().WithVersion(rayv1.AutoscalerVersionV2))
185+
}
186+
187+
rayClusterSpecAC = rayClusterSpecAC.
164188
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
165189
WithRayStartParams(map[string]string{"num-cpus": "0"}).
166190
WithTemplate(tc.HeadPodTemplateGetter())).
@@ -201,7 +225,7 @@ func TestRayClusterAutoscalerWithCustomResource(t *testing.T) {
201225
}
202226

203227
func TestRayClusterAutoscalerWithDesiredState(t *testing.T) {
204-
for _, tc := range tests {
228+
for i, tc := range tests {
205229
t.Run(tc.name, func(t *testing.T) {
206230
test := With(t)
207231
g := gomega.NewWithT(t)
@@ -222,7 +246,15 @@ func TestRayClusterAutoscalerWithDesiredState(t *testing.T) {
222246
groupName := "custom-resource-group"
223247
rayClusterSpecAC := rayv1ac.RayClusterSpec().
224248
WithEnableInTreeAutoscaling(true).
225-
WithRayVersion(GetRayVersion()).
249+
WithRayVersion(GetRayVersion())
250+
251+
// Add autoscaler version for V2 test (tests[1])
252+
if i == 1 {
253+
rayClusterSpecAC = rayClusterSpecAC.WithAutoscalerOptions(
254+
rayv1ac.AutoscalerOptions().WithVersion(rayv1.AutoscalerVersionV2))
255+
}
256+
257+
rayClusterSpecAC = rayClusterSpecAC.
226258
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
227259
WithRayStartParams(map[string]string{"num-cpus": "0"}).
228260
WithTemplate(tc.HeadPodTemplateGetter())).
@@ -263,7 +295,7 @@ func TestRayClusterAutoscalerWithDesiredState(t *testing.T) {
263295
}
264296

265297
func TestRayClusterAutoscalerMinReplicasUpdate(t *testing.T) {
266-
for _, tc := range tests {
298+
for i, tc := range tests {
267299
t.Run(tc.name, func(t *testing.T) {
268300
test := With(t)
269301
g := gomega.NewWithT(t)
@@ -281,7 +313,15 @@ func TestRayClusterAutoscalerMinReplicasUpdate(t *testing.T) {
281313

282314
rayClusterSpecAC := rayv1ac.RayClusterSpec().
283315
WithEnableInTreeAutoscaling(true).
284-
WithRayVersion(GetRayVersion()).
316+
WithRayVersion(GetRayVersion())
317+
318+
// Add autoscaler version for V2 test (tests[1])
319+
if i == 1 {
320+
rayClusterSpecAC = rayClusterSpecAC.WithAutoscalerOptions(
321+
rayv1ac.AutoscalerOptions().WithVersion(rayv1.AutoscalerVersionV2))
322+
}
323+
324+
rayClusterSpecAC = rayClusterSpecAC.
285325
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
286326
WithRayStartParams(map[string]string{"num-cpus": "0"}).
287327
WithTemplate(tc.HeadPodTemplateGetter())).
@@ -355,7 +395,7 @@ func TestRayClusterAutoscalerMaxReplicasUpdate(t *testing.T) {
355395
},
356396
}
357397

358-
for _, tc := range tests {
398+
for i, tc := range tests {
359399
for _, rtc := range replicaTests {
360400
t.Run(fmt.Sprintf("%s(%s)", tc.name, rtc.name), func(t *testing.T) {
361401
test := With(t)
@@ -371,7 +411,15 @@ func TestRayClusterAutoscalerMaxReplicasUpdate(t *testing.T) {
371411

372412
rayClusterSpecAC := rayv1ac.RayClusterSpec().
373413
WithEnableInTreeAutoscaling(true).
374-
WithRayVersion(GetRayVersion()).
414+
WithRayVersion(GetRayVersion())
415+
416+
// Add autoscaler version for V2 test (tests[1])
417+
if i == 1 {
418+
rayClusterSpecAC = rayClusterSpecAC.WithAutoscalerOptions(
419+
rayv1ac.AutoscalerOptions().WithVersion(rayv1.AutoscalerVersionV2))
420+
}
421+
422+
rayClusterSpecAC = rayClusterSpecAC.
375423
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
376424
WithRayStartParams(map[string]string{"num-cpus": "0"}).
377425
WithTemplate(tc.HeadPodTemplateGetter())).

ray-operator/test/e2eautoscaler/support.go

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ func headPodTemplateApplyConfigurationV2() *corev1ac.PodTemplateSpecApplyConfigu
114114
corev1ac.ContainerPort().WithName(utils.DashboardPortName).WithContainerPort(utils.DefaultDashboardPort),
115115
corev1ac.ContainerPort().WithName(utils.ClientPortName).WithContainerPort(utils.DefaultClientPort),
116116
).
117-
WithEnv(corev1ac.EnvVar().WithName(utils.RAY_ENABLE_AUTOSCALER_V2).WithValue("1")).
118117
WithResources(corev1ac.ResourceRequirements().
119118
WithRequests(corev1.ResourceList{
120119
corev1.ResourceCPU: resource.MustParse("1"),

0 commit comments

Comments
 (0)