Skip to content

Commit

Permalink
feat: validate the length of RayCluster name and worker group names
Browse files Browse the repository at this point in the history
Signed-off-by: Rueian <rueiancsie@gmail.com>
  • Loading branch information
rueian committed Feb 20, 2025
1 parent 349ab01 commit 1032d24
Show file tree
Hide file tree
Showing 5 changed files with 182 additions and 3 deletions.
2 changes: 1 addition & 1 deletion ray-operator/controllers/ray/common/ingress.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ func BuildIngressForHeadService(ctx context.Context, cluster rayv1.RayCluster) (

labels := map[string]string{
utils.RayClusterLabelKey: cluster.Name,
utils.RayIDLabelKey: utils.GenerateIdentifier(cluster.Name, rayv1.HeadNode),
utils.RayIDLabelKey: utils.CheckLabel(utils.GenerateIdentifier(cluster.Name, rayv1.HeadNode)),
utils.KubernetesApplicationNameLabelKey: utils.ApplicationName,
utils.KubernetesCreatedByLabelKey: utils.ComponentName,
}
Expand Down
2 changes: 1 addition & 1 deletion ray-operator/controllers/ray/common/route.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ import (
func BuildRouteForHeadService(cluster rayv1.RayCluster) (*routev1.Route, error) {
labels := map[string]string{
utils.RayClusterLabelKey: cluster.Name,
utils.RayIDLabelKey: utils.GenerateIdentifier(cluster.Name, rayv1.HeadNode),
utils.RayIDLabelKey: utils.CheckLabel(utils.GenerateIdentifier(cluster.Name, rayv1.HeadNode)),
utils.KubernetesApplicationNameLabelKey: utils.ApplicationName,
utils.KubernetesCreatedByLabelKey: utils.ComponentName,
}
Expand Down
6 changes: 6 additions & 0 deletions ray-operator/controllers/ray/utils/validation.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,17 @@ func ValidateRayClusterStatus(instance *rayv1.RayCluster) error {

// Validation for invalid Ray Cluster configurations.
func ValidateRayClusterSpec(instance *rayv1.RayCluster) error {
if len(instance.Name) > 63 {
return fmt.Errorf("RayCluster name should be no more than 63 characters")
}
if len(instance.Spec.HeadGroupSpec.Template.Spec.Containers) == 0 {
return fmt.Errorf("headGroupSpec should have at least one container")
}

for _, workerGroup := range instance.Spec.WorkerGroupSpecs {
if len(workerGroup.GroupName) > 63 {
return fmt.Errorf("group name should be no more than 63 characters")
}
if len(workerGroup.Template.Spec.Containers) == 0 {
return fmt.Errorf("workerGroupSpec should have at least one container")
}
Expand Down
81 changes: 81 additions & 0 deletions ray-operator/controllers/ray/utils/validation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package utils

import (
"fmt"
"strings"
"testing"

"github.com/stretchr/testify/assert"
Expand Down Expand Up @@ -373,6 +374,86 @@ func TestValidateRayClusterSpecRedisUsername(t *testing.T) {
}
}

func TestValidateRayClusterSpecNames(t *testing.T) {
tests := []struct {
rayCluster *rayv1.RayCluster
name string
errorMessage string
expectError bool
}{
{
name: "RayCluster name is too long (> 63 characters)",
rayCluster: &rayv1.RayCluster{
ObjectMeta: metav1.ObjectMeta{
Name: strings.Repeat("A", 64),
},
},
expectError: true,
errorMessage: "RayCluster name should be no more than 63 characters",
},
{
name: "Worker group name is too long (> 63 characters)",
rayCluster: &rayv1.RayCluster{
Spec: rayv1.RayClusterSpec{
HeadGroupSpec: rayv1.HeadGroupSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{Name: "ray-head"}},
},
},
},
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
{
GroupName: strings.Repeat("A", 64),
},
},
},
},
expectError: true,
errorMessage: "group name should be no more than 63 characters",
},
{
name: "Both RayCluster name and Worker group name are ok (== 63 characters)",
rayCluster: &rayv1.RayCluster{
ObjectMeta: metav1.ObjectMeta{
Name: strings.Repeat("A", 63),
},
Spec: rayv1.RayClusterSpec{
HeadGroupSpec: rayv1.HeadGroupSpec{
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{Name: "ray-head"}},
},
},
},
WorkerGroupSpecs: []rayv1.WorkerGroupSpec{
{
GroupName: strings.Repeat("A", 63),
Template: corev1.PodTemplateSpec{
Spec: corev1.PodSpec{
Containers: []corev1.Container{{Name: "ray-worker"}},
},
},
},
},
},
},
expectError: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := ValidateRayClusterSpec(tt.rayCluster)
if tt.expectError {
require.Error(t, err)
assert.EqualError(t, err, tt.errorMessage)
} else {
require.NoError(t, err)
}
})
}
}

func TestValidateRayClusterSpecEmptyContainers(t *testing.T) {
headGroupSpecWithOneContainer := rayv1.HeadGroupSpec{
Template: corev1.PodTemplateSpec{
Expand Down
94 changes: 93 additions & 1 deletion ray-operator/test/e2e/raycluster_gcs_ft_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package e2e

import (
"strings"
"testing"
"time"

Expand All @@ -21,7 +22,7 @@ const (
redisAddress = "redis:6379"
)

func TestRayClusterGCSFaultTolerence(t *testing.T) {
func TestRayClusterGCSFaultTolerance(t *testing.T) {
test := With(t)
g := NewWithT(t)

Expand Down Expand Up @@ -128,6 +129,97 @@ func TestRayClusterGCSFaultTolerence(t *testing.T) {
})
}

func TestRayClusterGCSFTWithMaximumNames(t *testing.T) {
test := With(t)
g := NewWithT(t)

// Create a namespace
namespace := test.NewTestNamespace()
testScriptAC := newConfigMap(namespace.Name, files(test, "test_detached_actor_1.py", "test_detached_actor_2.py"))
testScript, err := test.Client().Core().CoreV1().ConfigMaps(namespace.Name).Apply(test.Ctx(), testScriptAC, TestApplyOptions)
g.Expect(err).NotTo(HaveOccurred())

test.T().Run("Test Maximum Cluster Name and Group name (63 characters)", func(_ *testing.T) {
maximumRayClusterName := strings.Repeat("r", 63)
maximumWorkerGroupName := strings.Repeat("w", 63)
strings.Repeat("w", 63)

checkRedisDBSize := deployRedis(test, namespace.Name, redisPassword)
defer g.Eventually(checkRedisDBSize, time.Second*30, time.Second).Should(BeEquivalentTo("0"))

rayClusterSpecAC := rayv1ac.RayClusterSpec().
WithGcsFaultToleranceOptions(
rayv1ac.GcsFaultToleranceOptions().
WithRedisAddress(redisAddress).
WithRedisPassword(rayv1ac.RedisCredential().WithValue(redisPassword)),
).
WithRayVersion(GetRayVersion()).
WithHeadGroupSpec(rayv1ac.HeadGroupSpec().
WithRayStartParams(map[string]string{
"num-cpus": "0",
}).
WithTemplate(headPodTemplateApplyConfiguration()),
).
WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec().
WithRayStartParams(map[string]string{
"num-cpus": "1",
}).
WithGroupName(maximumWorkerGroupName).
WithReplicas(1).
WithMinReplicas(1).
WithMaxReplicas(2).
WithTemplate(workerPodTemplateApplyConfiguration()),
)
rayClusterAC := rayv1ac.RayCluster(maximumRayClusterName, namespace.Name).
WithSpec(apply(rayClusterSpecAC, mountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](testScript, "/home/ray/samples")))

rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Apply(test.Ctx(), rayClusterAC, TestApplyOptions)

g.Expect(err).NotTo(HaveOccurred())
LogWithTimestamp(test.T(), "Created RayCluster %s/%s successfully", rayCluster.Namespace, rayCluster.Name)

LogWithTimestamp(test.T(), "Waiting for RayCluster %s/%s to become ready", rayCluster.Namespace, rayCluster.Name)
g.Eventually(RayCluster(test, namespace.Name, rayCluster.Name), TestTimeoutLong).
Should(WithTransform(StatusCondition(rayv1.RayClusterProvisioned), MatchCondition(metav1.ConditionTrue, rayv1.AllPodRunningAndReadyFirstTime)))

headPod, err := GetHeadPod(test, rayCluster)
g.Expect(err).NotTo(HaveOccurred())

LogWithTimestamp(test.T(), "HeadPod Name: %s", headPod.Name)

rayNamespace := "testing-ray-namespace"
LogWithTimestamp(test.T(), "Ray namespace: %s", rayNamespace)

ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "samples/test_detached_actor_1.py", rayNamespace})

expectedOutput := "3"
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "samples/test_detached_actor_2.py", rayNamespace, expectedOutput})

// Test 1: Delete the head Pod
err = test.Client().Core().CoreV1().Pods(namespace.Name).Delete(test.Ctx(), headPod.Name, metav1.DeleteOptions{})
g.Expect(err).NotTo(HaveOccurred())

PodUID := func(p *corev1.Pod) string { return string(p.UID) }
g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium).
ShouldNot(WithTransform(PodUID, Equal(string(headPod.UID)))) // Use UID to check if the new head pod is created.

// Pod Status should eventually become Running
PodState := func(p *corev1.Pod) string { return string(p.Status.Phase) }
g.Eventually(HeadPod(test, rayCluster), TestTimeoutMedium).
Should(WithTransform(PodState, Equal("Running")))

headPod, err = GetHeadPod(test, rayCluster) // Replace the old head pod
g.Expect(err).NotTo(HaveOccurred())

expectedOutput = "4"

ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "samples/test_detached_actor_2.py", rayNamespace, expectedOutput})

err = test.Client().Ray().RayV1().RayClusters(namespace.Name).Delete(test.Ctx(), rayCluster.Name, metav1.DeleteOptions{})
g.Expect(err).NotTo(HaveOccurred())
})
}

func TestGcsFaultToleranceOptions(t *testing.T) {
// Each test uses a separate namespace to utilize different Redis instances
// for better isolation.
Expand Down

0 comments on commit 1032d24

Please sign in to comment.