From 13dc6727f51384292aaec7cd160840119499a68c Mon Sep 17 00:00:00 2001
From: EkinKarabulut
Date: Sun, 13 Jul 2025 11:50:13 +0200
Subject: [PATCH 01/10] [KAI integration] Adding integration and example yamls

---
 helm-chart/kuberay-operator/values.yaml       |  8 +-
 .../apis/config/v1alpha1/config_utils.go      |  3 +-
 .../apis/config/v1alpha1/config_utils_test.go | 11 +++
 .../config/v1alpha1/configuration_types.go    |  2 +-
 .../samples/ray-cluster.kai-gpu-sharing.yaml  | 75 +++++++++++++++++++
 .../samples/ray-cluster.kai-scheduler.yaml    | 70 +++++++++++++++++
 .../kai-scheduler/kai_scheduler.go            | 57 ++++++++++++++
 .../ray/batchscheduler/schedulermanager.go    |  3 +
 .../batchscheduler/schedulermanager_test.go   | 21 ++++++
 ray-operator/main.go                          |  2 +-
 10 files changed, 247 insertions(+), 5 deletions(-)
 create mode 100644 ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml
 create mode 100644 ray-operator/config/samples/ray-cluster.kai-scheduler.yaml
 create mode 100644 ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go

diff --git a/helm-chart/kuberay-operator/values.yaml b/helm-chart/kuberay-operator/values.yaml
index fa8be11047f..35d7c912eb0 100644
--- a/helm-chart/kuberay-operator/values.yaml
+++ b/helm-chart/kuberay-operator/values.yaml
@@ -70,12 +70,16 @@ logging:
 
 # 4. Use PodGroup
 # batchScheduler:
 #   name: scheduler-plugins
-#
+
+# 5. Use KAI Scheduler
+# batchScheduler:
+#   name: kai-scheduler
+
 batchScheduler:
   # Deprecated. This option will be removed in the future.
   # Note, for backwards compatibility. When it sets to true, it enables volcano scheduler integration.
   enabled: false
-  # Set the customized scheduler name, supported values are "volcano", "yunikorn" or "scheduler-plugins", do not set
+  # Set the customized scheduler name, supported values are "volcano", "yunikorn", "kai-scheduler" or "scheduler-plugins", do not set
   # "batchScheduler.enabled=true" at the same time as it will override this option.
name: "" diff --git a/ray-operator/apis/config/v1alpha1/config_utils.go b/ray-operator/apis/config/v1alpha1/config_utils.go index 53e4f02a581..e224c88cb96 100644 --- a/ray-operator/apis/config/v1alpha1/config_utils.go +++ b/ray-operator/apis/config/v1alpha1/config_utils.go @@ -5,6 +5,7 @@ import ( "github.com/go-logr/logr" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" schedulerplugins "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/scheduler-plugins" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn" @@ -23,7 +24,7 @@ func ValidateBatchSchedulerConfig(logger logr.Logger, config Configuration) erro if len(config.BatchScheduler) > 0 { // if a customized scheduler is configured, check it is supported - if config.BatchScheduler == volcano.GetPluginName() || config.BatchScheduler == yunikorn.GetPluginName() || config.BatchScheduler == schedulerplugins.GetPluginName() { + if config.BatchScheduler == volcano.GetPluginName() || config.BatchScheduler == yunikorn.GetPluginName() || config.BatchScheduler == schedulerplugins.GetPluginName() || config.BatchScheduler == kaischeduler.GetPluginName() { logger.Info("Feature flag batch-scheduler is enabled", "scheduler name", config.BatchScheduler) } else { diff --git a/ray-operator/apis/config/v1alpha1/config_utils_test.go b/ray-operator/apis/config/v1alpha1/config_utils_test.go index f6a73cbbbb0..c7bc960b43e 100644 --- a/ray-operator/apis/config/v1alpha1/config_utils_test.go +++ b/ray-operator/apis/config/v1alpha1/config_utils_test.go @@ -7,6 +7,7 @@ import ( "github.com/go-logr/logr/testr" schedulerPlugins "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/scheduler-plugins" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn" ) @@ -71,6 +72,16 @@ func TestValidateBatchSchedulerConfig(t *testing.T) { }, wantErr: false, }, + { + name: "valid option, batch-scheduler=kai-scheduler", + args: args{ + logger: testr.New(t), + config: Configuration{ + BatchScheduler: kaischeduler.GetPluginName(), + }, + }, + wantErr: false, + }, { name: "invalid option, invalid scheduler name", args: args{ diff --git a/ray-operator/apis/config/v1alpha1/configuration_types.go b/ray-operator/apis/config/v1alpha1/configuration_types.go index 975521242b9..c58864d85ff 100644 --- a/ray-operator/apis/config/v1alpha1/configuration_types.go +++ b/ray-operator/apis/config/v1alpha1/configuration_types.go @@ -44,7 +44,7 @@ type Configuration struct { LogStdoutEncoder string `json:"logStdoutEncoder,omitempty"` // BatchScheduler enables the batch scheduler integration with a specific scheduler - // based on the given name, currently, supported values are volcano and yunikorn. + // based on the given name, currently, supported values are volcano, yunikorn, kai-scheduler. 
BatchScheduler string `json:"batchScheduler,omitempty"` // HeadSidecarContainers includes specification for a sidecar container diff --git a/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml b/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml new file mode 100644 index 00000000000..80db8fe249c --- /dev/null +++ b/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml @@ -0,0 +1,75 @@ +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: default +spec: + resources: + cpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + gpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + memory: + quota: -1 + limit: -1 + overQuotaWeight: 1 +--- +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: test +spec: + parentQueue: default + resources: + cpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + gpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + memory: + quota: -1 + limit: -1 + overQuotaWeight: 1 +--- + +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: rc-half-gpu + labels: + kai.scheduler/queue: test +spec: + headGroupSpec: + template: + spec: + containers: + - name: head + image: rayproject/ray:2.46.0 + resources: + limits: + cpu: "1" + memory: "2Gi" + + # ---- Two workers share one GPU (0.5 each) ---- + workerGroupSpecs: + - groupName: shared-gpu + replicas: 2 + minReplicas: 2 + template: + metadata: + annotations: + gpu-fraction: "0.5" + spec: + containers: + - name: worker + image: rayproject/ray:2.46.0 + resources: + limits: + cpu: "1" + memory: "2Gi" \ No newline at end of file diff --git a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml new file mode 100644 index 00000000000..81fdaa3b4bd --- /dev/null +++ b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml @@ -0,0 +1,70 @@ +#A simple example raycluster with KAI +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: default +spec: + resources: + cpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + gpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + memory: + quota: -1 + limit: -1 + overQuotaWeight: 1 +--- +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: test +spec: + parentQueue: default + resources: + cpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + gpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + memory: + quota: -1 + limit: -1 + overQuotaWeight: 1 +--- +apiVersion: ray.io/v1 +kind: RayCluster +metadata: + name: ray-sample + labels: + kai.scheduler/queue: test +spec: + headGroupSpec: + template: + spec: + containers: + - name: ray-head + image: rayproject/ray:2.41.0 + resources: + requests: + cpu: "1" + memory: "2Gi" + workerGroupSpecs: + - groupName: worker + replicas: 2 + minReplicas: 2 + template: + spec: + containers: + - name: ray-worker + image: rayproject/ray:2.41.0 + resources: + requests: + cpu: "1" + memory: "1Gi" diff --git a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go new file mode 100644 index 00000000000..8d8fffaacf8 --- /dev/null +++ b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go @@ -0,0 +1,57 @@ +package kaischeduler + +// This KAI plugin relies on KAI-Scheduler's +// built-in PodGrouper to create PodGroups at +// runtime, so the plugin itself only needs to: +// 1. expose the scheduler name, +// 2. stamp pods with schedulerName + queue label. +// No PodGroup create/patch logic is included. 
+ +import ( + "context" + + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/builder" + + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" + schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface" +) + +const ( + QueueLabelName = "kai.scheduler/queue" +) + +type KaiScheduler struct{} + +type KaiSchedulerFactory struct{} + +func GetPluginName() string { return "kai-scheduler" } + +func (k *KaiScheduler) Name() string { return GetPluginName() } + +func (k *KaiScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ *rayv1.RayCluster) error { + return nil +} + +func (k *KaiScheduler) AddMetadataToPod(_ context.Context, app *rayv1.RayCluster, _ string, pod *corev1.Pod) { + if queue, ok := app.Labels[QueueLabelName]; ok { + if pod.Labels == nil { + pod.Labels = map[string]string{} + } + pod.Labels[QueueLabelName] = queue + } + pod.Spec.SchedulerName = k.Name() +} + +func (kf *KaiSchedulerFactory) New(_ context.Context, _ *rest.Config) (schedulerinterface.BatchScheduler, error) { + return &KaiScheduler{}, nil +} + +func (kf *KaiSchedulerFactory) AddToScheme(_ *runtime.Scheme) { +} + +func (kf *KaiSchedulerFactory) ConfigureReconciler(b *builder.Builder) *builder.Builder { + return b +} diff --git a/ray-operator/controllers/ray/batchscheduler/schedulermanager.go b/ray-operator/controllers/ray/batchscheduler/schedulermanager.go index 050c08b5fc8..09455350cc3 100644 --- a/ray-operator/controllers/ray/batchscheduler/schedulermanager.go +++ b/ray-operator/controllers/ray/batchscheduler/schedulermanager.go @@ -12,6 +12,7 @@ import ( configapi "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1" schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" schedulerplugins "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/scheduler-plugins" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn" @@ -60,6 +61,8 @@ func getSchedulerFactory(rayConfigs configapi.Configuration) (schedulerinterface factory = &volcano.VolcanoBatchSchedulerFactory{} case yunikorn.GetPluginName(): factory = &yunikorn.YuniKornSchedulerFactory{} + case kaischeduler.GetPluginName(): + factory = &kaischeduler.KaiSchedulerFactory{} case schedulerplugins.GetPluginName(): factory = &schedulerplugins.KubeSchedulerFactory{} default: diff --git a/ray-operator/controllers/ray/batchscheduler/schedulermanager_test.go b/ray-operator/controllers/ray/batchscheduler/schedulermanager_test.go index 193c00b81e0..5d0fe672fbe 100644 --- a/ray-operator/controllers/ray/batchscheduler/schedulermanager_test.go +++ b/ray-operator/controllers/ray/batchscheduler/schedulermanager_test.go @@ -8,6 +8,7 @@ import ( "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1" schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface" + "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn" ) @@ -16,6 +17,7 @@ func TestGetSchedulerFactory(t *testing.T) { DefaultFactory := 
&schedulerinterface.DefaultBatchSchedulerFactory{} VolcanoFactory := &volcano.VolcanoBatchSchedulerFactory{} YuniKornFactory := &yunikorn.YuniKornSchedulerFactory{} + KaiFactory := &kaischeduler.KaiSchedulerFactory{} type args struct { rayConfigs v1alpha1.Configuration @@ -65,6 +67,16 @@ func TestGetSchedulerFactory(t *testing.T) { }, want: reflect.TypeOf(VolcanoFactory), }, + { + name: "enableBatchScheduler=false, batchScheduler set to kai-scheduler", + args: args{ + rayConfigs: v1alpha1.Configuration{ + EnableBatchScheduler: false, + BatchScheduler: kaischeduler.GetPluginName(), + }, + }, + want: reflect.TypeOf(KaiFactory), + }, { name: "enableBatchScheduler not set, batchScheduler set to yunikorn", args: args{ @@ -83,6 +95,15 @@ func TestGetSchedulerFactory(t *testing.T) { }, want: reflect.TypeOf(VolcanoFactory), }, + { + name: "enableBatchScheduler not set, batchScheduler set to kai-scheduler", + args: args{ + rayConfigs: v1alpha1.Configuration{ + BatchScheduler: kaischeduler.GetPluginName(), + }, + }, + want: reflect.TypeOf(KaiFactory), + }, { name: "enableBatchScheduler not set, batchScheduler set to unknown value", args: args{ diff --git a/ray-operator/main.go b/ray-operator/main.go index 1f1aa717c0b..c74da9e75c1 100644 --- a/ray-operator/main.go +++ b/ray-operator/main.go @@ -94,7 +94,7 @@ func main() { flag.BoolVar(&enableBatchScheduler, "enable-batch-scheduler", false, "(Deprecated) Enable batch scheduler. Currently is volcano, which supports gang scheduler policy. Please use --batch-scheduler instead.") flag.StringVar(&batchScheduler, "batch-scheduler", "", - "Batch scheduler name, supported values are volcano and yunikorn.") + "Batch scheduler name, supported values are volcano, yunikorn, kai-scheduler.") flag.StringVar(&configFile, "config", "", "Path to structured config file. 
Flags are ignored if config file is set.") flag.BoolVar(&useKubernetesProxy, "use-kubernetes-proxy", false, "Use Kubernetes proxy subresource when connecting to the Ray Head node.") From e80c4266e2101b8f2906be95289331f1e83daf15 Mon Sep 17 00:00:00 2001 From: EkinKarabulut Date: Tue, 15 Jul 2025 19:01:53 +0200 Subject: [PATCH 02/10] Adding logs that indicates that the queue label is missing --- .../samples/ray-cluster.kai-gpu-sharing.yaml | 4 ++-- .../config/samples/ray-cluster.kai-scheduler.yaml | 2 +- .../batchscheduler/kai-scheduler/kai_scheduler.go | 15 +++++++++++---- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml b/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml index 80db8fe249c..9fc7847ef62 100644 --- a/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml +++ b/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml @@ -41,7 +41,7 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: - name: rc-half-gpu + name: raycluster-half-gpu labels: kai.scheduler/queue: test spec: @@ -72,4 +72,4 @@ spec: resources: limits: cpu: "1" - memory: "2Gi" \ No newline at end of file + memory: "2Gi" diff --git a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml index 81fdaa3b4bd..04005ce1e61 100644 --- a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml +++ b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml @@ -41,7 +41,7 @@ spec: apiVersion: ray.io/v1 kind: RayCluster metadata: - name: ray-sample + name: raycluster-sample labels: kai.scheduler/queue: test spec: diff --git a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go index 8d8fffaacf8..c37450ee9a1 100644 --- a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go +++ b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go @@ -13,6 +13,7 @@ import ( corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" @@ -35,10 +36,16 @@ func (k *KaiScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ *rayv1 return nil } -func (k *KaiScheduler) AddMetadataToPod(_ context.Context, app *rayv1.RayCluster, _ string, pod *corev1.Pod) { - if queue, ok := app.Labels[QueueLabelName]; ok { +func (k *KaiScheduler) AddMetadataToPod(ctx context.Context, app *rayv1.RayCluster, _ string, pod *corev1.Pod) { + queue, ok := app.Labels[QueueLabelName] + if !ok || queue == "" { + logger := ctrl.LoggerFrom(ctx).WithName("kai-scheduler") + logger.Error(nil, "Queue label missing from RayCluster; pods will remain pending", + "requiredLabel", QueueLabelName, + "rayCluster", app.Name) + } else { if pod.Labels == nil { - pod.Labels = map[string]string{} + pod.Labels = make(map[string]string) } pod.Labels[QueueLabelName] = queue } @@ -53,5 +60,5 @@ func (kf *KaiSchedulerFactory) AddToScheme(_ *runtime.Scheme) { } func (kf *KaiSchedulerFactory) ConfigureReconciler(b *builder.Builder) *builder.Builder { - return b + return b } From fcbd27b08fc2bec85cdcdb7bee689c99bd989a46 Mon Sep 17 00:00:00 2001 From: EkinKarabulut Date: Wed, 16 Jul 2025 09:17:13 +0200 Subject: [PATCH 03/10] Adding tests for KAI Scheduler --- .../kai-scheduler/kai_scheduler_test.go | 141 
++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler_test.go diff --git a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler_test.go b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler_test.go new file mode 100644 index 00000000000..1b75dd164e5 --- /dev/null +++ b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler_test.go @@ -0,0 +1,141 @@ +package kaischeduler + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" +) + +func createTestRayCluster(name, namespace string, labels map[string]string) *rayv1.RayCluster { + return &rayv1.RayCluster{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: labels, + }, + } +} + +func createTestPod(name, namespace string) *corev1.Pod { + return &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: map[string]string{ + "ray.io/cluster": "test-cluster", + "ray.io/node-type": "worker", + "app": "ray", + }, + }, + Spec: corev1.PodSpec{ + Containers: []corev1.Container{{ + Name: "ray-worker", + Image: "rayproject/ray:latest", + }}, + }, + } +} + +func TestAddMetadataToPod_WithQueueLabel(t *testing.T) { + a := assert.New(t) + scheduler := &KaiScheduler{} + ctx := context.Background() + + // Create RayCluster with queue label + rayCluster := createTestRayCluster("test-cluster", "default", map[string]string{ + QueueLabelName: "test-queue", + }) + pod := createTestPod("test-pod", "default") + + // Call AddMetadataToPod + scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod) + + // Assert scheduler name is set to kai-scheduler + a.Equal("kai-scheduler", pod.Spec.SchedulerName) + + // Assert queue label is propagated to pod + a.NotNil(pod.Labels) + a.Equal("test-queue", pod.Labels[QueueLabelName]) +} + +func TestAddMetadataToPod_WithoutQueueLabel(t *testing.T) { + a := assert.New(t) + scheduler := &KaiScheduler{} + ctx := context.Background() + + // Create RayCluster without queue label + rayCluster := createTestRayCluster("test-cluster", "default", map[string]string{}) + pod := createTestPod("test-pod", "default") + + // Call AddMetadataToPod + scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod) + + // Assert scheduler name is still set (always required) + a.Equal("kai-scheduler", pod.Spec.SchedulerName) + + // Assert queue label is not added to pod when missing from RayCluster + if pod.Labels != nil { + _, exists := pod.Labels[QueueLabelName] + a.False(exists) + } +} + +func TestAddMetadataToPod_WithEmptyQueueLabel(t *testing.T) { + a := assert.New(t) + scheduler := &KaiScheduler{} + ctx := context.Background() + + // Create RayCluster with empty queue label + rayCluster := createTestRayCluster("test-cluster", "default", map[string]string{ + QueueLabelName: "", + }) + pod := createTestPod("test-pod", "default") + + // Call AddMetadataToPod + scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod) + + // Assert scheduler name is still set + a.Equal("kai-scheduler", pod.Spec.SchedulerName) + + // Assert empty queue label is treated as missing + if pod.Labels != nil { + _, exists := pod.Labels[QueueLabelName] + a.False(exists) + } +} + +func TestAddMetadataToPod_PreservesExistingPodLabels(t *testing.T) { + a := assert.New(t) + scheduler := 
&KaiScheduler{} + ctx := context.Background() + + // Create RayCluster with queue label + rayCluster := createTestRayCluster("test-cluster", "default", map[string]string{ + QueueLabelName: "test-queue", + }) + + // Create pod with existing labels + pod := createTestPod("test-pod", "default") + pod.Labels = map[string]string{ + "existing-label": "existing-value", + "app": "ray", + } + + // Call AddMetadataToPod + scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod) + + // Assert scheduler name is set + a.Equal("kai-scheduler", pod.Spec.SchedulerName) + + // Assert queue label is added + a.Equal("test-queue", pod.Labels[QueueLabelName]) + + // Assert existing labels are preserved + a.Equal("existing-value", pod.Labels["existing-label"]) + a.Equal("ray", pod.Labels["app"]) +} From 6032771c6a36c8b6924b6b2d660629c16a652e52 Mon Sep 17 00:00:00 2001 From: EkinKarabulut Date: Wed, 16 Jul 2025 11:18:41 +0200 Subject: [PATCH 04/10] Changing Error message with Warning message --- .../batchscheduler/kai-scheduler/kai_scheduler.go | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go index c37450ee9a1..4fc1fa07bb4 100644 --- a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go +++ b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go @@ -37,19 +37,20 @@ func (k *KaiScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ *rayv1 } func (k *KaiScheduler) AddMetadataToPod(ctx context.Context, app *rayv1.RayCluster, _ string, pod *corev1.Pod) { + pod.Spec.SchedulerName = k.Name() + queue, ok := app.Labels[QueueLabelName] if !ok || queue == "" { logger := ctrl.LoggerFrom(ctx).WithName("kai-scheduler") - logger.Error(nil, "Queue label missing from RayCluster; pods will remain pending", + logger.Info("Queue label missing from RayCluster; pods will remain pending", "requiredLabel", QueueLabelName, "rayCluster", app.Name) - } else { - if pod.Labels == nil { - pod.Labels = make(map[string]string) - } - pod.Labels[QueueLabelName] = queue + return } - pod.Spec.SchedulerName = k.Name() + if pod.Labels == nil { + pod.Labels = make(map[string]string) + } + pod.Labels[QueueLabelName] = queue } func (kf *KaiSchedulerFactory) New(_ context.Context, _ *rest.Config) (schedulerinterface.BatchScheduler, error) { From dfea7875bfbd6ec487861a0aa0a4fedf0efab739 Mon Sep 17 00:00:00 2001 From: EkinKarabulut Date: Wed, 16 Jul 2025 12:56:41 +0200 Subject: [PATCH 05/10] Dividing queues and workload examples into different files for ease of explanation on docs --- .../samples/ray-cluster.kai-gpu-sharing.yaml | 42 +------------------ .../ray-cluster.kai-scheduler-queues.yaml | 38 +++++++++++++++++ .../samples/ray-cluster.kai-scheduler.yaml | 41 +----------------- 3 files changed, 40 insertions(+), 81 deletions(-) create mode 100644 ray-operator/config/samples/ray-cluster.kai-scheduler-queues.yaml diff --git a/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml b/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml index 9fc7847ef62..00d98e8b6c5 100644 --- a/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml +++ b/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml @@ -1,49 +1,9 @@ -apiVersion: scheduling.run.ai/v2 -kind: Queue -metadata: - name: default -spec: - resources: - cpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - gpu: - quota: -1 - limit: -1 - 
overQuotaWeight: 1 - memory: - quota: -1 - limit: -1 - overQuotaWeight: 1 ---- -apiVersion: scheduling.run.ai/v2 -kind: Queue -metadata: - name: test -spec: - parentQueue: default - resources: - cpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - gpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - memory: - quota: -1 - limit: -1 - overQuotaWeight: 1 ---- - apiVersion: ray.io/v1 kind: RayCluster metadata: name: raycluster-half-gpu labels: - kai.scheduler/queue: test + kai.scheduler/queue: team-a spec: headGroupSpec: template: diff --git a/ray-operator/config/samples/ray-cluster.kai-scheduler-queues.yaml b/ray-operator/config/samples/ray-cluster.kai-scheduler-queues.yaml new file mode 100644 index 00000000000..69c6bf635bf --- /dev/null +++ b/ray-operator/config/samples/ray-cluster.kai-scheduler-queues.yaml @@ -0,0 +1,38 @@ +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: department-1 +spec: + resources: + cpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + gpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + memory: + quota: -1 + limit: -1 + overQuotaWeight: 1 +--- +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: team-a +spec: + parentQueue: department-1 + resources: + cpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + gpu: + quota: -1 + limit: -1 + overQuotaWeight: 1 + memory: + quota: -1 + limit: -1 + overQuotaWeight: 1 diff --git a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml index 04005ce1e61..4cb2bb6cfc1 100644 --- a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml +++ b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml @@ -1,49 +1,10 @@ #A simple example raycluster with KAI -apiVersion: scheduling.run.ai/v2 -kind: Queue -metadata: - name: default -spec: - resources: - cpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - gpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - memory: - quota: -1 - limit: -1 - overQuotaWeight: 1 ---- -apiVersion: scheduling.run.ai/v2 -kind: Queue -metadata: - name: test -spec: - parentQueue: default - resources: - cpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - gpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - memory: - quota: -1 - limit: -1 - overQuotaWeight: 1 ---- apiVersion: ray.io/v1 kind: RayCluster metadata: name: raycluster-sample labels: - kai.scheduler/queue: test + kai.scheduler/queue: team-a spec: headGroupSpec: template: From e6634bbf4ec7ef8e7c35e363d29594c44c0ba6b7 Mon Sep 17 00:00:00 2001 From: EkinKarabulut Date: Fri, 18 Jul 2025 09:45:56 +0200 Subject: [PATCH 06/10] Updating the image tag --- ray-operator/config/samples/ray-cluster.kai-scheduler.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml index 4cb2bb6cfc1..e31158e7b00 100644 --- a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml +++ b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml @@ -11,7 +11,7 @@ spec: spec: containers: - name: ray-head - image: rayproject/ray:2.41.0 + image: rayproject/ray:2.46.0 resources: requests: cpu: "1" @@ -24,7 +24,7 @@ spec: spec: containers: - name: ray-worker - image: rayproject/ray:2.41.0 + image: rayproject/ray:2.46.0 resources: requests: cpu: "1" From 1a63219ee32733657c4255fe8bb21073dabba174 Mon Sep 17 00:00:00 2001 From: EkinKarabulut Date: Mon, 21 Jul 2025 22:06:11 +0200 Subject: [PATCH 07/10] [KAI integration] Fixing tests --- 
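The upstream BatchSchedulerFactory contract now passes a controller-runtime
client.Client into New, so the KAI factory signature is updated to match
(the argument is ignored, since this plugin needs no API access). For
orientation, a sketch of the contract this patch compiles against, with the
method set inferred from what the KAI plugin implements; the authoritative
definition lives in ray-operator/controllers/ray/batchscheduler/interface:

    package schedulerinterface

    import (
        "context"

        corev1 "k8s.io/api/core/v1"
        "k8s.io/apimachinery/pkg/runtime"
        "k8s.io/client-go/rest"
        "sigs.k8s.io/controller-runtime/pkg/builder"
        "sigs.k8s.io/controller-runtime/pkg/client"

        rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
    )

    // BatchScheduler is the per-RayCluster hook set a plugin implements.
    type BatchScheduler interface {
        Name() string
        DoBatchSchedulingOnSubmission(ctx context.Context, app *rayv1.RayCluster) error
        AddMetadataToPod(ctx context.Context, app *rayv1.RayCluster, groupName string, pod *corev1.Pod)
    }

    // BatchSchedulerFactory constructs a plugin; New now also receives a
    // client.Client, which the KAI factory ignores.
    type BatchSchedulerFactory interface {
        New(ctx context.Context, config *rest.Config, cli client.Client) (BatchScheduler, error)
        AddToScheme(scheme *runtime.Scheme)
        ConfigureReconciler(b *builder.Builder) *builder.Builder
    }
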
.../ray/batchscheduler/kai-scheduler/kai_scheduler.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go index 4fc1fa07bb4..bec3ba7a7ec 100644 --- a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go +++ b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go @@ -18,6 +18,7 @@ import ( rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface" + "sigs.k8s.io/controller-runtime/pkg/client" ) const ( @@ -53,7 +54,7 @@ func (k *KaiScheduler) AddMetadataToPod(ctx context.Context, app *rayv1.RayClust pod.Labels[QueueLabelName] = queue } -func (kf *KaiSchedulerFactory) New(_ context.Context, _ *rest.Config) (schedulerinterface.BatchScheduler, error) { +func (kf *KaiSchedulerFactory) New(_ context.Context, _ *rest.Config, _ client.Client) (schedulerinterface.BatchScheduler, error) { return &KaiScheduler{}, nil } From cfe36e64c28de9212fa1e202ed1c09e20d592e95 Mon Sep 17 00:00:00 2001 From: EkinKarabulut Date: Thu, 31 Jul 2025 11:36:34 +0200 Subject: [PATCH 08/10] KAI integration: fixed pre-commit --- .../apis/config/v1alpha1/config_utils.go | 2 +- .../apis/config/v1alpha1/config_utils_test.go | 2 +- .../kai-scheduler/kai_scheduler.go | 2 +- .../kai-scheduler/kai_scheduler_test.go | 32 +++++++++---------- .../ray/batchscheduler/schedulermanager.go | 2 +- .../batchscheduler/schedulermanager_test.go | 2 +- 6 files changed, 21 insertions(+), 21 deletions(-) diff --git a/ray-operator/apis/config/v1alpha1/config_utils.go b/ray-operator/apis/config/v1alpha1/config_utils.go index e224c88cb96..24ed417ce93 100644 --- a/ray-operator/apis/config/v1alpha1/config_utils.go +++ b/ray-operator/apis/config/v1alpha1/config_utils.go @@ -5,7 +5,7 @@ import ( "github.com/go-logr/logr" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" + kaischeduler "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" schedulerplugins "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/scheduler-plugins" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn" diff --git a/ray-operator/apis/config/v1alpha1/config_utils_test.go b/ray-operator/apis/config/v1alpha1/config_utils_test.go index c7bc960b43e..76b983eb75e 100644 --- a/ray-operator/apis/config/v1alpha1/config_utils_test.go +++ b/ray-operator/apis/config/v1alpha1/config_utils_test.go @@ -6,8 +6,8 @@ import ( "github.com/go-logr/logr" "github.com/go-logr/logr/testr" + kaischeduler "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" schedulerPlugins "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/scheduler-plugins" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn" ) diff --git a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go index bec3ba7a7ec..764deddd5ef 100644 --- 
a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go +++ b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go @@ -15,10 +15,10 @@ import ( "k8s.io/client-go/rest" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface" - "sigs.k8s.io/controller-runtime/pkg/client" ) const ( diff --git a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler_test.go b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler_test.go index 1b75dd164e5..ed9eaf9549a 100644 --- a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler_test.go +++ b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler_test.go @@ -11,21 +11,21 @@ import ( rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" ) -func createTestRayCluster(name, namespace string, labels map[string]string) *rayv1.RayCluster { +func createTestRayCluster(labels map[string]string) *rayv1.RayCluster { return &rayv1.RayCluster{ ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, + Name: "test-cluster", + Namespace: "default", Labels: labels, }, } } -func createTestPod(name, namespace string) *corev1.Pod { +func createTestPod() *corev1.Pod { return &corev1.Pod{ ObjectMeta: metav1.ObjectMeta{ - Name: name, - Namespace: namespace, + Name: "test-pod", + Namespace: "default", Labels: map[string]string{ "ray.io/cluster": "test-cluster", "ray.io/node-type": "worker", @@ -47,10 +47,10 @@ func TestAddMetadataToPod_WithQueueLabel(t *testing.T) { ctx := context.Background() // Create RayCluster with queue label - rayCluster := createTestRayCluster("test-cluster", "default", map[string]string{ + rayCluster := createTestRayCluster(map[string]string{ QueueLabelName: "test-queue", }) - pod := createTestPod("test-pod", "default") + pod := createTestPod() // Call AddMetadataToPod scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod) @@ -69,8 +69,8 @@ func TestAddMetadataToPod_WithoutQueueLabel(t *testing.T) { ctx := context.Background() // Create RayCluster without queue label - rayCluster := createTestRayCluster("test-cluster", "default", map[string]string{}) - pod := createTestPod("test-pod", "default") + rayCluster := createTestRayCluster(map[string]string{}) + pod := createTestPod() // Call AddMetadataToPod scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod) @@ -91,10 +91,10 @@ func TestAddMetadataToPod_WithEmptyQueueLabel(t *testing.T) { ctx := context.Background() // Create RayCluster with empty queue label - rayCluster := createTestRayCluster("test-cluster", "default", map[string]string{ + rayCluster := createTestRayCluster(map[string]string{ QueueLabelName: "", }) - pod := createTestPod("test-pod", "default") + pod := createTestPod() // Call AddMetadataToPod scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod) @@ -115,15 +115,15 @@ func TestAddMetadataToPod_PreservesExistingPodLabels(t *testing.T) { ctx := context.Background() // Create RayCluster with queue label - rayCluster := createTestRayCluster("test-cluster", "default", map[string]string{ + rayCluster := createTestRayCluster(map[string]string{ QueueLabelName: "test-queue", }) - + // Create pod with existing labels - pod := createTestPod("test-pod", "default") + pod := createTestPod() pod.Labels = 
map[string]string{ "existing-label": "existing-value", - "app": "ray", + "app": "ray", } // Call AddMetadataToPod diff --git a/ray-operator/controllers/ray/batchscheduler/schedulermanager.go b/ray-operator/controllers/ray/batchscheduler/schedulermanager.go index 09455350cc3..42d0a8e8aa4 100644 --- a/ray-operator/controllers/ray/batchscheduler/schedulermanager.go +++ b/ray-operator/controllers/ray/batchscheduler/schedulermanager.go @@ -12,7 +12,7 @@ import ( configapi "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1" schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" + kaischeduler "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" schedulerplugins "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/scheduler-plugins" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn" diff --git a/ray-operator/controllers/ray/batchscheduler/schedulermanager_test.go b/ray-operator/controllers/ray/batchscheduler/schedulermanager_test.go index 5d0fe672fbe..6f1bc4f9627 100644 --- a/ray-operator/controllers/ray/batchscheduler/schedulermanager_test.go +++ b/ray-operator/controllers/ray/batchscheduler/schedulermanager_test.go @@ -8,7 +8,7 @@ import ( "github.com/ray-project/kuberay/ray-operator/apis/config/v1alpha1" schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface" - "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" + kaischeduler "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/volcano" "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/yunikorn" ) From 3c5b2f87716f7c5b57a887ad9b0c3637e6354185 Mon Sep 17 00:00:00 2001 From: EkinKarabulut Date: Mon, 4 Aug 2025 10:33:38 +0200 Subject: [PATCH 09/10] [KAI Integration] Comment and merge yaml files, pull logger into main struct --- .../samples/ray-cluster.kai-gpu-sharing.yaml | 63 ++++++++++++++++++- .../ray-cluster.kai-scheduler-queues.yaml | 38 ----------- .../samples/ray-cluster.kai-scheduler.yaml | 57 ++++++++++++++++- .../kai-scheduler/kai_scheduler.go | 16 +++-- 4 files changed, 125 insertions(+), 49 deletions(-) delete mode 100644 ray-operator/config/samples/ray-cluster.kai-scheduler-queues.yaml diff --git a/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml b/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml index 00d98e8b6c5..833093570fc 100644 --- a/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml +++ b/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml @@ -1,9 +1,67 @@ +# KAI Scheduler Example - GPU Sharing with RayCluster +# KAI Scheduler uses a hierarchical queue system for resource management and fair sharing. +# These queues must be created before any RayCluster can be scheduled by KAI. + +# NOTE: This is a DEMO configuration with unlimited quotas (-1) for easy testing. +# In real-world deployments, you should set appropriate CPU/GPU/memory quotas and limits +# based on your cluster's actual resources and organizational needs. + +# GPU Sharing Note: This example utilizes time slicing GPU sharing. 
+# KAI Scheduler also supports MPS and other GPU sharing methods. +# For more information, check the KAI Scheduler documentation. + +# Parent queue: Represents a department or high-level organizational unit +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: department-1 +spec: + # priority: 100 # Optional: Higher priority queues get surplus resources first + resources: + cpu: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Determines how leftover (surplus) CPU resources are distributed among + # queues that have the same priority. Higher weights receive larger portion + gpu: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Share of surplus resources among same-priority queues + memory: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Share of surplus resources among same-priority queues +--- +# Child queue: Represents a team within the department-1 +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: team-a +spec: + parentQueue: department-1 # Inherits from parent queue + # priority: 50 # Optional: Team priority within department + resources: + cpu: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Determines how leftover (surplus) CPU resources are distributed among + # queues that have the same priority. Higher weights receive larger portion + gpu: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Share of surplus resources among same-priority queues + memory: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Share of surplus resources among same-priority queues +--- +# RayCluster with KAI Scheduler and GPU Sharing apiVersion: ray.io/v1 kind: RayCluster metadata: name: raycluster-half-gpu labels: - kai.scheduler/queue: team-a + kai.scheduler/queue: team-a # REQUIRED: Queue assignment for scheduling spec: headGroupSpec: template: @@ -15,7 +73,6 @@ spec: limits: cpu: "1" memory: "2Gi" - # ---- Two workers share one GPU (0.5 each) ---- workerGroupSpecs: - groupName: shared-gpu @@ -24,7 +81,7 @@ spec: template: metadata: annotations: - gpu-fraction: "0.5" + gpu-fraction: "0.5" # Request 0.5 GPU per pod (two pods share one GPU) spec: containers: - name: worker diff --git a/ray-operator/config/samples/ray-cluster.kai-scheduler-queues.yaml b/ray-operator/config/samples/ray-cluster.kai-scheduler-queues.yaml deleted file mode 100644 index 69c6bf635bf..00000000000 --- a/ray-operator/config/samples/ray-cluster.kai-scheduler-queues.yaml +++ /dev/null @@ -1,38 +0,0 @@ -apiVersion: scheduling.run.ai/v2 -kind: Queue -metadata: - name: department-1 -spec: - resources: - cpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - gpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - memory: - quota: -1 - limit: -1 - overQuotaWeight: 1 ---- -apiVersion: scheduling.run.ai/v2 -kind: Queue -metadata: - name: team-a -spec: - parentQueue: department-1 - resources: - cpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - gpu: - quota: -1 - limit: -1 - overQuotaWeight: 1 - memory: - quota: -1 - limit: -1 - overQuotaWeight: 1 diff --git a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml index e31158e7b00..fb37101205b 
100644 --- a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml +++ b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml @@ -1,10 +1,63 @@ -#A simple example raycluster with KAI +# KAI Scheduler Example - Basic RayCluster +# KAI Scheduler uses a hierarchical queue system for resource management and fair sharing. +# These queues must be created before any RayCluster can be scheduled by KAI. + +# NOTE: This is a DEMO configuration with unlimited quotas (-1) for easy testing. +# In real-world deployments, you should set appropriate CPU/GPU/memory quotas and limits +# based on your cluster's actual resources and organizational needs. + +# Parent queue: Represents a department or high-level organizational unit +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: department-1 +spec: + # priority: 100 # Optional: Higher priority queues get surplus resources first + resources: + cpu: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Determines how leftover (surplus) CPU resources are distributed among + # queues that have the same priority. Higher weights receive larger portion + gpu: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Share of surplus resources among same-priority queues + memory: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Share of surplus resources among same-priority queues +--- +# Child queue: Represents a team within the department-1 +apiVersion: scheduling.run.ai/v2 +kind: Queue +metadata: + name: team-a +spec: + parentQueue: department-1 # Inherits from parent queue + # priority: 50 # Optional: Team priority within department + resources: + cpu: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Determines how leftover (surplus) CPU resources are distributed among + # queues that have the same priority. 
Higher weights receive larger portion + gpu: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Share of surplus resources among same-priority teams + memory: + quota: -1 # Unlimited quota (demo setting) + limit: -1 # Unlimited burst limit (demo setting) + overQuotaWeight: 1 # Share of surplus resources among same-priority teams +--- +# RayCluster with KAI Scheduler apiVersion: ray.io/v1 kind: RayCluster metadata: name: raycluster-sample labels: - kai.scheduler/queue: team-a + kai.scheduler/queue: team-a # REQUIRED: Queue assignment for scheduling spec: headGroupSpec: template: diff --git a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go index 764deddd5ef..8ea57d2f5d0 100644 --- a/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go +++ b/ray-operator/controllers/ray/batchscheduler/kai-scheduler/kai_scheduler.go @@ -10,12 +10,13 @@ package kaischeduler import ( "context" + "github.com/go-logr/logr" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/client-go/rest" - ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" + logf "sigs.k8s.io/controller-runtime/pkg/log" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface" @@ -25,7 +26,9 @@ const ( QueueLabelName = "kai.scheduler/queue" ) -type KaiScheduler struct{} +type KaiScheduler struct { + log logr.Logger +} type KaiSchedulerFactory struct{} @@ -37,13 +40,12 @@ func (k *KaiScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ *rayv1 return nil } -func (k *KaiScheduler) AddMetadataToPod(ctx context.Context, app *rayv1.RayCluster, _ string, pod *corev1.Pod) { +func (k *KaiScheduler) AddMetadataToPod(_ context.Context, app *rayv1.RayCluster, _ string, pod *corev1.Pod) { pod.Spec.SchedulerName = k.Name() queue, ok := app.Labels[QueueLabelName] if !ok || queue == "" { - logger := ctrl.LoggerFrom(ctx).WithName("kai-scheduler") - logger.Info("Queue label missing from RayCluster; pods will remain pending", + k.log.Info("Queue label missing from RayCluster; pods will remain pending", "requiredLabel", QueueLabelName, "rayCluster", app.Name) return @@ -55,7 +57,9 @@ func (k *KaiScheduler) AddMetadataToPod(ctx context.Context, app *rayv1.RayClust } func (kf *KaiSchedulerFactory) New(_ context.Context, _ *rest.Config, _ client.Client) (schedulerinterface.BatchScheduler, error) { - return &KaiScheduler{}, nil + return &KaiScheduler{ + log: logf.Log.WithName("kai-scheduler"), + }, nil } func (kf *KaiSchedulerFactory) AddToScheme(_ *runtime.Scheme) { From 28b592d30806dec4ecb5cbf6ece0a954f37ae868 Mon Sep 17 00:00:00 2001 From: EkinKarabulut Date: Mon, 4 Aug 2025 19:33:05 +0200 Subject: [PATCH 10/10] [KAI Integration] improve yaml comments --- .../samples/ray-cluster.kai-gpu-sharing.yaml | 46 +++++++++++-------- .../samples/ray-cluster.kai-scheduler.yaml | 46 +++++++++++-------- 2 files changed, 52 insertions(+), 40 deletions(-) diff --git a/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml b/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml index 833093570fc..3aa8b961186 100644 --- a/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml +++ b/ray-operator/config/samples/ray-cluster.kai-gpu-sharing.yaml @@ -18,19 +18,22 @@ 
metadata: spec: # priority: 100 # Optional: Higher priority queues get surplus resources first resources: + # quota: Guaranteed resources for this queue + # limit: Maximum resources this queue can use + # overQuotaWeight: How surplus resources are shared among queues + # Note: Using -1 (unlimited) for demo purposes cpu: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Determines how leftover (surplus) CPU resources are distributed among - # queues that have the same priority. Higher weights receive larger portion + quota: -1 + limit: -1 + overQuotaWeight: 1 gpu: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Share of surplus resources among same-priority queues + quota: -1 + limit: -1 + overQuotaWeight: 1 memory: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Share of surplus resources among same-priority queues + quota: -1 + limit: -1 + overQuotaWeight: 1 --- # Child queue: Represents a team within the department-1 apiVersion: scheduling.run.ai/v2 @@ -41,19 +44,22 @@ spec: parentQueue: department-1 # Inherits from parent queue # priority: 50 # Optional: Team priority within department resources: + # quota: Guaranteed resources for this queue + # limit: Maximum resources this queue can use + # overQuotaWeight: How surplus resources are shared among queues + # Note: Using -1 (unlimited) for demo purposes cpu: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Determines how leftover (surplus) CPU resources are distributed among - # queues that have the same priority. Higher weights receive larger portion + quota: -1 + limit: -1 + overQuotaWeight: 1 gpu: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Share of surplus resources among same-priority queues + quota: -1 + limit: -1 + overQuotaWeight: 1 memory: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Share of surplus resources among same-priority queues + quota: -1 + limit: -1 + overQuotaWeight: 1 --- # RayCluster with KAI Scheduler and GPU Sharing apiVersion: ray.io/v1 diff --git a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml index fb37101205b..da5507fcc3b 100644 --- a/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml +++ b/ray-operator/config/samples/ray-cluster.kai-scheduler.yaml @@ -14,19 +14,22 @@ metadata: spec: # priority: 100 # Optional: Higher priority queues get surplus resources first resources: + # quota: Guaranteed resources for this queue + # limit: Maximum resources this queue can use + # overQuotaWeight: How surplus resources are shared among queues + # Note: Using -1 (unlimited) for demo purposes cpu: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Determines how leftover (surplus) CPU resources are distributed among - # queues that have the same priority. 
Higher weights receive larger portion + quota: -1 + limit: -1 + overQuotaWeight: 1 gpu: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Share of surplus resources among same-priority queues + quota: -1 + limit: -1 + overQuotaWeight: 1 memory: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Share of surplus resources among same-priority queues + quota: -1 + limit: -1 + overQuotaWeight: 1 --- # Child queue: Represents a team within the department-1 apiVersion: scheduling.run.ai/v2 @@ -37,19 +40,22 @@ spec: parentQueue: department-1 # Inherits from parent queue # priority: 50 # Optional: Team priority within department resources: + # quota: Guaranteed resources for this queue + # limit: Maximum resources this queue can use + # overQuotaWeight: How surplus resources are shared among queues + # Note: Using -1 (unlimited) for demo purposes cpu: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Determines how leftover (surplus) CPU resources are distributed among - # queues that have the same priority. Higher weights receive larger portion + quota: -1 + limit: -1 + overQuotaWeight: 1 gpu: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Share of surplus resources among same-priority teams + quota: -1 + limit: -1 + overQuotaWeight: 1 memory: - quota: -1 # Unlimited quota (demo setting) - limit: -1 # Unlimited burst limit (demo setting) - overQuotaWeight: 1 # Share of surplus resources among same-priority teams + quota: -1 + limit: -1 + overQuotaWeight: 1 --- # RayCluster with KAI Scheduler apiVersion: ray.io/v1
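
Taken together, the series wires KAI Scheduler in at three levels: config
validation (config_utils.go), factory selection (schedulermanager.go), and
pod decoration (kai_scheduler.go). Below is a minimal, self-contained
sketch of the pod-decoration step, mirroring the series' unit tests. It
uses only names added by these patches; constructing the zero-value
KaiScheduler directly is an assumption that holds offline, because the
embedded logger is consulted only on the missing-label path, which this
example does not hit:

    package main

    import (
        "context"
        "fmt"

        corev1 "k8s.io/api/core/v1"
        metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

        rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
        kaischeduler "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/kai-scheduler"
    )

    func main() {
        // In the operator this is built via KaiSchedulerFactory.New, which
        // also wires in a named logger.
        scheduler := &kaischeduler.KaiScheduler{}

        // A RayCluster carrying the required queue label, as in the samples.
        cluster := &rayv1.RayCluster{
            ObjectMeta: metav1.ObjectMeta{
                Name:   "raycluster-sample",
                Labels: map[string]string{kaischeduler.QueueLabelName: "team-a"},
            },
        }

        // The operator invokes this hook for every head and worker pod it
        // builds for the cluster.
        pod := &corev1.Pod{}
        scheduler.AddMetadataToPod(context.Background(), cluster, "worker", pod)

        fmt.Println(pod.Spec.SchedulerName)                  // kai-scheduler
        fmt.Println(pod.Labels[kaischeduler.QueueLabelName]) // team-a
    }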
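
The design stays deliberately stateless: DoBatchSchedulingOnSubmission is a
no-op, AddToScheme registers nothing, and ConfigureReconciler returns the
builder unchanged, because KAI-Scheduler's built-in PodGrouper assembles
PodGroups at runtime from the pods the operator stamps. All the operator
contributes is pod.Spec.SchedulerName plus the kai.scheduler/queue label
copied from the RayCluster; when the label is missing, the plugin logs it
and leaves the pod unlabeled, so the pod remains Pending until a queue is
assigned. To try the samples, start the operator with
--batch-scheduler=kai-scheduler (or set batchScheduler.name to
kai-scheduler in the Helm values), create the Queue objects, then apply
either RayCluster manifest.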