Integration: KAI Scheduler #3886
base: master
Changes from all commits
13dc672
e80c426
fcbd27b
6032771
dfea787
e6634bb
1a63219
cfe36e6
@@ -0,0 +1,35 @@
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster-half-gpu
  labels:
    kai.scheduler/queue: team-a
spec:
  headGroupSpec:
    template:
      spec:
        containers:
          - name: head
            image: rayproject/ray:2.46.0
            resources:
              limits:
                cpu: "1"
                memory: "2Gi"

  # ---- Two workers share one GPU (0.5 each) ----
  workerGroupSpecs:
    - groupName: shared-gpu
      replicas: 2
      minReplicas: 2
      template:
        metadata:
          annotations:
            gpu-fraction: "0.5"
        spec:
          containers:
            - name: worker
              image: rayproject/ray:2.46.0
              resources:
                limits:
                  cpu: "1"
                  memory: "2Gi"
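The KAI-specific pieces of this sample are the queue label and the fractional-GPU annotation. A commented sketch of just those pieces is shown below; the comments are an editorial suggestion along the lines of the review request at the end of this page, not part of this PR:

# Selects the KAI-Scheduler queue for every pod of this RayCluster;
# "team-a" is defined in the Queue manifests added by this PR.
metadata:
  labels:
    kai.scheduler/queue: team-a

# KAI GPU sharing: each worker pod requests half a GPU, so the two
# replicas in this group share a single physical GPU (0.5 each).
workerGroupSpecs:
  - groupName: shared-gpu
    template:
      metadata:
        annotations:
          gpu-fraction: "0.5"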
@@ -0,0 +1,38 @@

Review comment: It looks like this queue definition is used by the other YAML files. Could the content of this file be folded into the other YAML files that use KAI? It might benefit the end user by making it as easy to apply as the other YAML files under samples.
Review comment: I think either way would work as long as we have clear documentation.
Review comment: +1. If we can make it so that users only need to run a single file to follow the doc, we should do that to make the process less error-prone.

apiVersion: scheduling.run.ai/v2
kind: Queue
metadata:
  name: department-1
spec:
  resources:
    cpu:
      quota: -1
      limit: -1
      overQuotaWeight: 1
    gpu:
      quota: -1
      limit: -1
      overQuotaWeight: 1
    memory:
      quota: -1
      limit: -1
      overQuotaWeight: 1
---
apiVersion: scheduling.run.ai/v2
kind: Queue
metadata:
  name: team-a
spec:
  parentQueue: department-1
  resources:
    cpu:
      quota: -1
      limit: -1
      overQuotaWeight: 1
    gpu:
      quota: -1
      limit: -1
      overQuotaWeight: 1
    memory:
      quota: -1
      limit: -1
      overQuotaWeight: 1
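One way to address the thread above (a sketch, not part of this PR): ship the queues and the basic sample as a single multi-document manifest so users only need to apply one file. The content below simply concatenates the two Queue definitions from this file with the raycluster-sample manifest from this PR; flow-style mappings are used only for brevity, the values are unchanged:

apiVersion: scheduling.run.ai/v2
kind: Queue
metadata:
  name: department-1
spec:
  resources:
    cpu: { quota: -1, limit: -1, overQuotaWeight: 1 }
    gpu: { quota: -1, limit: -1, overQuotaWeight: 1 }
    memory: { quota: -1, limit: -1, overQuotaWeight: 1 }
---
apiVersion: scheduling.run.ai/v2
kind: Queue
metadata:
  name: team-a
spec:
  parentQueue: department-1
  resources:
    cpu: { quota: -1, limit: -1, overQuotaWeight: 1 }
    gpu: { quota: -1, limit: -1, overQuotaWeight: 1 }
    memory: { quota: -1, limit: -1, overQuotaWeight: 1 }
---
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster-sample
  labels:
    kai.scheduler/queue: team-a
spec:
  headGroupSpec:
    template:
      spec:
        containers:
          - name: ray-head
            image: rayproject/ray:2.46.0
            resources:
              requests:
                cpu: "1"
                memory: "2Gi"
  workerGroupSpecs:
    - groupName: worker
      replicas: 2
      minReplicas: 2
      template:
        spec:
          containers:
            - name: ray-worker
              image: rayproject/ray:2.46.0
              resources:
                requests:
                  cpu: "1"
                  memory: "1Gi"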
@@ -0,0 +1,31 @@
# A simple example RayCluster with KAI
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: raycluster-sample
  labels:
    kai.scheduler/queue: team-a
spec:
  headGroupSpec:
    template:
      spec:
        containers:
          - name: ray-head
            image: rayproject/ray:2.46.0
            resources:
              requests:
                cpu: "1"
                memory: "2Gi"
  workerGroupSpecs:
    - groupName: worker
      replicas: 2
      minReplicas: 2
      template:
        spec:
          containers:
            - name: ray-worker
              image: rayproject/ray:2.46.0
              resources:
                requests:
                  cpu: "1"
                  memory: "1Gi"
@@ -0,0 +1,66 @@
package kaischeduler

// This KAI plugin relies on KAI-Scheduler's
// built-in PodGrouper to create PodGroups at
// runtime, so the plugin itself only needs to:
//  1. expose the scheduler name,
//  2. stamp pods with schedulerName + queue label.
// No PodGroup create/patch logic is included.

Review comment: Cool! In that case, I guess it's possible for the KAI scheduler to support gang scheduling with autoscaling enabled?
import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/client-go/rest"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/builder"
	"sigs.k8s.io/controller-runtime/pkg/client"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
	schedulerinterface "github.com/ray-project/kuberay/ray-operator/controllers/ray/batchscheduler/interface"
)

const (
	QueueLabelName = "kai.scheduler/queue"
)

type KaiScheduler struct{}

type KaiSchedulerFactory struct{}

func GetPluginName() string { return "kai-scheduler" }

func (k *KaiScheduler) Name() string { return GetPluginName() }

func (k *KaiScheduler) DoBatchSchedulingOnSubmission(_ context.Context, _ *rayv1.RayCluster) error {
	return nil
}

func (k *KaiScheduler) AddMetadataToPod(ctx context.Context, app *rayv1.RayCluster, _ string, pod *corev1.Pod) {
	pod.Spec.SchedulerName = k.Name()

	queue, ok := app.Labels[QueueLabelName]
	if !ok || queue == "" {
		logger := ctrl.LoggerFrom(ctx).WithName("kai-scheduler")
		logger.Info("Queue label missing from RayCluster; pods will remain pending",
			"requiredLabel", QueueLabelName,
			"rayCluster", app.Name)
		return
	}
	if pod.Labels == nil {
		pod.Labels = make(map[string]string)
	}
	pod.Labels[QueueLabelName] = queue
}

Review comment: I think it would be better if we put the logger in
func (kf *KaiSchedulerFactory) New(_ context.Context, _ *rest.Config, _ client.Client) (schedulerinterface.BatchScheduler, error) {
	return &KaiScheduler{}, nil
}

Review comment: So we might need to create the logger when we construct a new KaiScheduler.

func (kf *KaiSchedulerFactory) AddToScheme(_ *runtime.Scheme) {
}

func (kf *KaiSchedulerFactory) ConfigureReconciler(b *builder.Builder) *builder.Builder {
	return b
}
@@ -0,0 +1,141 @@
package kaischeduler

import (
	"context"
	"testing"

	"github.com/stretchr/testify/assert"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
)

func createTestRayCluster(labels map[string]string) *rayv1.RayCluster {
	return &rayv1.RayCluster{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-cluster",
			Namespace: "default",
			Labels:    labels,
		},
	}
}

func createTestPod() *corev1.Pod {
	return &corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "test-pod",
			Namespace: "default",
			Labels: map[string]string{
				"ray.io/cluster":   "test-cluster",
				"ray.io/node-type": "worker",
				"app":              "ray",
			},
		},
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name:  "ray-worker",
				Image: "rayproject/ray:latest",
			}},
		},
	}
}

func TestAddMetadataToPod_WithQueueLabel(t *testing.T) {
	a := assert.New(t)
	scheduler := &KaiScheduler{}
	ctx := context.Background()

	// Create RayCluster with queue label
	rayCluster := createTestRayCluster(map[string]string{
		QueueLabelName: "test-queue",
	})
	pod := createTestPod()

	// Call AddMetadataToPod
	scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod)

	// Assert scheduler name is set to kai-scheduler
	a.Equal("kai-scheduler", pod.Spec.SchedulerName)

	// Assert queue label is propagated to pod
	a.NotNil(pod.Labels)
	a.Equal("test-queue", pod.Labels[QueueLabelName])
}

func TestAddMetadataToPod_WithoutQueueLabel(t *testing.T) {
	a := assert.New(t)
	scheduler := &KaiScheduler{}
	ctx := context.Background()

	// Create RayCluster without queue label
	rayCluster := createTestRayCluster(map[string]string{})
	pod := createTestPod()

	// Call AddMetadataToPod
	scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod)

	// Assert scheduler name is still set (always required)
	a.Equal("kai-scheduler", pod.Spec.SchedulerName)

	// Assert queue label is not added to pod when missing from RayCluster
	if pod.Labels != nil {
		_, exists := pod.Labels[QueueLabelName]
		a.False(exists)
	}
}

func TestAddMetadataToPod_WithEmptyQueueLabel(t *testing.T) {
	a := assert.New(t)
	scheduler := &KaiScheduler{}
	ctx := context.Background()

	// Create RayCluster with empty queue label
	rayCluster := createTestRayCluster(map[string]string{
		QueueLabelName: "",
	})
	pod := createTestPod()

	// Call AddMetadataToPod
	scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod)

	// Assert scheduler name is still set
	a.Equal("kai-scheduler", pod.Spec.SchedulerName)

	// Assert empty queue label is treated as missing
	if pod.Labels != nil {
		_, exists := pod.Labels[QueueLabelName]
		a.False(exists)
	}
}

func TestAddMetadataToPod_PreservesExistingPodLabels(t *testing.T) {
	a := assert.New(t)
	scheduler := &KaiScheduler{}
	ctx := context.Background()

	// Create RayCluster with queue label
	rayCluster := createTestRayCluster(map[string]string{
		QueueLabelName: "test-queue",
	})

	// Create pod with existing labels
	pod := createTestPod()
	pod.Labels = map[string]string{
		"existing-label": "existing-value",
		"app":            "ray",
	}

	// Call AddMetadataToPod
	scheduler.AddMetadataToPod(ctx, rayCluster, "test-group", pod)

	// Assert scheduler name is set
	a.Equal("kai-scheduler", pod.Spec.SchedulerName)

	// Assert queue label is added
	a.Equal("test-queue", pod.Labels[QueueLabelName])

	// Assert existing labels are preserved
	a.Equal("existing-value", pod.Labels["existing-label"])
	a.Equal("ray", pod.Labels["app"])
}
Review comment: What does this mean? Are you using DRA to mount the same GPU to two different Pods? Additionally, do we need to specify GPUs in the resource requests and limits? If not, KubeRay won’t pass GPU information to Ray, and Ray will be unable to map physical GPU resources in Kubernetes to logical resources within Ray.

Review comment: Can you add comments for the KAI Scheduler–specific configuration so that users can understand what this YAML is for?
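For reference on the second question: an explicit whole-GPU request on a worker group would look like the fragment below, assuming the standard NVIDIA device-plugin resource name nvidia.com/gpu. Whether such a request should accompany the gpu-fraction annotation when using KAI GPU sharing is exactly the open question above; this fragment only illustrates what the reviewer is asking about.

workerGroupSpecs:
  - groupName: worker
    template:
      spec:
        containers:
          - name: ray-worker
            image: rayproject/ray:2.46.0
            resources:
              limits:
                nvidia.com/gpu: "1"   # one whole GPU via the device plugin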