@@ -84,10 +84,6 @@ func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo Tens
84
84
if pod .Annotations == nil {
85
85
pod .Annotations = map [string ]string {}
86
86
}
87
- // add workload to pod annotations just for additional information
88
- // so that users will know which GPU workload this pod binds to
89
- pod .Annotations [constants .WorkloadKey ] = tfInfo .WorkloadName
90
-
91
87
// When it's worker, set workload key to label for triggering workload reconcile
92
88
if tfInfo .Profile .IsLocalGPU {
93
89
if pod .Labels == nil {
@@ -116,7 +112,11 @@ func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo Tens
116
112
pod .Annotations [constants .InjectContainerAnnotation ] = strings .Join (tfInfo .ContainerNames , "," )
117
113
}
118
114
119
- func AppendTFWorkerLabelsAndAnnotationsAfterTemplate (podTmpl * v1.PodTemplate , workload * tfv1.TensorFusionWorkload ) (map [string ]string , map [string ]string ) {
115
+ func AppendTFWorkerLabelsAndAnnotationsAfterTemplate (
116
+ podTmpl * v1.PodTemplate ,
117
+ workload * tfv1.TensorFusionWorkload ,
118
+ containerName string ,
119
+ ) (map [string ]string , map [string ]string ) {
120
120
labels := maps .Clone (podTmpl .Template .Labels )
121
121
if labels == nil {
122
122
labels = map [string ]string {}
@@ -132,6 +132,7 @@ func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(podTmpl *v1.PodTemplate, wo
132
132
annotations [constants .VRAMLimitAnnotation ] = res .Limits .Vram .String ()
133
133
annotations [constants .TFLOPSRequestAnnotation ] = res .Requests .Tflops .String ()
134
134
annotations [constants .VRAMRequestAnnotation ] = res .Requests .Vram .String ()
135
+ annotations [constants .InjectContainerAnnotation ] = containerName
135
136
if workload .Spec .Qos == "" {
136
137
annotations [constants .QoSLevelAnnotation ] = string (tfv1 .QoSMedium )
137
138
} else {
@@ -595,7 +596,7 @@ func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTempla
595
596
}
596
597
}
597
598
598
- func AddWorkerConfAfterTemplate (ctx context.Context , spec * v1.PodSpec , workerConfig * tfv1.WorkerConfig , hypervisorConfig * tfv1.HypervisorConfig , workload * tfv1.TensorFusionWorkload ) {
599
+ func AddWorkerConfAfterTemplate (ctx context.Context , spec * v1.PodSpec , workerConfig * tfv1.WorkerConfig , hypervisorConfig * tfv1.HypervisorConfig , workload * tfv1.TensorFusionWorkload ) string {
599
600
// NOTE: need to set environment variable to make all GPUs visible to the worker,
600
601
// vgpu.rs limiter will limit to specific devices after Pod started
601
602
spec .Containers [0 ].Name = constants .TFContainerNameWorker
@@ -689,4 +690,6 @@ func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerCon
689
690
if len (spec .Containers [0 ].Resources .Requests ) == 0 {
690
691
spec .Containers [0 ].Resources .Requests = workerDefaultRequests
691
692
}
693
+
694
+ return spec .Containers [0 ].Name
692
695
}
0 commit comments