Skip to content

Commit 2de63db

Browse files
authored
fix: gpu operator setup nvidia container toolkit too late issue, wait it complete (#440)
1 parent d90283f commit 2de63db

File tree

4 files changed

+57
-1
lines changed

4 files changed

+57
-1
lines changed

cmd/main.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ var alertEvaluator *alert.AlertEvaluator
9999
var schedulerConfigPath string
100100
var alertEvaluatorReady chan struct{}
101101
var enableAutoExpander bool
102+
var compatibleWithNvidiaOperator bool
102103

103104
func init() {
104105
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
@@ -150,9 +151,16 @@ func main() {
150151
"refer https://prometheus.io/docs/alerting/latest/configuration")
151152
flag.BoolVar(&enableAutoExpander, "enable-auto-expander", false, "if turn on auto expander, "+
152153
"TensorFusion will auto expand Nodes then Pending Pods which caused by insufficient GPU resources found")
154+
flag.BoolVar(&compatibleWithNvidiaOperator, "compatible-with-nvidia-operator", false,
155+
"if enabled, node discovery will wait for NVIDIA GPU Operator toolkit-ready validation before starting")
153156

154157
klog.InitFlags(nil)
155158
flag.Parse()
159+
160+
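// Set environment variable for utils package to read.
// NOTE(review): internal/utils/config.go captures this variable in a package-level var
// (compatibleWithNvidiaOperatorEnv), and Go initializes the package-level vars of imported
// packages before main() runs. Setting the variable here therefore looks too late to change
// what IsCompatibleWithNvidiaOperator() returns in this process unless the variable is also
// set in the container environment — confirm, and consider reading it lazily instead.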
161+
if compatibleWithNvidiaOperator {
162+
_ = os.Setenv(constants.CompatibleWithNvidiaOperatorEnv, constants.TrueStringValue)
163+
}
156164
ctrl.SetLogger(klog.NewKlogr())
157165
ctx := context.Background()
158166

internal/constants/env.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@ const (
1717
NvidiaOperatorProgressiveMigrationEnv = "NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION"
1818
RunHypervisorUtilGPUAllocatable = "RUN_HYPERVISOR_UTIL_GPU_ALLOCATABLE"
1919

20-
UsingCommercialComponentEnv = "COMMERCIAL_PLAN"
20+
UsingCommercialComponentEnv = "COMMERCIAL_PLAN"
21+
CompatibleWithNvidiaOperatorEnv = "COMPATIBLE_WITH_NVIDIA_OPERATOR"
2122
)
2223

2324
// General envs used in compose components manifest

internal/utils/compose.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -745,6 +745,47 @@ func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTempla
745745
tmpl.Spec.Containers[0].Image = pool.Spec.ComponentConfig.NodeDiscovery.Image
746746
}
747747

748+
// Add initContainer to wait for NVIDIA GPU Operator toolkit-ready validation
749+
if IsCompatibleWithNvidiaOperator() {
750+
initContainerImage := pool.Spec.ComponentConfig.NodeDiscovery.Image
751+
if initContainerImage == "" {
752+
// Use the same image as the main container if not specified
753+
initContainerImage = tmpl.Spec.Containers[0].Image
754+
}
755+
756+
initContainer := v1.Container{
757+
Name: "toolkit-validation",
758+
Image: initContainerImage,
759+
Command: []string{"sh", "-c"},
760+
Args: []string{
761+
"until [ -f /run/nvidia/validations/toolkit-ready ]; do echo waiting for nvidia container stack to be setup; sleep 5; done",
762+
},
763+
SecurityContext: &v1.SecurityContext{
764+
Privileged: ptr.To(true),
765+
},
766+
VolumeMounts: []v1.VolumeMount{
767+
{
768+
Name: "run-nvidia-validations",
769+
MountPath: "/run/nvidia/validations",
770+
MountPropagation: ptr.To(v1.MountPropagationHostToContainer),
771+
},
772+
},
773+
}
774+
775+
tmpl.Spec.InitContainers = append(tmpl.Spec.InitContainers, initContainer)
776+
777+
// Add volume for NVIDIA validations
778+
tmpl.Spec.Volumes = append(tmpl.Spec.Volumes, v1.Volume{
779+
Name: "run-nvidia-validations",
780+
VolumeSource: v1.VolumeSource{
781+
HostPath: &v1.HostPathVolumeSource{
782+
Path: "/run/nvidia/validations",
783+
Type: ptr.To(v1.HostPathDirectoryOrCreate),
784+
},
785+
},
786+
})
787+
}
788+
748789
tmpl.Spec.Containers[0].Env = append(tmpl.Spec.Containers[0].Env, v1.EnvVar{
749790
Name: constants.NodeDiscoveryReportGPUNodeEnvName,
750791
Value: gpuNodeName,

internal/utils/config.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,8 @@ var nvidiaOperatorProgressiveMigrationEnv = os.Getenv(constants.NvidiaOperatorPr
181181

182182
var isLicensedEnv = os.Getenv(constants.UsingCommercialComponentEnv) == constants.TrueStringValue
183183

184+
var compatibleWithNvidiaOperatorEnv = os.Getenv(constants.CompatibleWithNvidiaOperatorEnv) == constants.TrueStringValue
185+
184186
func init() {
185187
if isLicensedEnv {
186188
ctrl.Log.Info("Enabling none open source components, please make sure you are in trial stage or have bought commercial license. Contact us: [email protected]")
@@ -195,6 +197,10 @@ func IsProgressiveMigration() bool {
195197
return nvidiaOperatorProgressiveMigrationEnv
196198
}
197199

200+
func IsCompatibleWithNvidiaOperator() bool {
201+
return compatibleWithNvidiaOperatorEnv
202+
}
203+
198204
// For test purpose only
199205
func SetProgressiveMigration(isProgressiveMigration bool) {
200206
nvidiaOperatorProgressiveMigrationEnv = isProgressiveMigration

0 commit comments

Comments
 (0)