Skip to content

Commit 53c3988

Browse files
authored
fix: use function parameter instead of env var for Container Toolkit (#441)
* fix: use function parameter instead of env var for Container Toolkit config
* go fmt
1 parent 2de63db commit 53c3988

File tree

6 files changed

+27
-34
lines changed

6 files changed

+27
-34
lines changed

cmd/main.go

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ var alertEvaluator *alert.AlertEvaluator
9999
var schedulerConfigPath string
100100
var alertEvaluatorReady chan struct{}
101101
var enableAutoExpander bool
102-
var compatibleWithNvidiaOperator bool
102+
var compatibleWithNvidiaContainerToolkit bool
103103

104104
func init() {
105105
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
@@ -151,16 +151,11 @@ func main() {
151151
"refer https://prometheus.io/docs/alerting/latest/configuration")
152152
flag.BoolVar(&enableAutoExpander, "enable-auto-expander", false, "if turn on auto expander, "+
153153
"TensorFusion will auto expand Nodes then Pending Pods which caused by insufficient GPU resources found")
154-
flag.BoolVar(&compatibleWithNvidiaOperator, "compatible-with-nvidia-operator", false,
155-
"if enabled, node discovery will wait for NVIDIA GPU Operator toolkit-ready validation before starting")
154+
flag.BoolVar(&compatibleWithNvidiaContainerToolkit, "compatible-with-nvidia-container-toolkit", false,
155+
"if enabled, node discovery will wait for NVIDIA Container Toolkit toolkit-ready validation before starting")
156156

157157
klog.InitFlags(nil)
158158
flag.Parse()
159-
160-
// Set environment variable for utils package to read
161-
if compatibleWithNvidiaOperator {
162-
_ = os.Setenv(constants.CompatibleWithNvidiaOperatorEnv, constants.TrueStringValue)
163-
}
164159
ctrl.SetLogger(klog.NewKlogr())
165160
ctx := context.Background()
166161

@@ -401,11 +396,12 @@ func startCustomResourceController(
401396
}
402397

403398
if err = (&controller.GPUNodeReconciler{
404-
Client: mgr.GetClient(),
405-
Scheme: mgr.GetScheme(),
406-
Recorder: mgr.GetEventRecorderFor("GPUNode"),
407-
Allocator: allocator,
408-
Expander: nodeExpander,
399+
Client: mgr.GetClient(),
400+
Scheme: mgr.GetScheme(),
401+
Recorder: mgr.GetEventRecorderFor("GPUNode"),
402+
Allocator: allocator,
403+
Expander: nodeExpander,
404+
CompatibleWithNvidiaContainerToolkit: compatibleWithNvidiaContainerToolkit,
409405
}).SetupWithManager(mgr); err != nil {
410406
setupLog.Error(err, "unable to create controller", "controller", "GPUNode")
411407
os.Exit(1)

internal/autoscaler/autoscaler_suite_test.go

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,10 @@ var _ = BeforeSuite(func() {
181181
Expect(err).ToNot(HaveOccurred())
182182

183183
err = (&controller.GPUNodeReconciler{
184-
Client: mgr.GetClient(),
185-
Scheme: mgr.GetScheme(),
186-
Recorder: mgr.GetEventRecorderFor("GPUNode"),
184+
Client: mgr.GetClient(),
185+
Scheme: mgr.GetScheme(),
186+
Recorder: mgr.GetEventRecorderFor("GPUNode"),
187+
CompatibleWithNvidiaContainerToolkit: false,
187188
}).SetupWithManager(mgr)
188189
Expect(err).ToNot(HaveOccurred())
189190

internal/controller/gpunode_controller.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,10 +48,11 @@ import (
4848
// GPUNodeReconciler reconciles a GPUNode object
4949
type GPUNodeReconciler struct {
5050
client.Client
51-
Scheme *runtime.Scheme
52-
Recorder record.EventRecorder
53-
Allocator *gpuallocator.GpuAllocator
54-
Expander *expander.NodeExpander
51+
Scheme *runtime.Scheme
52+
Recorder record.EventRecorder
53+
Allocator *gpuallocator.GpuAllocator
54+
Expander *expander.NodeExpander
55+
CompatibleWithNvidiaContainerToolkit bool
5556
}
5657

5758
// +kubebuilder:rbac:groups=tensor-fusion.ai,resources=gpunodes,verbs=get;list;watch;create;update;patch;delete
@@ -287,7 +288,7 @@ func (r *GPUNodeReconciler) reconcileNodeDiscoveryJob(
287288
})
288289
tmpl.Spec.EnableServiceLinks = ptr.To(false)
289290

290-
utils.AddTFNodeDiscoveryConfAfterTemplate(ctx, &tmpl, pool, gpunode.Name)
291+
utils.AddTFNodeDiscoveryConfAfterTemplate(ctx, &tmpl, pool, gpunode.Name, r.CompatibleWithNvidiaContainerToolkit)
291292

292293
// create node-discovery job
293294
job := &batchv1.Job{

internal/controller/suite_test.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -182,10 +182,11 @@ var _ = BeforeSuite(func() {
182182
Expect(err).ToNot(HaveOccurred())
183183

184184
err = (&GPUNodeReconciler{
185-
Client: mgr.GetClient(),
186-
Scheme: mgr.GetScheme(),
187-
Recorder: mgr.GetEventRecorderFor("GPUNode"),
188-
Allocator: allocator,
185+
Client: mgr.GetClient(),
186+
Scheme: mgr.GetScheme(),
187+
Recorder: mgr.GetEventRecorderFor("GPUNode"),
188+
Allocator: allocator,
189+
CompatibleWithNvidiaContainerToolkit: false,
189190
}).SetupWithManager(mgr)
190191
Expect(err).ToNot(HaveOccurred())
191192

internal/utils/compose.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -724,7 +724,7 @@ func composeVectorContainer(spec *v1.PodSpec, pool *tfv1.GPUPool) {
724724
}
725725
}
726726

727-
func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTemplateSpec, pool *tfv1.GPUPool, gpuNodeName string) {
727+
func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTemplateSpec, pool *tfv1.GPUPool, gpuNodeName string, compatibleWithNvidiaContainerToolkit bool) {
728728
tmpl.Spec.RestartPolicy = v1.RestartPolicyOnFailure
729729
serviceAccountName := GetSelfServiceAccountNameShort()
730730
if serviceAccountName == "" {
@@ -745,8 +745,8 @@ func AddTFNodeDiscoveryConfAfterTemplate(ctx context.Context, tmpl *v1.PodTempla
745745
tmpl.Spec.Containers[0].Image = pool.Spec.ComponentConfig.NodeDiscovery.Image
746746
}
747747

748-
// Add initContainer to wait for NVIDIA GPU Operator toolkit-ready validation
749-
if IsCompatibleWithNvidiaOperator() {
748+
// Add initContainer to wait for NVIDIA Container Toolkit toolkit-ready validation
749+
if compatibleWithNvidiaContainerToolkit {
750750
initContainerImage := pool.Spec.ComponentConfig.NodeDiscovery.Image
751751
if initContainerImage == "" {
752752
// Use the same image as the main container if not specified

internal/utils/config.go

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -181,8 +181,6 @@ var nvidiaOperatorProgressiveMigrationEnv = os.Getenv(constants.NvidiaOperatorPr
181181

182182
var isLicensedEnv = os.Getenv(constants.UsingCommercialComponentEnv) == constants.TrueStringValue
183183

184-
var compatibleWithNvidiaOperatorEnv = os.Getenv(constants.CompatibleWithNvidiaOperatorEnv) == constants.TrueStringValue
185-
186184
func init() {
187185
if isLicensedEnv {
188186
ctrl.Log.Info("Enabling none open source components, please make sure you are in trial stage or have bought commercial license. Contact us: [email protected]")
@@ -197,10 +195,6 @@ func IsProgressiveMigration() bool {
197195
return nvidiaOperatorProgressiveMigrationEnv
198196
}
199197

200-
func IsCompatibleWithNvidiaOperator() bool {
201-
return compatibleWithNvidiaOperatorEnv
202-
}
203-
204198
// For test purpose only
205199
func SetProgressiveMigration(isProgressiveMigration bool) {
206200
nvidiaOperatorProgressiveMigrationEnv = isProgressiveMigration

0 commit comments

Comments
 (0)