From e350bead7b4721134b5ae1abfcbe4bc80f429dff Mon Sep 17 00:00:00 2001 From: tmontfort Date: Fri, 7 Nov 2025 08:37:24 -0800 Subject: [PATCH 1/2] rootless container stuff --- deploy/cloud/helm/platform/Chart.yaml | 2 +- .../components/operator/templates/deployment.yaml | 2 ++ .../operator/templates/manager-rbac.yaml | 4 +++- .../templates/mpi-run-ssh-keygen-job.yaml | 15 +++++++++++++-- .../helm/platform/components/operator/values.yaml | 6 ++++++ deploy/cloud/helm/platform/templates/kai.yaml | 2 +- deploy/cloud/helm/platform/values.yaml | 13 ++++++++----- 7 files changed, 34 insertions(+), 10 deletions(-) diff --git a/deploy/cloud/helm/platform/Chart.yaml b/deploy/cloud/helm/platform/Chart.yaml index 41f392c7a7..c3d193a892 100644 --- a/deploy/cloud/helm/platform/Chart.yaml +++ b/deploy/cloud/helm/platform/Chart.yaml @@ -37,7 +37,7 @@ dependencies: - name: kai-scheduler version: v0.9.4 repository: oci://ghcr.io/nvidia/kai-scheduler - condition: kai-scheduler.enabled + condition: global.kai-scheduler.enabled - name: grove-charts alias: grove version: v0.1.0-alpha.3 diff --git a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml index ddae88cbb4..09d320035a 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/deployment.yaml @@ -164,5 +164,7 @@ spec: | nindent 10 }} securityContext: runAsNonRoot: true + seccompProfile: + type: RuntimeDefault serviceAccountName: {{ include "dynamo-operator.fullname" . }}-controller-manager terminationGracePeriodSeconds: 10 diff --git a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml index dcc598ccd7..e099c64d20 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml @@ -488,6 +488,7 @@ subjects: - kind: ServiceAccount name: '{{ include "dynamo-operator.fullname" . }}-controller-manager' namespace: '{{ .Release.Namespace }}' +{{- if index .Values.global "kai-scheduler" "enabled" }} --- # ClusterRole for kai-scheduler queue access # This is always a ClusterRole since Queue resources are cluster-scoped @@ -526,4 +527,5 @@ roleRef: subjects: - kind: ServiceAccount name: '{{ include "dynamo-operator.fullname" . }}-controller-manager' - namespace: '{{ .Release.Namespace }}' \ No newline at end of file + namespace: '{{ .Release.Namespace }}' +{{- end }} \ No newline at end of file diff --git a/deploy/cloud/helm/platform/components/operator/templates/mpi-run-ssh-keygen-job.yaml b/deploy/cloud/helm/platform/components/operator/templates/mpi-run-ssh-keygen-job.yaml index 4e50b41527..a36c370862 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/mpi-run-ssh-keygen-job.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/mpi-run-ssh-keygen-job.yaml @@ -44,8 +44,9 @@ spec: {{- end }} securityContext: runAsNonRoot: true - runAsUser: 65534 - fsGroup: 65534 + allowPrivilegeEscalation: false + seccompProfile: + type: RuntimeDefault initContainers: - name: keygen image: bitnamisecure/git:latest @@ -57,6 +58,11 @@ spec: value: "{{ .Values.dynamo.mpiRun.secretName }}" - name: NAMESPACE value: "{{ .Release.Namespace }}" + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL command: - /bin/bash - -e @@ -71,6 +77,11 @@ spec: volumeMounts: - name: shared mountPath: /shared + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL env: - name: SECRET_NAME value: "{{ .Values.dynamo.mpiRun.secretName }}" diff --git a/deploy/cloud/helm/platform/components/operator/values.yaml b/deploy/cloud/helm/platform/components/operator/values.yaml index 168283e121..1717ea6de6 100644 --- a/deploy/cloud/helm/platform/components/operator/values.yaml +++ b/deploy/cloud/helm/platform/components/operator/values.yaml @@ -47,7 +47,10 @@ controllerManager: - --logtostderr=true - --v=0 containerSecurityContext: + runAsNonRoot: true allowPrivilegeEscalation: false + seccompProfile: + type: RuntimeDefault capabilities: drop: - ALL @@ -68,7 +71,10 @@ controllerManager: - --leader-elect - --leader-election-id=dynamo.nko.nvidia.com containerSecurityContext: + runAsNonRoot: true allowPrivilegeEscalation: false + seccompProfile: + type: RuntimeDefault capabilities: drop: - ALL diff --git a/deploy/cloud/helm/platform/templates/kai.yaml b/deploy/cloud/helm/platform/templates/kai.yaml index af1a082201..c0c2f18856 100644 --- a/deploy/cloud/helm/platform/templates/kai.yaml +++ b/deploy/cloud/helm/platform/templates/kai.yaml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. --- -{{- if .Capabilities.APIVersions.Has "scheduling.run.ai/v2" }} +{{- if and (index .Values.global "kai-scheduler" "enabled") (.Capabilities.APIVersions.Has "scheduling.run.ai/v2") }} {{- /* Create parent queue first */ -}} {{- $defaultQueue := lookup "scheduling.run.ai/v2" "Queue" "" "dynamo-default" }} diff --git a/deploy/cloud/helm/platform/values.yaml b/deploy/cloud/helm/platform/values.yaml index e5519676ec..c32fe9976a 100644 --- a/deploy/cloud/helm/platform/values.yaml +++ b/deploy/cloud/helm/platform/values.yaml @@ -14,6 +14,13 @@ # limitations under the License. # Used to generate top-level secrets (overridden by custom-values.yaml) +# Global configuration shared across all subcharts +global: + # Kai Scheduler integration + kai-scheduler: + # -- Whether kai-scheduler is enabled. This value is shared across all charts and controls both the kai-scheduler deployment and the operator's queue RBAC permissions. + enabled: false + # Subcharts configuration # Dynamo operator configuration @@ -29,6 +36,7 @@ dynamo-operator: # -- URL for the Model Express server if not deployed by this helm chart. This is ignored if Model Express server is installed by this helm chart (global.model-express.enabled is true). modelExpressURL: "" + # -- Namespace access controls for the operator namespaceRestriction: # -- Whether to restrict operator to specific namespaces. By default, the operator will run with cluster-wide permissions. Only 1 instance of the operator should be deployed in the cluster. If you want to deploy multiple operator instances, you can set this to true and specify the target namespace (by default, the target namespace is the helm release namespace). @@ -148,11 +156,6 @@ grove: # -- Whether to enable Grove for multi-node inference coordination, if enabled, the Grove operator will be deployed cluster-wide enabled: false -# Kai Scheduler component - advanced workload scheduling -kai-scheduler: - # -- Whether to enable Kai Scheduler for intelligent resource allocation, if enabled, the Kai Scheduler operator will be deployed cluster-wide - enabled: false - # etcd configuration - distributed key-value store for operator state etcd: From 13b034cf6639ab2a78b36483183ec8eded926bf9 Mon Sep 17 00:00:00 2001 From: tmontfort Date: Fri, 7 Nov 2025 09:29:26 -0800 Subject: [PATCH 2/2] fix ssh keygen job --- .../components/operator/templates/mpi-run-ssh-keygen-job.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deploy/cloud/helm/platform/components/operator/templates/mpi-run-ssh-keygen-job.yaml b/deploy/cloud/helm/platform/components/operator/templates/mpi-run-ssh-keygen-job.yaml index a36c370862..3531565900 100644 --- a/deploy/cloud/helm/platform/components/operator/templates/mpi-run-ssh-keygen-job.yaml +++ b/deploy/cloud/helm/platform/components/operator/templates/mpi-run-ssh-keygen-job.yaml @@ -44,7 +44,8 @@ spec: {{- end }} securityContext: runAsNonRoot: true - allowPrivilegeEscalation: false + runAsUser: 65534 + fsGroup: 65534 seccompProfile: type: RuntimeDefault initContainers: