llm-d · kalantar · Oct 1, 2025 · Oct 1, 2025 · Oct 1, 2025 · Oct 1, 2025
diff --git a/charts/harness/.helmignore b/charts/harness/.helmignore
@@ -0,0 +1,24 @@
+# Patterns to ignore when building packages.
+# This supports shell glob matching, relative path matching, and
+# negation (prefixed with !). Only one pattern per line.
+.DS_Store
+# Common VCS dirs
+.git/
+.gitignore
+.bzr/
+.bzrignore
+.hg/
+.hgignore
+.svn/
+# Common backup files
+*.swp
+*.bak
+*.tmp
+*.orig
+*~
+# Various IDEs
+.project
+.idea/
+*.tmproj
+.vscode/
+
diff --git a/charts/harness/Chart.yaml b/charts/harness/Chart.yaml
@@ -0,0 +1,40 @@
+apiVersion: v2
+# Chart name. Templates use it (via the "harness.name" helper) as the default
+# base for the names of the ServiceAccount, Role and RoleBindings.
+name: llm-d-benchmark
+description: A Helm chart for the experiment harness in llm-d-benchmark
+
+# A chart can be either an 'application' or a 'library' chart.
+#
+# Application charts are a collection of templates that can be packaged into versioned archives
+# to be deployed.
+#
+# Library charts provide useful utilities or functions for the chart developer. They're included as
+# a dependency of application charts to inject those utilities and functions into the rendering
+# pipeline. Library charts do not define any templates and therefore cannot be deployed.
+type: application
+
+# This is the chart version. This version number should be incremented each time you make changes
+# to the chart and its templates, including the app version.
+# Versions are expected to follow Semantic Versioning (https://semver.org/)
+version: "v0.0.1"
+
+# This is the version number of the application being deployed. This version number should be
+# incremented each time you make changes to the application. Versions are not expected to
+# follow Semantic Versioning. They should reflect the version the application is using.
+# It is recommended to use it with quotes.
+appVersion: "v0.3.0"
+
+maintainers:
+  - name: "Michael Kalantar"
+    email: "[email protected]"
+    url: "https://github.com/kalantar"
+
+sources:
+  - https://github.com/llm-d/llm-d-benchmark
+
+# dependencies:
+#   - name: common
+#     repository: https://charts.bitnami.com/bitnami
+#     tags:
+#       - bitnami-common
+#     version: "2.27.0"
+
+
diff --git a/charts/harness/templates/_helpers.tpl b/charts/harness/templates/_helpers.tpl
@@ -0,0 +1,31 @@
+{{/*
+Base name for the chart's resources: .Values.nameOverride when it is set,
+otherwise .Chart.Name. The result is cut to 63 characters and a trailing "-"
+is removed.
+*/}}
+{{- define "harness.name" -}}
+{{- $base := default .Chart.Name .Values.nameOverride -}}
+{{- $base | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+
+{{/*
+Chart name and version joined as "<name>-<version>", as used by the chart label.
+Any "+" is replaced with "_", and the result is truncated to 63 characters
+because Kubernetes label values are limited to that length.
+*/}}
+{{- define "harness.chart" -}}
+{{- $nameVersion := printf "%s-%s" .Chart.Name .Chart.Version -}}
+{{- $nameVersion | replace "+" "_" | trunc 63 | trimSuffix "-" }}
+{{- end }}
+
+{{/*
+Common labels for the resources managed by this chart:
+  - helm.sh/chart                 chart name and version ("harness.chart")
+  - app.kubernetes.io/version     emitted only when the chart has an appVersion
+  - app.kubernetes.io/managed-by  the releasing service (.Release.Service)
+*/}}
+{{- define "harness.labels" -}}
+helm.sh/chart: {{ include "harness.chart" . }}
+{{- with .Chart.AppVersion }}
+app.kubernetes.io/version: {{ quote . }}
+{{- end }}
+app.kubernetes.io/managed-by: {{ .Release.Service }}
+{{- end }}
+
+{{/*
+Lower-case the string passed as context and replace every "." and "/" with "-".
+*/}}
+{{- define "harness.sanitizeString" -}}
+{{- . | lower | replace "." "-" | replace "/" "-" -}}
+{{- end -}}
diff --git a/charts/harness/templates/harness-pod.yaml b/charts/harness/templates/harness-pod.yaml
@@ -0,0 +1,104 @@
+# Launcher Pod for the selected harness (.Values.harness.type). It runs a single
+# "harness" container under the chart's "-runner" ServiceAccount and is never
+# restarted (restartPolicy: Never).
+apiVersion: v1
+kind: Pod
+metadata:
+  name: {{ .Values.harness.type }}-launcher
+  labels:
+    app: {{ .Values.harness.type }}-launcher
+    {{- include "harness.labels" . | nindent 4 }}
+spec:
+  serviceAccountName: {{ include "harness.name" . }}-runner
+  containers:
+  - name: harness
+    image: "{{ .Values.harness.image.registry }}/{{ .Values.harness.image.repository }}/{{ .Values.harness.image.name }}:{{ .Values.harness.image.tag }}"
+    imagePullPolicy: {{ .Values.harness.image.pullPolicy }}
+    # NOTE(review): the container runs as root (UID 0), while this chart binds
+    # the runner ServiceAccount to the OpenShift "restricted" SCC. Confirm the
+    # Pod is admitted as intended on OpenShift clusters.
+    securityContext:
+      runAsUser: 0
+    command: ["sh", "-c"]
+    # .Values.harness.args is rendered verbatim as the argument list for `sh -c`.
+    args:
+    {{- toYaml .Values.harness.args | nindent 4 }}
+    env:
+    # Experiment / harness selection.
+    - name: LLMDBENCH_RUN_EXPERIMENT_LAUNCHER
+      value: "1"
+    - name: LLMDBENCH_RUN_EXPERIMENT_ANALYZE_LOCALLY
+      value: "0"
+    - name: LLMDBENCH_RUN_EXPERIMENT_HARNESS
+      value: "{{ .Values.harness.type }}-llm-d-benchmark.sh"
+    - name: LLMDBENCH_RUN_EXPERIMENT_ANALYZER
+      value: "{{ .Values.harness.type }}-analyze_results.sh"
+    - name: LLMDBENCH_RUN_EXPERIMENT_HARNESS_WORKLOAD_NAME
+      value: "{{ .Values.experiment.profile.name }}"
+    - name: LLMDBENCH_RUN_EXPERIMENT_ID
+      value: "{{ .Values.experiment.identifier }}"
+    - name: LLMDBENCH_HARNESS_NAME
+      value: "{{ .Values.harness.type }}"
+    # Results and work directories share one path under the /requests mount.
+    - name: LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR
+      value: "/requests/{{ .Values.harness.type }}_{{ .Values.experiment.identifier }}_{{ .Values.stack.name }}"
+    - name: LLMDBENCH_CONTROL_WORK_DIR
+      value: "/requests/{{ .Values.harness.type }}_{{ .Values.experiment.identifier }}_{{ .Values.stack.name }}"
+    - name: LLMDBENCH_HARNESS_NAMESPACE
+      value: "{{ .Release.Namespace }}"
+    - name: LLMDBENCH_HARNESS_STACK_TYPE
+      value: "{{ .Values.stack.type }}"
+    - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL
+      value: "{{ .Values.stack.endpointUrl }}"
+    # Derived from the model name (lower-cased, "." and "/" replaced by "-").
+    - name: LLMDBENCH_HARNESS_STACK_NAME
+      value: {{ include "harness.sanitizeString" .Values.stack.model | quote }}
+    - name: LLMDBENCH_DEPLOY_METHODS
+      value: "{{ .Values.stack.deployMethod }}"
+    - name: LLMDBENCH_MAGIC_ENVAR
+      value: "harness_pod"
+
+    # Image coordinates. Quoted so each value always renders as a YAML string,
+    # which is what an env var "value" must be.
+    - name: LLMDBENCH_LLMD_IMAGE_REGISTRY
+      value: {{ .Values.harness.image.registry | quote }}
+    - name: LLMDBENCH_LLMD_IMAGE_REPO
+      value: {{ .Values.harness.image.repository | quote }}
+    - name: LLMDBENCH_LLMD_IMAGE_NAME
+      value: {{ .Values.harness.image.name | quote }}
+    - name: LLMDBENCH_LLMD_IMAGE_TAG
+      value: {{ .Values.harness.image.tag | quote }}
+    # Extra environment variables from .Values.harness.extraEnv. A list of
+    # {name, value} entries is expected; a single {name, value} map is still
+    # accepted so values files written for the earlier form keep rendering.
+    {{- $extraEnv := .Values.harness.extraEnv }}
+    {{- if kindIs "map" $extraEnv }}
+    {{- $extraEnv = list $extraEnv }}
+    {{- end }}
+    {{- range $extraEnv }}
+    - name: {{ .name }}
+      value: "{{ .value }}"
+    {{- end }}
+
+    #  TBD add_env_vars_to_pod $LLMDBENCH_CONTROL_ENV_VAR_LIST_TO_POD
+    # The replica, affinity and parallelism values below are currently fixed
+    # in the template rather than read from values.
+    - name: LLMDBENCH_DEPLOY_CURRENT_MODEL
+      value: "{{ .Values.stack.model }}"
+    - name: LLMDBENCH_DEPLOY_CURRENT_MODELID
+      value: {{ include "harness.sanitizeString" .Values.stack.model | quote }}
+    - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS
+      value: "0"
+    - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS
+      value: "2"
+    - name: LLMDBENCH_VLLM_COMMON_AFFINITY
+      value: "nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3"
+    - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM
+      value: "4"
+    - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM
+      value: "1"
+    - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM
+      value: "1"
+    - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM
+      value: "1"
+
+    # The token is read from key HF_TOKEN of the Secret named "hf-secret".
+    - name: HF_TOKEN_SECRET
+      value: "hf-secret"
+    - name: HUGGING_FACE_HUB_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-secret
+          key: HF_TOKEN
+    resources:
+      {{- toYaml .Values.harness.resources | nindent 6 }}
+    volumeMounts:
+    # PVC-backed directory that holds the results/work directories set above.
+    - name: results
+      mountPath: /requests
+    # Workload profiles for this harness type, supplied by a ConfigMap.
+    - name: {{ .Values.harness.type }}-profiles
+      mountPath: /workspace/profiles/{{ .Values.harness.type }}
+  volumes:
+  - name: results
+    persistentVolumeClaim:
+      claimName: {{ .Values.harness.resultsPVC }}
+  - name: {{ .Values.harness.type }}-profiles
+    configMap:
+      name: {{ .Values.harness.type }}-profiles
+  restartPolicy: Never
diff --git a/charts/harness/templates/harness-role.yaml b/charts/harness/templates/harness-role.yaml
@@ -0,0 +1,19 @@
+# Namespaced permissions for the harness: manage batch Jobs and read
+# ServiceAccounts, Pods and Pod logs.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: {{ printf "%s-job-creator" (include "harness.name" .) }}
+  labels:
+    {{- include "harness.labels" . | nindent 4 }}
+rules:
+  # Create, inspect, modify and delete Jobs.
+  - apiGroups:
+      - batch
+    resources:
+      - jobs
+    verbs:
+      - create
+      - get
+      - list
+      - watch
+      - delete
+      - patch
+      - update
+  # Read individual ServiceAccounts.
+  - apiGroups:
+      - ""
+    resources:
+      - serviceaccounts
+    verbs:
+      - get
+  # Read and watch Pods.
+  - apiGroups:
+      - ""
+    resources:
+      - pods
+    verbs:
+      - get
+      - list
+      - watch
+  # Read Pod logs.
+  - apiGroups:
+      - ""
+    resources:
+      - pods/log
+    verbs:
+      - get
diff --git a/charts/harness/templates/harness-rolebinding.yaml b/charts/harness/templates/harness-rolebinding.yaml
@@ -0,0 +1,27 @@
+# Binds the chart's "-job-creator" Role to the chart's "-runner" ServiceAccount.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ printf "%s-job-creator-binding" (include "harness.name" .) }}
+  labels:
+    {{- include "harness.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: Role
+  name: {{ printf "%s-job-creator" (include "harness.name" .) }}
+subjects:
+  - kind: ServiceAccount
+    name: {{ printf "%s-runner" (include "harness.name" .) }}
+---
+# Binds the ClusterRole "system:openshift:scc:restricted" to the chart's
+# "-runner" ServiceAccount.
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: {{ printf "%s-restricted-scc" (include "harness.name" .) }}
+  labels:
+    {{- include "harness.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: system:openshift:scc:restricted
+subjects:
+  - kind: ServiceAccount
+    name: {{ printf "%s-runner" (include "harness.name" .) }}
diff --git a/charts/harness/templates/harness-sa.yaml b/charts/harness/templates/harness-sa.yaml
@@ -0,0 +1,6 @@
+# ServiceAccount named "<harness.name>-runner", carrying the chart's common labels.
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  labels:
+    {{- include "harness.labels" . | nindent 4 }}
+  name: {{ printf "%s-runner" (include "harness.name" .) }}