From 4644ab6a3ae553772ae436f62bfbbd4ee7cc90a5 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Wed, 1 Oct 2025 07:42:18 -0400 Subject: [PATCH 01/44] values files Signed-off-by: Michael Kalantar --- .../inference-scheduling/gaie-values.yaml | 150 +++++++++++ .../inference-scheduling/gateway-values.yaml | 8 + .../inference-scheduling/ms-values.yaml | 237 ++++++++++++++++++ 3 files changed, 395 insertions(+) create mode 100644 tekton-poc/examples/inference-scheduling/gaie-values.yaml create mode 100644 tekton-poc/examples/inference-scheduling/gateway-values.yaml create mode 100644 tekton-poc/examples/inference-scheduling/ms-values.yaml diff --git a/tekton-poc/examples/inference-scheduling/gaie-values.yaml b/tekton-poc/examples/inference-scheduling/gaie-values.yaml new file mode 100644 index 00000000..2d84f723 --- /dev/null +++ b/tekton-poc/examples/inference-scheduling/gaie-values.yaml @@ -0,0 +1,150 @@ +inferenceExtension: + replicas: 1 + image: + # Either image will work, you just need to bring the correct plugins per image. In this example we will bring the upstream default plugin + ################### + name: llm-d-inference-scheduler + hub: ghcr.io/llm-d + tag: v0.2.1 + pullPolicy: Always + extProcPort: 9002 + extraContainerPorts: + - name: zmq + containerPort: 5557 + protocol: TCP + extraServicePorts: + - name: zmq + port: 5557 + targetPort: 5557 + protocol: TCP + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + pluginsConfigFile: "inf-sche-none.yaml" + pluginsCustomConfig: + inf-sche-none.yaml: | + # Sample EPP configuration for running without P/D with no scorers + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-none.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: prefix-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 0 + inf-sche-prefix-kv-queue.yaml: | + # Sample EPP configuration for running without P/D with prefix, kv, and queue scorers + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-prefix-kv-queue.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: prefix-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + - type: kv-cache-scorer + - type: queue-cache-scorer + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 1 + - pluginRef: kv-cache-scorer + weight: 1 + - pluginRef: queue-scorer + weight: 1 + inf-sche-prefix-kv.yaml: | + # Sample EPP configuration for running without P/D with prefix and kv scorers + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-prefix-kv.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: prefix-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + - type: kv-cache-scorer + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 1 + - pluginRef: kv-cache-scorer + weight: 1 + - pluginRef: 
queue-scorer + weight: 1 + inf-sche-prefix.yaml: | + # Sample EPP configuration for running without P/D with prefix scorer with weight of 1 + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-prefix.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: prefix-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 1 + inf-sche-queue.yaml: | + # Sample EPP configuration for running without P/D with no scorers + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-queue.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: queue-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: queue-scorer + weight: 1 + inf-sche-kv.yaml: | + # Sample EPP configuration for running without P/D with no scorers + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-kv.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: kv-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: kv-cache-scorer + weight: 1 +inferencePool: + targetPortNumber: 8000 + modelServerType: vllm + apiVersion: "inference.networking.x-k8s.io/v1alpha2" + modelServers: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: qwen-qwe-1ea37573-en3-0-6b +provider: + name: none diff --git a/tekton-poc/examples/inference-scheduling/gateway-values.yaml b/tekton-poc/examples/inference-scheduling/gateway-values.yaml new file mode 100644 index 00000000..b22f8140 --- /dev/null +++ b/tekton-poc/examples/inference-scheduling/gateway-values.yaml @@ -0,0 +1,8 @@ +gateway: + gatewayClassName: kgateway + service: + type: NodePort + destinationRule: + host: gaie-inference-scheduling-epp.kalantar-is.svc.cluster.local + gatewayParameters: + enabled: true diff --git a/tekton-poc/examples/inference-scheduling/ms-values.yaml b/tekton-poc/examples/inference-scheduling/ms-values.yaml new file mode 100644 index 00000000..b8ad2d45 --- /dev/null +++ b/tekton-poc/examples/inference-scheduling/ms-values.yaml @@ -0,0 +1,237 @@ +fullnameOverride: qwen-qwe-1ea37573-en3-0-6b +multinode: false + +modelArtifacts: + uri: pvc://model-pvc/models/Qwen/Qwen3-0.6B + size: 300Gi + authSecretName: "hf-secret" + name: Qwen/Qwen3-0.6B + +routing: + servicePort: 8000 + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-llmdbench-inference-gateway + proxy: + image: "ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0" + secure: false + connector: nixlv2 + debugLevel: 3 + inferenceModel: + create: true + inferencePool: + create: false + name: mk-qwen-qwe-1ea37573-en3-0-6b-gaie + httpRoute: + create: true + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: mk-qwen-qwe-1ea37573-en3-0-6b-gaie + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + matches: + - path: + 
type: PathPrefix + value: /qwen-qwen3-0-6b/ + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: mk-qwen-qwe-1ea37573-en3-0-6b-gaie + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + + epp: + create: false + +decode: + create: true + replicas: 2 + acceleratorTypes: + labelKey: nvidia.com/gpu.product + labelValues: + - NVIDIA-H100-80GB-HBM3 + parallelism: + data: 1 + tensor: 1 + annotations: + deployed-by: jchen + modelservice: llm-d-benchmark + podAnnotations: + deployed-by: jchen + modelservice: llm-d-benchmark + #no____config + containers: + - name: "vllm" + mountModelVolume: true + image: "ghcr.io/llm-d/llm-d:v0.2.0" + modelCommand: vllmServe + + args: + - "--enforce-eager" + - "--block-size" + - "64" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--tensor-parallel-size" + - "1" + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16000" + env: + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "1" + resources: + limits: + memory: 64Gi + cpu: "16" + + nvidia.com/gpu: "1" + + requests: + memory: 64Gi + cpu: "16" + + nvidia.com/gpu: "1" + + extraConfig: + startupProbe: + httpGet: + path: /health + port: 8200 + failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: 8200 + failureThreshold: 3 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: 8200 + failureThreshold: 3 + periodSeconds: 5 + + ports: + - containerPort: 5557 + protocol: TCP + - containerPort: 8200 + name: metrics + protocol: TCP + volumeMounts: [] + volumes: [] + +prefill: + create: false + replicas: 0 + acceleratorTypes: + labelKey: nvidia.com/gpu.product + labelValues: + - NVIDIA-H100-80GB-HBM3 + parallelism: + data: 1 + tensor: 1 + annotations: + deployed-by: jchen + modelservice: llm-d-benchmark + podAnnotations: + deployed-by: jchen + modelservice: llm-d-benchmark + #no____config + containers: + - name: "vllm" + mountModelVolume: true + image: "ghcr.io/llm-d/llm-d:v0.2.0" + modelCommand: vllmServe + + args: + - "--disable-log-requests" + - "--max-model-len" + - "16000" + - "--tensor-parallel-size" + - "1" + env: + - name: VLLM_IS_PREFILL + value: "1" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "1" + resources: + limits: + memory: 40Gi + cpu: "4" + + nvidia.com/gpu: "0" + + requests: + memory: 40Gi + cpu: "4" + + nvidia.com/gpu: "0" + + extraConfig: + startupProbe: + httpGet: + path: /health + port: 8000 + failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: 8000 + failureThreshold: 3 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: 8000 + failureThreshold: 3 + periodSeconds: 5 + + ports: + - containerPort: 5557 + protocol: TCP + - containerPort: 8200 + name: metrics + protocol: TCP + volumeMounts: [] + volumes: [] \ No newline 
at end of file From 40bf8cdf20e471bab470abac63af4ef8bf39c049 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Wed, 1 Oct 2025 09:11:08 -0400 Subject: [PATCH 02/44] update gateway Signed-off-by: Michael Kalantar --- tekton-poc/examples/inference-scheduling/ms-values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tekton-poc/examples/inference-scheduling/ms-values.yaml b/tekton-poc/examples/inference-scheduling/ms-values.yaml index b8ad2d45..80649823 100644 --- a/tekton-poc/examples/inference-scheduling/ms-values.yaml +++ b/tekton-poc/examples/inference-scheduling/ms-values.yaml @@ -12,7 +12,7 @@ routing: parentRefs: - group: gateway.networking.k8s.io kind: Gateway - name: infra-llmdbench-inference-gateway + name: experiment-gateway-inference-gateway proxy: image: "ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0" secure: false From 9041ccc3029e825c3c738019912f494981c48e19 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Wed, 1 Oct 2025 12:29:10 -0400 Subject: [PATCH 03/44] change model label Signed-off-by: Michael Kalantar --- tekton-poc/examples/inference-scheduling/gaie-values.yaml | 2 +- tekton-poc/examples/inference-scheduling/ms-values.yaml | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tekton-poc/examples/inference-scheduling/gaie-values.yaml b/tekton-poc/examples/inference-scheduling/gaie-values.yaml index 2d84f723..08d28bc6 100644 --- a/tekton-poc/examples/inference-scheduling/gaie-values.yaml +++ b/tekton-poc/examples/inference-scheduling/gaie-values.yaml @@ -145,6 +145,6 @@ inferencePool: modelServers: matchLabels: llm-d.ai/inferenceServing: "true" - llm-d.ai/model: qwen-qwe-1ea37573-en3-0-6b + llm-d.ai/model: qwen-qwen3-0-6b provider: name: none diff --git a/tekton-poc/examples/inference-scheduling/ms-values.yaml b/tekton-poc/examples/inference-scheduling/ms-values.yaml index 80649823..a02e894b 100644 --- a/tekton-poc/examples/inference-scheduling/ms-values.yaml +++ b/tekton-poc/examples/inference-scheduling/ms-values.yaml @@ -1,4 +1,4 @@ -fullnameOverride: qwen-qwe-1ea37573-en3-0-6b +fullnameOverride: qwen-qwen3-0-6b multinode: false modelArtifacts: @@ -22,14 +22,14 @@ routing: create: true inferencePool: create: false - name: mk-qwen-qwe-1ea37573-en3-0-6b-gaie + name: experiment-gaie httpRoute: create: true rules: - backendRefs: - group: inference.networking.x-k8s.io kind: InferencePool - name: mk-qwen-qwe-1ea37573-en3-0-6b-gaie + name: experiment-gaie port: 8000 weight: 1 timeouts: @@ -48,7 +48,7 @@ routing: - backendRefs: - group: inference.networking.x-k8s.io kind: InferencePool - name: mk-qwen-qwe-1ea37573-en3-0-6b-gaie + name: experiment-gaie port: 8000 weight: 1 timeouts: From 698d774c922b19ff3cde7a2f0856f54960ea2af6 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Wed, 1 Oct 2025 14:32:54 -0400 Subject: [PATCH 04/44] harness launcher chart Signed-off-by: Michael Kalantar --- charts/harness/.helmignore | 24 ++ charts/harness/Chart.yaml | 40 +++ charts/harness/templates/_helpers.tpl | 31 +++ charts/harness/templates/harness-pod.yaml | 79 ++++++ charts/harness/templates/harness-role.yaml | 19 ++ .../templates/harness-rolebinding.yaml | 27 ++ charts/harness/templates/harness-sa.yaml | 6 + .../templates/inference-perf-profiles.yaml | 235 ++++++++++++++++++ charts/harness/values.yaml | 38 +++ 9 files changed, 499 insertions(+) create mode 100644 charts/harness/.helmignore create mode 100644 charts/harness/Chart.yaml create mode 100644 charts/harness/templates/_helpers.tpl create mode 100644 
charts/harness/templates/harness-pod.yaml create mode 100644 charts/harness/templates/harness-role.yaml create mode 100644 charts/harness/templates/harness-rolebinding.yaml create mode 100644 charts/harness/templates/harness-sa.yaml create mode 100644 charts/harness/templates/inference-perf-profiles.yaml create mode 100644 charts/harness/values.yaml diff --git a/charts/harness/.helmignore b/charts/harness/.helmignore new file mode 100644 index 00000000..898df488 --- /dev/null +++ b/charts/harness/.helmignore @@ -0,0 +1,24 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ + diff --git a/charts/harness/Chart.yaml b/charts/harness/Chart.yaml new file mode 100644 index 00000000..701fc7e4 --- /dev/null +++ b/charts/harness/Chart.yaml @@ -0,0 +1,40 @@ +apiVersion: v2 +name: llm-d-benchark +description: A Helm chart for the experiment harness in llm-d-benchmark + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: "v0.0.1" + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "v0.3.0" + +maintainers: + - name: "Michael Kalantar" + email: "kalantar@us.ibm.com" + url: "https://github.com/kalantar" + +sources: + - https://github.com/llm-d/llm-d-benchmark + +# dependencies: +# - name: common +# repository: https://charts.bitnami.com/bitnami +# tags: +# - bitnami-common +# version: "2.27.0" + diff --git a/charts/harness/templates/_helpers.tpl b/charts/harness/templates/_helpers.tpl new file mode 100644 index 00000000..aa63cc97 --- /dev/null +++ b/charts/harness/templates/_helpers.tpl @@ -0,0 +1,31 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "harness.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + + +{{/* +Create chart name and version as used by the chart label. +Truncated to 63 characrters because Kubernetes label values are limited to this +*/}} +{{- define "harness.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create common labels for the resources managed by this chart. +*/}} +{{- define "harness.labels" -}} +helm.sh/chart: {{ include "harness.chart" . 
}} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{- define "harness.sanitizeString" -}} +{{- $input := . | lower | replace "." "-" | replace "/" "-" -}} +{{- $input -}} +{{- end -}} \ No newline at end of file diff --git a/charts/harness/templates/harness-pod.yaml b/charts/harness/templates/harness-pod.yaml new file mode 100644 index 00000000..56ce99d4 --- /dev/null +++ b/charts/harness/templates/harness-pod.yaml @@ -0,0 +1,79 @@ +apiVersion: v1 +kind: Pod +metadata: + name: {{ .Values.harness.type }}-launcher + labels: + app: {{ .Values.harness.type }}-launcher +spec: + serviceAccountName: {{ include "harness.name" . }}-runner + containers: + - name: harness + image: "{{ .Values.harness.image.repository }}:{{ .Values.harness.image.tag }}" + imagePullPolicy: {{ .Values.harness.image.pullPolicy }} + securityContext: + runAsUser: 0 + command: ["sh", "-c"] + args: + {{- toYaml .Values.harness.args | nindent 4 }} + env: + - name: LLMDBENCH_RUN_EXPERIMENT_LAUNCHER + value: "1" + - name: LLMDBENCH_RUN_EXPERIMENT_ANALYZE_LOCALLY + value: "0" + - name: LLMDBENCH_RUN_EXPERIMENT_HARNESS + value: "{{ .Values.harness.type }}-llm-d-benchmark.sh" + - name: LLMDBENCH_RUN_EXPERIMENT_ANALYZER + value: "{{ .Values.harness.type }}-analyze_results.sh" + - name: LLMDBENCH_RUN_EXPERIMENT_HARNESS_WORKLOAD_NAME + value: "{{ .Values.experiment.profile.name }}" + - name: LLMDBENCH_RUN_EXPERIMENT_ID + value: "{{ .Values.experiment.identifier }}" + - name: LLMDBENCH_HARNESS_NAME + value: "{{ .Values.harness.type }}" + - name: LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR + value: "/requests/{{ .Values.harness.type }}_{{ .Values.experiment.identifier }}_{{ .Values.stack.name }}" + - name: LLMDBENCH_CONTROL_WORK_DIR + value: "/requests/{{ .Values.harness.type }}_{{ .Values.experiment.identifier }}_{{ .Values.stack.name }}" + - name: LLMDBENCH_HARNESS_NAMESPACE + value: "{{ .Release.Namespace }}" + - name: LLMDBENCH_HARNESS_STACK_TYPE + value: "{{ .Values.stack.type }}" + - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL + value: "{{ .Values.stack.endpointUrl }}" + - name: LLMDBENCH_HARNESS_STACK_NAME + value: {{ include "harness.sanitizeString" .Values.stack.model | quote }} + - name: LLMDBENCH_DEPLOY_METHODS + value: "{{ .Values.stack.deployMethod }}" + - name: LLMDBENCH_MAGIC_ENVAR + value: "harness_pod" + {{- with .Values.harness.extraEnv }} + - name: {{ .name }} + value: "{{ .value }}" + {{- end }} + + # TBD add_env_vars_to_pod $LLMDBENCH_CONTROL_ENV_VAR_LIST_TO_POD + - name: LLMDBENCH_DEPLOY_CURRENT_MODELID + value: "{{ .Values.stack.model }}" + + - name: HF_TOKEN_SECRET + value: "hf-secret" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + resources: + {{- toYaml .Values.harness.resources | nindent 6 }} + volumeMounts: + - name: results + mountPath: /requests + - name: {{ .Values.harness.type }}-profiles + mountPath: /workspace/profiles/{{ .Values.harness.type }} + volumes: + - name: results + persistentVolumeClaim: + claimName: {{ .Values.harness.resultsPVC }} + - name: {{ .Values.harness.type }}-profiles + configMap: + name: {{ .Values.harness.type }}-profiles + restartPolicy: Never \ No newline at end of file diff --git a/charts/harness/templates/harness-role.yaml b/charts/harness/templates/harness-role.yaml new file mode 100644 index 00000000..9ae95bb6 --- /dev/null +++ b/charts/harness/templates/harness-role.yaml @@ -0,0 +1,19 @@ +apiVersion: 
rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "harness.name" . }}-job-creator + labels: + {{- include "harness.labels" . | nindent 4 }} +rules: + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "get", "list", "watch", "delete", "patch", "update"] + - apiGroups: [""] + resources: ["serviceaccounts"] + verbs: ["get"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] diff --git a/charts/harness/templates/harness-rolebinding.yaml b/charts/harness/templates/harness-rolebinding.yaml new file mode 100644 index 00000000..202ab4ff --- /dev/null +++ b/charts/harness/templates/harness-rolebinding.yaml @@ -0,0 +1,27 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "harness.name" . }}-job-creator-binding + labels: + {{- include "harness.labels" . | nindent 4 }} +subjects: + - kind: ServiceAccount + name: {{ include "harness.name" . }}-runner +roleRef: + kind: Role + name: {{ include "harness.name" . }}-job-creator + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "harness.name" . }}-restricted-scc + labels: + {{- include "harness.labels" . | nindent 4 }} +subjects: + - kind: ServiceAccount + name: {{ include "harness.name" . }}-runner +roleRef: + kind: ClusterRole + name: system:openshift:scc:restricted + apiGroup: rbac.authorization.k8s.io diff --git a/charts/harness/templates/harness-sa.yaml b/charts/harness/templates/harness-sa.yaml new file mode 100644 index 00000000..f6a4a83f --- /dev/null +++ b/charts/harness/templates/harness-sa.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "harness.name" . }}-runner + labels: + {{- include "harness.labels" . 
| nindent 4 }} diff --git a/charts/harness/templates/inference-perf-profiles.yaml b/charts/harness/templates/inference-perf-profiles.yaml new file mode 100644 index 00000000..285107c6 --- /dev/null +++ b/charts/harness/templates/inference-perf-profiles.yaml @@ -0,0 +1,235 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: inference-perf-profiles +data: + chatbot_sharegpt.yaml: "load:\n type: constant\n stages:\n - rate: 1\n duration: + 120\n - rate: 2\n duration: 120\n - rate: 4\n duration: 120\n - rate: + 8\n duration: 120\napi:\n type: completion\n streaming: true\nserver:\n type: + vllm\n model_name: {{ .Values.stack.model }}\n base_url: {{ .Values.stack.endpointUrl }}\n + \ ignore_eos: true\ntokenizer:\n pretrained_model_name_or_path: {{ .Values.stack.model }}\ndata:\n + \ type: shareGPT\n input_distribution:\n min: 10 # min length + of the synthetic prompts\n max: 1024 # max length of the synthetic + prompts\n output_distribution:\n min: 10 # min length of the output + to be generated\n max: 1024 # max length of the output to be generated + \nreport:\n request_lifecycle:\n summary: true\n per_stage: true\n per_request: + true\nstorage:\n local_storage:\n path: /workspace" + chatbot_synthetic.yaml: |- + load: + type: constant + stages: + - rate: 1 + duration: 120 + - rate: 2 + duration: 120 + - rate: 4 + duration: 120 + - rate: 8 + duration: 120 + api: + type: completion + streaming: true + server: + type: vllm + model_name: {{ .Values.stack.model }} + base_url: {{ .Values.stack.endpointUrl }} + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: {{ .Values.stack.model }} + data: + type: random + input_distribution: + min: 10 # min length of the synthetic prompts + max: 8192 # max length of the synthetic prompts + mean: 4096 # mean length of the synthetic prompts + std: 2048 # standard deviation of the length of the synthetic prompts + total_count: 1000 # total number of prompts to generate to fit the above mentioned distribution constraints + output_distribution: + min: 10 # min length of the output to be generated + max: 2048 # max length of the output to be generated + mean: 1024 # mean length of the output to be generated + std: 512 # standard deviation of the length of the output to be generated + total_count: 1000 # total number of output lengths to generate to fit the above mentioned distribution constraints + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + storage: + local_storage: + path: /workspace + code_completion_synthetic.yaml: |- + load: + type: constant + stages: + - rate: 1 + duration: 120 + - rate: 2 + duration: 120 + - rate: 4 + duration: 120 + - rate: 8 + duration: 120 + api: + type: completion + streaming: true + server: + type: vllm + model_name: {{ .Values.stack.model }} + base_url: {{ .Values.stack.endpointUrl }} + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: {{ .Values.stack.model }} + data: + type: random + input_distribution: + min: 10 # min length of the synthetic prompts + max: 4096 # max length of the synthetic prompts + mean: 2048 # mean length of the synthetic prompts + std: 1024 # standard deviation of the length of the synthetic prompts + total_count: 1000 # total number of prompts to generate to fit the above mentioned distribution constraints + output_distribution: + min: 10 # min length of the output to be generated + max: 256 # max length of the output to be generated + mean: 128 # mean length of the output to be generated + std: 64 # standard deviation of the length of the 
output to be generated + total_count: 1000 # total number of output lengths to generate to fit the above mentioned distribution constraints + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + storage: + local_storage: + path: /workspace + sanity_random.yaml: |- + load: + type: constant + stages: + - rate: 1 + duration: 30 + api: + type: completion + streaming: true + server: + type: vllm + model_name: {{ .Values.stack.model }} + base_url: {{ .Values.stack.endpointUrl }} + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: {{ .Values.stack.model }} + data: + type: random + input_distribution: + min: 10 # min length of the synthetic prompts + max: 100 # max length of the synthetic prompts + mean: 50 # mean length of the synthetic prompts + std: 10 # standard deviation of the length of the synthetic prompts + total_count: 100 # total number of prompts to generate to fit the above mentioned distribution constraints + output_distribution: + min: 10 # min length of the output to be generated + max: 100 # max length of the output to be generated + mean: 50 # mean length of the output to be generated + std: 10 # standard deviation of the length of the output to be generated + total_count: 100 # total number of output lengths to generate to fit the above mentioned distribution constraints + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + storage: + local_storage: + path: /workspace + shared_prefix_synthetic.yaml: | + load: + type: constant + stages: + - rate: 2 + duration: 50 + - rate: 5 + duration: 50 + # - rate: 8 + # duration: 50 + # - rate: 10 + # duration: 50 + # - rate: 12 + # duration: 50 + # - rate: 15 + # duration: 50 + # - rate: 20 + # duration: 50 + api: + type: completion + streaming: true + server: + type: vllm + model_name: {{ .Values.stack.model }} + base_url: {{ .Values.stack.endpointUrl }} + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: {{ .Values.stack.model }} + data: + type: shared_prefix + shared_prefix: + # Number of distinct shared prefixes + num_groups: {{ .Values.experiment.profile.shared_prefix.num_groups }} + # Number of unique questions per shared prefix + num_prompts_per_group: {{ .Values.experiment.profile.shared_prefix.num_prompts_per_group }} + # Length of the shared prefix (in tokens) + system_prompt_len: {{ .Values.experiment.profile.shared_prefix.system_prompt_len }} + # Length of the unique question part (in tokens) + question_len: {{ .Values.experiment.profile.shared_prefix.question_len }} + # Target length for the model's generated output (in tokens) + output_len: {{ .Values.experiment.profile.shared_prefix.output_len }} + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + storage: + local_storage: + path: /workspace + summarization_synthetic.yaml: |- + load: + type: constant + stages: + - rate: 1 + duration: 120 + - rate: 2 + duration: 120 + - rate: 4 + duration: 120 + - rate: 8 + duration: 120 + api: + type: completion + streaming: true + server: + type: vllm + model_name: {{ .Values.stack.model }} + base_url: {{ .Values.stack.endpointUrl }} + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: {{ .Values.stack.model }} + data: + type: random + input_distribution: + min: 10 # min length of the synthetic prompts + max: 4096 # max length of the synthetic prompts + mean: 2048 # mean length of the synthetic prompts + std: 1024 # standard deviation of the length of the synthetic prompts + total_count: 1000 # total number 
of prompts to generate to fit the above mentioned distribution constraints + output_distribution: + min: 10 # min length of the output to be generated + max: 512 # max length of the output to be generated + mean: 256 # mean length of the output to be generated + std: 128 # standard deviation of the length of the output to be generated + total_count: 1000 # total number of output lengths to generate to fit the above mentioned distribution constraints + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + storage: + local_storage: + path: /workspace diff --git a/charts/harness/values.yaml b/charts/harness/values.yaml new file mode 100644 index 00000000..4c2fd1ac --- /dev/null +++ b/charts/harness/values.yaml @@ -0,0 +1,38 @@ +harness: + type: inference-perf + resultsPVC: workspace-pvc + image: + repository: ghcr.io/llm-d/llm-d-benchmark + tag: v0.3.0rc2 + pullPolicy: Always + extraEnv: [] + args: ["llm-d-benchmark.sh"] + resources: + limits: + cpu: 16 + memory: 32Gi + requests: + cpu: 16 + memory: 32Gi + +stack: + type: "llm-d" + # model: + deployMethod: modelservice + # name + # endpointUrl + +experiment: + # identifier: + profile: + name: sanity_random.yaml + shared_prefix: + num_groups: 32 + num_prompts_per_group: 32 + system_prompt_len: 2048 + question_len: 256 + output_len: 256 + +nameOverride: "" +fullnameOverride: "" + From 0516f2bf56d6ed8fbf2fef2e35aedd9f963a7852 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 2 Oct 2025 10:53:40 -0400 Subject: [PATCH 05/44] model-download chart Signed-off-by: Michael Kalantar --- charts/model-download/.helmignore | 24 ++++++++++++ charts/model-download/Chart.yaml | 40 +++++++++++++++++++ charts/model-download/templates/_helpers.tpl | 31 +++++++++++++++ charts/model-download/templates/job.yaml | 41 ++++++++++++++++++++ charts/model-download/templates/pvc.yaml | 14 +++++++ charts/model-download/values.yaml | 8 ++++ 6 files changed, 158 insertions(+) create mode 100644 charts/model-download/.helmignore create mode 100644 charts/model-download/Chart.yaml create mode 100644 charts/model-download/templates/_helpers.tpl create mode 100644 charts/model-download/templates/job.yaml create mode 100644 charts/model-download/templates/pvc.yaml create mode 100644 charts/model-download/values.yaml diff --git a/charts/model-download/.helmignore b/charts/model-download/.helmignore new file mode 100644 index 00000000..898df488 --- /dev/null +++ b/charts/model-download/.helmignore @@ -0,0 +1,24 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ + diff --git a/charts/model-download/Chart.yaml b/charts/model-download/Chart.yaml new file mode 100644 index 00000000..04da0386 --- /dev/null +++ b/charts/model-download/Chart.yaml @@ -0,0 +1,40 @@ +apiVersion: v2 +name: llm-d-benchark +description: A Helm chart for model download + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. 
They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: "v0.0.1" + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "v0.3.0" + +maintainers: + - name: "Michael Kalantar" + email: "kalantar@us.ibm.com" + url: "https://github.com/kalantar" + +sources: + - https://github.com/llm-d/llm-d-benchmark + +# dependencies: +# - name: common +# repository: https://charts.bitnami.com/bitnami +# tags: +# - bitnami-common +# version: "2.27.0" + diff --git a/charts/model-download/templates/_helpers.tpl b/charts/model-download/templates/_helpers.tpl new file mode 100644 index 00000000..2d518662 --- /dev/null +++ b/charts/model-download/templates/_helpers.tpl @@ -0,0 +1,31 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "download.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + + +{{/* +Create chart name and version as used by the chart label. +Truncated to 63 characrters because Kubernetes label values are limited to this +*/}} +{{- define "download.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create common labels for the resources managed by this chart. +*/}} +{{- define "dowload.labels" -}} +helm.sh/chart: {{ include "download.chart" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{- define "download.sanitizeString" -}} +{{- $input := . | lower | replace "." "-" | replace "/" "-" -}} +{{- $input -}} +{{- end -}} \ No newline at end of file diff --git a/charts/model-download/templates/job.yaml b/charts/model-download/templates/job.yaml new file mode 100644 index 00000000..590bb3a2 --- /dev/null +++ b/charts/model-download/templates/job.yaml @@ -0,0 +1,41 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "download.name" . 
}}-job +spec: + template: + spec: + containers: + - name: downloader + image: python:3.10 + command: ["/bin/sh", "-c"] + args: + - mkdir -p "\${MOUNT_PATH}/\${MODEL_PATH}" && \ + pip install huggingface_hub && \ + export PATH="\${PATH}:\${HOME}/.local/bin" && \ + hf auth login --token "${HF_TOKEN}" && \ + hf download "\${HF_MODEL_ID}" --local-dir "/cache/\${MODEL_PATH}" + env: + - name: MODEL_PATH + value: models/{{ required "ERROR .Values.hf_model must be set" .Values.hf_model }} + - name: HF_MODEL_ID + value: {{ .Values.hf_model }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Values.hf_secret }} + key: HF_TOKEN + - name: HF_HOME + value: /tmp/huggingface + - name: HOME + value: /tmp + - name: MOUNT_PATH + value: /cache + volumeMounts: + - name: model-cache + mountPath: /cache + restartPolicy: OnFailure + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: {{ .Values.pvc.name }} \ No newline at end of file diff --git a/charts/model-download/templates/pvc.yaml b/charts/model-download/templates/pvc.yaml new file mode 100644 index 00000000..3c367832 --- /dev/null +++ b/charts/model-download/templates/pvc.yaml @@ -0,0 +1,14 @@ +{{- if .Values.pvc.create }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.pvc.name }} +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: {{ .Values.pvc.size }} + storageClassName: {{ .Values.pvc.storageClass }} + volumeMode: Filesystem +{{- end }} \ No newline at end of file diff --git a/charts/model-download/values.yaml b/charts/model-download/values.yaml new file mode 100644 index 00000000..f4ca639d --- /dev/null +++ b/charts/model-download/values.yaml @@ -0,0 +1,8 @@ +# hf_model: # required +hf_secret: hf-secret + +pvc: + name: model-pvc + create: false + size: 5Gi + storageClass: default From ae36f889941a9bf0994c70fa32eb36d1b208897e Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 2 Oct 2025 11:36:21 -0400 Subject: [PATCH 06/44] try without backslash Signed-off-by: Michael Kalantar --- charts/model-download/templates/job.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/charts/model-download/templates/job.yaml b/charts/model-download/templates/job.yaml index 590bb3a2..2b0edb76 100644 --- a/charts/model-download/templates/job.yaml +++ b/charts/model-download/templates/job.yaml @@ -10,11 +10,11 @@ spec: image: python:3.10 command: ["/bin/sh", "-c"] args: - - mkdir -p "\${MOUNT_PATH}/\${MODEL_PATH}" && \ + - mkdir -p "${MOUNT_PATH}/${MODEL_PATH}" && \ pip install huggingface_hub && \ - export PATH="\${PATH}:\${HOME}/.local/bin" && \ + export PATH="${PATH}:${HOME}/.local/bin" && \ hf auth login --token "${HF_TOKEN}" && \ - hf download "\${HF_MODEL_ID}" --local-dir "/cache/\${MODEL_PATH}" + hf download "${HF_MODEL_ID}" --local-dir "/cache/${MODEL_PATH}" env: - name: MODEL_PATH value: models/{{ required "ERROR .Values.hf_model must be set" .Values.hf_model }} From aeec0007790dbef4c4bc328c52dd601a80e866d8 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 2 Oct 2025 11:42:20 -0400 Subject: [PATCH 07/44] try without backslash Signed-off-by: Michael Kalantar --- charts/model-download/templates/job.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/model-download/templates/job.yaml b/charts/model-download/templates/job.yaml index 2b0edb76..e25b924b 100644 --- a/charts/model-download/templates/job.yaml +++ b/charts/model-download/templates/job.yaml @@ -10,11 +10,11 @@ spec: image: python:3.10 command: 
["/bin/sh", "-c"] args: - - mkdir -p "${MOUNT_PATH}/${MODEL_PATH}" && \ + - mkdir -p "\${MOUNT_PATH}/\${MODEL_PATH}" && \ pip install huggingface_hub && \ - export PATH="${PATH}:${HOME}/.local/bin" && \ - hf auth login --token "${HF_TOKEN}" && \ - hf download "${HF_MODEL_ID}" --local-dir "/cache/${MODEL_PATH}" + export PATH="\${PATH}:\${HOME}/.local/bin" && \ + hf auth login --token "\${HF_TOKEN}" && \ + hf download "\${HF_MODEL_ID}" --local-dir "/cache/\${MODEL_PATH}" env: - name: MODEL_PATH value: models/{{ required "ERROR .Values.hf_model must be set" .Values.hf_model }} From 90473a594f54c0ffa712bdb1c548a2aec0731824 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 2 Oct 2025 13:15:08 -0400 Subject: [PATCH 08/44] restructure args Signed-off-by: Michael Kalantar --- charts/model-download/templates/job.yaml | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/charts/model-download/templates/job.yaml b/charts/model-download/templates/job.yaml index e25b924b..1ad41655 100644 --- a/charts/model-download/templates/job.yaml +++ b/charts/model-download/templates/job.yaml @@ -10,11 +10,12 @@ spec: image: python:3.10 command: ["/bin/sh", "-c"] args: - - mkdir -p "\${MOUNT_PATH}/\${MODEL_PATH}" && \ - pip install huggingface_hub && \ - export PATH="\${PATH}:\${HOME}/.local/bin" && \ - hf auth login --token "\${HF_TOKEN}" && \ - hf download "\${HF_MODEL_ID}" --local-dir "/cache/\${MODEL_PATH}" + - > + export PATH="${PATH}:${HOME}/.local/bin"; + mkdir -p "${MOUNT_PATH}/${MODEL_PATH}"; + python -m pip install huggingface_hub; + hf auth login --token "${HF_TOKEN}"; + hf download "${HF_MODEL_ID}" --local-dir "/cache/${MODEL_PATH}" env: - name: MODEL_PATH value: models/{{ required "ERROR .Values.hf_model must be set" .Values.hf_model }} From b68c2f7af9376ea6f8b457d9d8a5182ac022cb5e Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 2 Oct 2025 15:09:30 -0400 Subject: [PATCH 09/44] hack Signed-off-by: Michael Kalantar --- charts/harness/templates/harness-pod.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/charts/harness/templates/harness-pod.yaml b/charts/harness/templates/harness-pod.yaml index 56ce99d4..decfa9cb 100644 --- a/charts/harness/templates/harness-pod.yaml +++ b/charts/harness/templates/harness-pod.yaml @@ -46,11 +46,31 @@ spec: value: "{{ .Values.stack.deployMethod }}" - name: LLMDBENCH_MAGIC_ENVAR value: "harness_pod" + + - name: LLMDBENCH_LLMD_IMAGE_REGISTRY + value: ghcr.io + - name: LLMDBENCH_LLMD_IMAGE_REPO + value: llm-d + - name: LLMDBENCH_LLMD_IMAGE_NAME + value: llm-d-benchmark + - name: LLMDBENCH_LLMD_IMAGE_TAG + value: {{ .Values.harness.image.tag }} {{- with .Values.harness.extraEnv }} - name: {{ .name }} value: "{{ .value }}" {{- end }} + - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS + value: 0 + - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS + value: 2 + - name: LLMDBENCH_VLLM_COMMON_AFFINITY + value: "nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3" + - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM + value: 4 + - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM + value: 1 + # TBD add_env_vars_to_pod $LLMDBENCH_CONTROL_ENV_VAR_LIST_TO_POD - name: LLMDBENCH_DEPLOY_CURRENT_MODELID value: "{{ .Values.stack.model }}" From 37893a1d6b2672742c258cbaba0e8878a41b270c Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 2 Oct 2025 15:13:14 -0400 Subject: [PATCH 10/44] hack Signed-off-by: Michael Kalantar --- charts/harness/templates/harness-pod.yaml | 8 ++++---- 1 file changed, 4 
insertions(+), 4 deletions(-) diff --git a/charts/harness/templates/harness-pod.yaml b/charts/harness/templates/harness-pod.yaml index decfa9cb..1eb938f6 100644 --- a/charts/harness/templates/harness-pod.yaml +++ b/charts/harness/templates/harness-pod.yaml @@ -61,15 +61,15 @@ spec: {{- end }} - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS - value: 0 + value: "0" - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS - value: 2 + value: "2" - name: LLMDBENCH_VLLM_COMMON_AFFINITY value: "nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3" - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM - value: 4 + value: "4" - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM - value: 1 + value: "1" # TBD add_env_vars_to_pod $LLMDBENCH_CONTROL_ENV_VAR_LIST_TO_POD - name: LLMDBENCH_DEPLOY_CURRENT_MODELID From f9e1372ef9d9634590ca2ae98d33f96bda684c3b Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 2 Oct 2025 15:55:33 -0400 Subject: [PATCH 11/44] extend hack Signed-off-by: Michael Kalantar --- charts/harness/templates/harness-pod.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/charts/harness/templates/harness-pod.yaml b/charts/harness/templates/harness-pod.yaml index 1eb938f6..2c87e4d9 100644 --- a/charts/harness/templates/harness-pod.yaml +++ b/charts/harness/templates/harness-pod.yaml @@ -70,6 +70,10 @@ spec: value: "4" - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM value: "1" + - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM + value: "1" + - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM + value: "1" # TBD add_env_vars_to_pod $LLMDBENCH_CONTROL_ENV_VAR_LIST_TO_POD - name: LLMDBENCH_DEPLOY_CURRENT_MODELID From 91c6c2f54665379d34b99f1c287a8b16f2aebb00 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 2 Oct 2025 16:35:30 -0400 Subject: [PATCH 12/44] more image configurability Signed-off-by: Michael Kalantar --- charts/harness/templates/harness-pod.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/harness/templates/harness-pod.yaml b/charts/harness/templates/harness-pod.yaml index 2c87e4d9..91456252 100644 --- a/charts/harness/templates/harness-pod.yaml +++ b/charts/harness/templates/harness-pod.yaml @@ -8,7 +8,7 @@ spec: serviceAccountName: {{ include "harness.name" . 
}}-runner containers: - name: harness - image: "{{ .Values.harness.image.repository }}:{{ .Values.harness.image.tag }}" + image: "{{ .Values.harness.image.registry }}/{{ .Values.harness.image.repository }}/{{ .Values.harness.image.name }}:{{ .Values.harness.image.tag }}" imagePullPolicy: {{ .Values.harness.image.pullPolicy }} securityContext: runAsUser: 0 @@ -48,11 +48,11 @@ spec: value: "harness_pod" - name: LLMDBENCH_LLMD_IMAGE_REGISTRY - value: ghcr.io + value: {{ .Values.harness.image.registry }} - name: LLMDBENCH_LLMD_IMAGE_REPO - value: llm-d + value: {{ .Values.harness.image.repository }} - name: LLMDBENCH_LLMD_IMAGE_NAME - value: llm-d-benchmark + value: {{ .Values.harness.image.name }} - name: LLMDBENCH_LLMD_IMAGE_TAG value: {{ .Values.harness.image.tag }} {{- with .Values.harness.extraEnv }} From cde15493826f96d7038e671115ef5642e8f493f3 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 2 Oct 2025 17:42:12 -0400 Subject: [PATCH 13/44] quote Signed-off-by: Michael Kalantar --- charts/harness/templates/harness-pod.yaml | 2 +- charts/harness/values.yaml | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/charts/harness/templates/harness-pod.yaml b/charts/harness/templates/harness-pod.yaml index 91456252..4d96b73a 100644 --- a/charts/harness/templates/harness-pod.yaml +++ b/charts/harness/templates/harness-pod.yaml @@ -54,7 +54,7 @@ spec: - name: LLMDBENCH_LLMD_IMAGE_NAME value: {{ .Values.harness.image.name }} - name: LLMDBENCH_LLMD_IMAGE_TAG - value: {{ .Values.harness.image.tag }} + value: {{ .Values.harness.image.tag | quote }} {{- with .Values.harness.extraEnv }} - name: {{ .name }} value: "{{ .value }}" diff --git a/charts/harness/values.yaml b/charts/harness/values.yaml index 4c2fd1ac..faae6341 100644 --- a/charts/harness/values.yaml +++ b/charts/harness/values.yaml @@ -2,7 +2,9 @@ harness: type: inference-perf resultsPVC: workspace-pvc image: - repository: ghcr.io/llm-d/llm-d-benchmark + registry: ghcr.io + repository: llm-d + name: llm-d-benchmark tag: v0.3.0rc2 pullPolicy: Always extraEnv: [] From 9792fa5b21aaff5e8ce763bca0575d46beb72e61 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 2 Oct 2025 18:14:46 -0400 Subject: [PATCH 14/44] MODELID Signed-off-by: Michael Kalantar --- charts/harness/templates/harness-pod.yaml | 9 +++++---- charts/harness/templates/harness-role.yaml | 2 +- charts/harness/templates/harness-rolebinding.yaml | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/charts/harness/templates/harness-pod.yaml b/charts/harness/templates/harness-pod.yaml index 4d96b73a..014c265c 100644 --- a/charts/harness/templates/harness-pod.yaml +++ b/charts/harness/templates/harness-pod.yaml @@ -60,6 +60,11 @@ spec: value: "{{ .value }}" {{- end }} + # TBD add_env_vars_to_pod $LLMDBENCH_CONTROL_ENV_VAR_LIST_TO_POD + - name: LLMDBENCH_DEPLOY_CURRENT_MODEL + value: "{{ .Values.stack.model }}" + - name: LLMDBENCH_DEPLOY_CURRENT_MODELID + value: {{ include "harness.sanitizeString" .Values.stack.model | quote }} - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS value: "0" - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS @@ -75,10 +80,6 @@ spec: - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM value: "1" - # TBD add_env_vars_to_pod $LLMDBENCH_CONTROL_ENV_VAR_LIST_TO_POD - - name: LLMDBENCH_DEPLOY_CURRENT_MODELID - value: "{{ .Values.stack.model }}" - - name: HF_TOKEN_SECRET value: "hf-secret" - name: HUGGING_FACE_HUB_TOKEN diff --git a/charts/harness/templates/harness-role.yaml 
b/charts/harness/templates/harness-role.yaml index 9ae95bb6..7aebcaa4 100644 --- a/charts/harness/templates/harness-role.yaml +++ b/charts/harness/templates/harness-role.yaml @@ -16,4 +16,4 @@ rules: verbs: ["get", "list", "watch"] - apiGroups: [""] resources: ["pods/log"] - verbs: ["get"] + verbs: ["get"] \ No newline at end of file diff --git a/charts/harness/templates/harness-rolebinding.yaml b/charts/harness/templates/harness-rolebinding.yaml index 202ab4ff..ec657601 100644 --- a/charts/harness/templates/harness-rolebinding.yaml +++ b/charts/harness/templates/harness-rolebinding.yaml @@ -24,4 +24,4 @@ subjects: roleRef: kind: ClusterRole name: system:openshift:scc:restricted - apiGroup: rbac.authorization.k8s.io + apiGroup: rbac.authorization.k8s.io \ No newline at end of file From 24c8fe1e19906e64f736bc2f7825f44a1ced75f1 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Fri, 3 Oct 2025 13:51:50 -0400 Subject: [PATCH 15/44] inital pipeline Signed-off-by: Michael Kalantar --- tekton-poc/README.md | 89 ++++ tekton-poc/pipeline/experiment-task.yaml | 429 ++++++++++++++++++++ tekton-poc/pipeline/experiment-taskrun.yaml | 25 ++ tekton-poc/pipeline/pipelinerun-matrix.yaml | 33 ++ tekton-poc/pipeline/roles.yaml | 122 ++++++ tekton-poc/pipeline/stepactions.yaml | 317 +++++++++++++++ 6 files changed, 1015 insertions(+) create mode 100644 tekton-poc/README.md create mode 100644 tekton-poc/pipeline/experiment-task.yaml create mode 100644 tekton-poc/pipeline/experiment-taskrun.yaml create mode 100644 tekton-poc/pipeline/pipelinerun-matrix.yaml create mode 100644 tekton-poc/pipeline/roles.yaml create mode 100644 tekton-poc/pipeline/stepactions.yaml diff --git a/tekton-poc/README.md b/tekton-poc/README.md new file mode 100644 index 00000000..a75d6b62 --- /dev/null +++ b/tekton-poc/README.md @@ -0,0 +1,89 @@ +# Benchmarking with Tekton + +This folder contains a proof of concept + +## Tekton Basics +A **Pipeline** is set of **Tasks**. Tasks run in parallel. The execution flow can be controlled implicitly (via one task consume a result of another) or explcitly with mechanisms like `runAfter`, `when` and `finally`. +A **Task** is a sequence of **Steps**. Steps run sequentially. The step can programmatically determine to execute or skip. + +To execute a **Pipeline** create a **PipelineRun**, +an object that identifies: + - the Pipeline to execute and + - the values of any parameters + +Tekton creates a **TaskRun** for each Task in the Pipeline. +A TaskRun is an object that identifies: + - the Task and + - the values of any parameters (passed from the PipelineRun) + +The TaskRun is implemented by a Pod +Each Step is implemented by a Container in the Pod. + +## Supported Benchmarking Use Cases + +Given a matrix of factors and values, measure performance of a model over a matrix of factors/values +Factors may be model deployment related, such as: model, endpoint picker configuration, parallelism, ... +Factors may also be workload related, for example: question_len, output_len,workload_profile, ... + +This proof of concept currently implements a variation of the inference-scheduling [scenairo](https://github.com/llm-d/llm-d-benchmark/blob/main/scenarios/guides/inference-scheduling.sh)/[experiment](https://github.com/llm-d/llm-d-benchmark/blob/main/experiments/inference-scheduling.yaml). + +## Approach + +A single Task measures performance over a single set of values from the factor/values matrix. This task implements steps: + +1. Create/prepare an experiment namespace +2. Deploy a Gateway +3. 
Configure GAIE +4. Download the model from HuggingFace to a PVC +5. Deploy the model +6. Run the workload for a single set of parameters +7. Upload the results to external storage (not yet implemented)\ +8. Delete the experiment namespace + +A PipelineRun is created that embeds a Pipeline containing one Task with a matrix of values for a set of factors. An example is `pipelinerun-matrix.yaml`. + +## Use + +1. Create a namespace, for example: $NAMESPACE and set to current context: + ```shell + kubectl create ns $NAMESPACE + kubectl config set-context --current --namespace $NAMESPACE + ``` +2. Deploy a secret `hf-secret` containing your HuggingFace token in the namespace. + ```shell + kubectl create secret generic hf-secret \ + --namespace ${NAMESPACE} \ + --from-literal="HF_TOKEN=${HF_TOKEN}" \ + --dry-run=client -o yaml | kubectl apply -f - + ``` +3. Give the task needed permissions + ```shell + kubectl apply -f pipeline/roles.yaml + ``` +4. Deploy the steps and tasks: + ```shell + kubectl apply -f pipeline/stepactions.yaml + kubectl apply -f pipeline/experiment-task.yaml + ``` +5. Run experiments (set the parameter `namespace` to $NAMESPACE): + ```shell + kubectl apply -f pipeline/pipelinerun-matrix.yaml + ``` + +See the TaskRun objects created: + +```shell +tkn tr list +``` + +See the logs for a TaskRun: + +```shell +tkn tr logs -f +``` + +## Cautions + +- be sure to set the namespace parameter in the pipeline run; this is where the pipeline runs and is the base of the name for each experiment +- the upload of data is not yet implemented +- there are hardcoded assumptions/values about the use case in several places; these will be removed as more use cases are explored diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml new file mode 100644 index 00000000..64a311c3 --- /dev/null +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -0,0 +1,429 @@ +apiVersion: tekton.dev/v1 +kind: Task +metadata: + name: experiment +spec: + description: > + Runs an llm-d-benchmark experiment. 
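+  # Illustrative sketch only: the params below are the experiment "factors".
+  # A PipelineRun (see pipelinerun-matrix.yaml) can sweep them with a Tekton
+  # matrix; the factor values shown here are assumptions, not shipped defaults:
+  #
+  #   matrix:
+  #     params:
+  #       - name: question_len
+  #         value: ["256", "1024"]
+  #       - name: output_len
+  #         value: ["128", "512"]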
+ + params: + + - name: question_len + type: string + - name: output_len + type: string + + - name: namespace + type: string + default: kalantar-llmd + description: Target namespace + + - name: model-id + type: string + default: "meta-llama/Llama-3.2-1B-Instruct" + - name: inferencePort + default: 8000 + + - name: experimentBaseUrl + type: string + - name: experimentName + type: string + default: "experiment" + + - name: workspace-pvc-name + type: string + default: workspace-pvc + - name: workspace-pvc-size + type: string + default: 20Gi + - name: workspace-storage-class + type: string + default: ocs-storagecluster-cephfs + + - name: model-pvc-name + type: string + default: model-pvc + - name: model-pvc-size + type: string + default: 300Gi + - name: model-storage-class + type: string + default: ocs-storagecluster-cephfs + + - name: download-job-name + type: string + default: download-job + + - default: llm-d-infra + description: Name of the Helm repository for the Gateway + name: gatewayRepoName + type: string + - default: https://llm-d-incubation.github.io/llm-d-infra/ + description: URL of the Helm repository for the Gateway + name: gatewayRepoUrl + type: string + - name: gatewayChartVersion + type: string + default: "" + description: Optional gateway chart version (used with --version) + + - name: gatewayExtraArgs + type: string + default: "" + description: Optional extra args for the gateway (to append to 'helm upgrade --install') + + - name: gaieChartVersion + type: string + default: "v0.5.1" + description: Optional GAIE chart version (used with --version) + + - name: gaieExtraArgs + type: string + default: "" + description: Optional extra args for GAIE (to append to 'helm upgrade --install') + + - default: llm-d-modelservice + description: Name of the Helm repository for the model engine + name: msRepoName + type: string + - default: https://llm-d-incubation.github.io/llm-d-modelservice/ + description: URL of the Helm repository for the model engine + name: msRepoUrl + type: string + - name: msChartVersion + type: string + default: "" + description: Optional modelservice chart version (used with --version) + + - name: msExtraArgs + type: string + default: "" + description: Optional extra args for the model engine (to append to 'helm upgrade --install') + + - name: modelWaitTimeout + type: string + default: 900 + + - name: harnessName + type: string + default: inference-perf + - name: harnessProfile + type: string + default: sanity_random.yaml + - name: stackType + type: string + default: lld-d + - name: experimentIDBase + type: string + default: experiment + + - name: dry-run + type: string + default: "false" + + steps: + - name: log-start + image: alpine:3.20 + script: | + #!/bin/sh + echo "🔄 Starting sweep step ..." 
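+    # The next step provisions a per-experiment namespace named
+    # "$(params.namespace)-$(context.taskRun.name)", copies the hf-secret into
+    # it, and (on OpenShift) grants the anyuid SCC to the helm-installer
+    # service account, so each matrix combination runs in isolation and can be
+    # removed by deleting its namespace.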
+ + - name: prepare-namespace + image: quay.io/openshift/origin-cli:latest + script: | + #!/bin/sh + + NAMESPACE="$(params.namespace)-$(context.taskRun.name)" + DRY_RUN="$(params.dry-run)" + + if [ "${DRY_RUN}" = "true" ]; then + echo ">> skipping" + exit 0 + fi + + kubectl create namespace ${NAMESPACE} \ + --dry-run=client -o yaml | kubectl apply -f - + + # HF_TOKEN=$( + HF_TOKEN=$( + kubectl get secret hf-secret \ + --namespace "$(context.taskRun.namespace)" \ + -o jsonpath='{.data.HF_TOKEN}' \ + | tr -d '\n' \ + | base64 -d + ) + # kubectl --namespace $(context.taskRun.namespace) get secret hf-secret -o jsonpath='{.data.HF_TOKEN}' | tr -d '\n' | base64 -d) + kubectl create secret generic hf-secret \ + --namespace ${NAMESPACE} \ + --from-literal="HF_TOKEN=${HF_TOKEN}" \ + --dry-run=client -o yaml | kubectl apply -f - + + # TBD only if OpenShift + oc adm policy add-scc-to-user anyuid -z helm-installer -n ${NAMESPACE} + # oc adm policy add-scc-to-user privileged -z helm-installer -n ${NAMESPACE} + + - name: model-download + ref: + name: helm-upgrade-install + params: + # Location of helm chart + - name: git_url + value: "https://github.com/kalantar/llm-d-benchmark" + - name: git_revision + value: "tekton-poc" + - name: checkout_dir + value: "/tmp/llm-d-benchmark" + + # Helm arguments + - name: releaseName + value: $(params.experimentName)-download + - name: chart + value: /tmp/llm-d-benchmark/charts/model-download + - name: namespace + value: $(params.namespace)-$(context.taskRun.name) + - name: timeout + value: 15m + # - name: valuesYamlUrl + # value: "/tmp/llm-d-benchmark/charts/model-download/values.yaml" + - name: extraArgs + value: > + --set hf_model=$(params.model-id) + --set pvc.create=true + --set pvc.name=$(params.model-pvc-name) + --set pvc.size=$(params.model-pvc-size) + --set pvc.storageClass=$(params.model-storage-class) + + - name: dry-run + value: $(params.dry-run) + + - name: wait-for-download + image: alpine:3.20 + script : | + #!/bin/sh + echo "âŗ TBD: Wait for download job to complete" + + # TBD use tekton notion of workspace ?? 
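+    # One possible shape for that (a sketch, not wired in here): declare
+    #   workspaces:
+    #     - name: data
+    # on this Task, bind it to a PersistentVolumeClaim in the PipelineRun, and
+    # have steps reference $(workspaces.data.path) instead of this ad-hoc PVC.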
+ - name: create-workspace-pvc + ref: + name: create-rwx-pvc + params: + - name: name + value: $(params.workspace-pvc-name) + - name: namespace + value: $(params.namespace)-$(context.taskRun.name) + - name: size + value: $(params.workspace-pvc-size) + - name: storage-class + value: $(params.workspace-storage-class) + - name: dry-run + value: $(params.dry-run) + + - name: gateway + ref: + name: helm-upgrade-install + params: + - name: releaseName + value: $(params.experimentName)-gateway + - name: chart + value: llm-d-infra/llm-d-infra + - name: repoName + value: llm-d-infra + - name: repoUrl + value: https://llm-d-incubation.github.io/llm-d-infra/ + + - name: namespace + value: $(params.namespace)-$(context.taskRun.name) + - name: timeout + value: 15m + - name: valuesYamlUrl + value: "$(params.experimentBaseUrl)/gateway-values.yaml" + + - name: dry-run + value: $(params.dry-run) + + - name: gaie + ref: + name: helm-upgrade-install + params: + - name: releaseName + value: $(params.experimentName)-gaie + - name: chart + value: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + - name: version + value: $(params.gaieChartVersion) + + - name: namespace + value: $(params.namespace)-$(context.taskRun.name) + - name: timeout + value: 15m + - name: valuesYamlUrl + value: "$(params.experimentBaseUrl)/gaie-values.yaml" + - name: extraArgs + value: "--set inferenceExtension.pluginsConfigFile=$(params.gaiePluginConfig)" + + - name: dry-run + value: $(params.dry-run) + + - name: model-engine + ref: + name: helm-upgrade-install + params: + - name: releaseName + value: $(params.experimentName)-ms + - name: chart + value: llm-d-modelservice/llm-d-modelservice + - name: repoName + value: llm-d-modelservice + - name: repoUrl + value: https://llm-d-incubation.github.io/llm-d-modelservice/ + + - name: namespace + value: $(params.namespace)-$(context.taskRun.name) + - name: timeout + value: 15m + - name: valuesYamlUrl + value: "$(params.experimentBaseUrl)/ms-values.yaml" + + - name: dry-run + value: $(params.dry-run) + + - name: wait-for-model + image: alpine/kubectl:1.34.1 + script: | + #!/bin/sh + + if [ "$(params.dry-run)" = "true" ]; then + echo ">> skipping" + exit 0 + fi + NAMESPACE="$(params.namespace)-$(context.taskRun.name)" + MODEL_ID="$(params.model-id)" + MODEL_LABEL=$(echo "$MODEL_ID" | tr '[:upper:]' '[:lower:]' | sed 's/[./]/-/g') + MODEL_START_TIMEOUT="$(params.modelWaitTimeout)" + + echo "âŗ Waiting for pods serving model ${MODEL_ID} to be 'Running'" + echo "Model label = ${MODEL_LABEL}" + + kubectl --namespace ${NAMESPACE} \ + wait pod \ + -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=decode \ + --for=create \ + --timeout=${MODEL_START_TIMEOUT}s + echo "✅ (decode) pods serving model ${MODEL_ID} created" + + # kubectl --namespace ${NAMESPACE} \ + # wait pod \ + # -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=prefill \ + # --for=create \ + # --timeout=${MODEL_START_TIMEOUT}s + # echo "✅ prefill pods serving model ${MODEL_ID} created" + + kubectl --namespace ${NAMESPACE} \ + wait pod \ + -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=decode \ + --for=condition=Ready=True \ + --timeout=${MODEL_START_TIMEOUT}s + echo "✅ (decode) pods serving model ${MODEL_ID} ready" + + # kubectl --namespace ${NAMESPACE} \ + # wait pod \ + # -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=prefill \ + # --for=condition=Ready=True \ + # --timeout=${MODEL_START_TIMEOUT}s + # echo "✅ prefill pods serving model ${MODEL_ID} ready" + + - name: workload + ref: + name: helm-upgrade-install + 
params: + # Location of helm chart + - name: git_url + value: "https://github.com/kalantar/llm-d-benchmark" + - name: git_revision + value: "tekton-poc" + - name: checkout_dir + value: "/tmp/llm-d-benchmark" + + # Helm arguments + - name: releaseName + value: $(params.experimentName)-harness + - name: chart + value: /tmp/llm-d-benchmark/charts/harness + - name: namespace + value: $(params.namespace)-$(context.taskRun.name) + - name: timeout + value: 15m + # - name: valuesYamlUrl + # value: "/tmp/llm-d-benchmark/charts/harness/values.yaml" + - name: extraArgs + value: > + --set harness.image.registry=quay.io + --set harness.image.repository=namasluk + --set harness.image.name=llm-d-benchmark + --set harness.image.tag=251002.1 + --set experiment.profile.name=$(params.harnessProfile) + --set experiment.profile.shared_prefix.question_len=$(params.question_len) + --set experiment.profile.shared_prefix.output_len=$(params.output_len) + --set experiment.identifier=experiment-DATE + --set stack.model=$(params.model-id) + --set stack.name=$(context.taskRun.name) + --set stack.endpointUrl='http://experiment-gateway-inference-gateway:80' + + - name: dry-run + value: $(params.dry-run) + + - name: wait-for-workload + image: alpine/kubectl:1.34.1 + script : | + #!/bin/sh + + if [ "$(params.dry-run)" = "true" ]; then + echo ">> skipping" + exit 0 + fi + + NAMESPACE="$(params.namespace)-$(context.taskRun.name)" + HARNESS_NAME="$(params.harnessName)" + + echo "âŗ Waiting for pod ${HARNESS_NAME}-launcher to complete..." + + while true; do + STATUS=$(kubectl --namespace ${NAMESPACE} get pod ${HARNESS_NAME}-launcher -o jsonpath='{.status.phase}') + if [ "$STATUS" = "Succeeded" ] || [ "$STATUS" = "Failed" ]; then + echo "Pod completed with status: $STATUS" + break + fi + echo "âŗ Still waiting for pod to complete..." + sleep 5 + done + + echo "✅ workload completed" + + - name: upload-results + image: alpine:3.20 + script : | + #!/bin/sh + echo "🚚 TBD: Upload results" + + - name: delete-namespace + image: alpine/helm:3.14.0 + script : | + #!/bin/sh + + if [ "$(params.dry-run)" = "true" ]; then + echo ">> skipping" + exit 0 + fi + + NAMESPACE="$(params.namespace)-$(context.taskRun.name)" + + # helm delete --namespace ${NAMESPACE} $(params.experimentName)-harness + # kubectl delete namespace ${NAMESPACE} + + echo "✅ workload pod deleted" + + - name: log-completion + image: alpine:3.20 + script: | + #!/bin/sh + echo "✅ Sweep step complete." 
diff --git a/tekton-poc/pipeline/experiment-taskrun.yaml b/tekton-poc/pipeline/experiment-taskrun.yaml new file mode 100644 index 00000000..86fca127 --- /dev/null +++ b/tekton-poc/pipeline/experiment-taskrun.yaml @@ -0,0 +1,25 @@ +apiVersion: tekton.dev/v1 +kind: TaskRun +metadata: + name: experiment-run +spec: + serviceAccountName: helm-installer + taskRef: + name: experiment + params: + - name: namespace + value: kalantar + - name: model-id + value: "Qwen/Qwen3-0.6B" + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + + - name: gaiePluginConfig + value: "inf-sche-queue.yaml" + - name: question_len + value: 100 + - name: output_len + value: 300 + diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml new file mode 100644 index 00000000..cd64e491 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -0,0 +1,33 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + pipelineSpec: + tasks: + - name: run-experiment + taskRef: + name: experiment + params: + - name: namespace + value: kalantar + - name: model-id + value: "Qwen/Qwen3-0.6B" + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + matrix: + params: + - name: gaiePluginConfig + value: + - "inf-sche-queue.yaml" + - name: question_len + value: + - "100" + - "300" + - name: output_len + value: + - "300" diff --git a/tekton-poc/pipeline/roles.yaml b/tekton-poc/pipeline/roles.yaml new file mode 100644 index 00000000..68a8aa2d --- /dev/null +++ b/tekton-poc/pipeline/roles.yaml @@ -0,0 +1,122 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: helm-installer + namespace: kalantar +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: helm-installer-clusterrole +rules: +- apiGroups: [""] + resources: ["pods", "services", "namespaces", "persistentvolumeclaims", "secrets", "configmaps", "serviceaccounts"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] +- apiGroups: ["apps"] + resources: ["deployments", "replicasets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings", "clusterroles", "clusterrolebindings"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["gateway.networking.k8s.io"] + resources: ["gateways", "httproutes"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["gateway.kgateway.dev"] + resources: ["gatewayparameters"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools", "inferencemodels"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["authentication.k8s.io"] + resources: ["tokenreviews"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["authorization.k8s.io"] + resources: ["subjectaccessreviews"] + verbs: ["create"] +- apiGroups: ["route.openshift.io"] + 
resources: ["routes"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["security.openshift.io"] + resources: ["securitycontextconstraints"] + resourceNames: ["anyuid", "restricted", "privileged"] + verbs: ["use"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: helm-installer-clusterrolebinding +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: helm-installer-clusterrole +subjects: +- kind: ServiceAccount + name: helm-installer + namespace: kalantar +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: helm-installer-restricted-scc +subjects: + - kind: ServiceAccount + name: helm-installer +roleRef: + kind: ClusterRole + name: system:openshift:scc:restricted + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: helm-access + namespace: kalantar +rules: +- apiGroups: [""] + resources: ["secrets", "configmaps", "services", "pods", "namespaces", "serviceaccounts", "persistentvolumeclaims"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] +- apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["apps"] + resources: ["deployments", "replicasets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["gateway.networking.k8s.io"] + resources: ["gateways", "httproutes"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["gateway.kgateway.dev"] + resources: ["gatewayparameters"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools", "inferencemodels"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["authentication.k8s.io"] + resources: ["tokenreviews", "subjectaccessreviews"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["route.openshift.io"] + resources: ["routes"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["security.openshift.io"] + resources: ["securitycontextconstraints"] + resourceNames: ["restricted"] + verbs: ["use"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: helm-access-binding + namespace: kalantar +subjects: +- kind: ServiceAccount + name: helm-installer + namespace: kalantar +roleRef: + kind: Role + name: helm-access + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/tekton-poc/pipeline/stepactions.yaml b/tekton-poc/pipeline/stepactions.yaml new file mode 100644 index 00000000..7a135024 --- /dev/null +++ b/tekton-poc/pipeline/stepactions.yaml @@ -0,0 +1,317 @@ +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: create-rwx-pvc +spec: + params: + - name: name + type: string + - name: namespace + type: string + - name: size + type: string + default: "1Gi" + - name: storage-class + type: string + default: "default" + + - name: dry-run + type: string + default: "false" + env: + - name: NAME + value: $(params.name) + # - name: TARGET_NAMESPACE_RESULT + # value: $(results.targetNamespace.path) + - name: NAMESPACE + value: $(params.namespace) + - name: SIZE + value: $(params.size) + - name: STORAGE_CLASS + value: $(params.storage-class) + - name: DRY_RUN + value: 
$(params.dry-run) + image: alpine/kubectl:1.34.1 + script: | + #!/bin/sh + if [ "${DRY_RUN}" = "true" ]; then + echo ">> skipping" + exit 0 + fi + + # NAMESPACE=$(cat $TARGET_NAMESPACE_RESULT) + + cat <- + Chart ref or name. Examples: + - "nginx" (used with repoName to form repoName/nginx) + - "bitnami/nginx" + - "oci://registry.example.com/myorg/mychart" + - name: version + type: string + default: "" + description: Optional chart version + + # Repo management (add/update) + - name: repoName + type: string + default: "" + description: If set with repoUrl, the action will 'helm repo add' and 'helm repo update' + - name: repoUrl + type: string + default: "" + description: Chart repository URL + - name: updateRepo + type: string + default: "true" + description: '"true" to run helm repo update' + + # Repo auth/TLS (optional) + - name: repoUsername + type: string + default: "" + - name: repoPassword + type: string + default: "" + - name: repoPassCredentials + type: string + default: "false" + description: '"true" to pass credentials to all domains' + - name: repoInsecureSkipTLSVerify + type: string + default: "false" + - name: repoCAFile + type: string + default: "" + - name: repoCertFile + type: string + default: "" + - name: repoKeyFile + type: string + default: "" + + # Install/upgrade behavior + - name: namespace + type: string + default: "default" + - name: createNamespace + type: string + default: "true" + - name: wait + type: string + default: "true" + - name: timeout + type: string + default: "10m0s" + + # Values and extra args + - name: valuesYaml + type: string + default: "" + - name: valuesYamlUrl + type: string + default: "" + - name: extraArgs + type: string + default: "" + + - name: dry-run + type: string + default: "false" + # ---------- Params -> env (StepActions don't interpolate $(params.*) directly in script) ---------- + env: + - name: GIT_URL + value: $(params.git_url) + - name: GIT_REVISION + value: $(params.git_revision) + - name: GIT_DEPTH + value: $(params.depth) + - name: CHECKOUT_DIR + value: $(params.checkout_dir) + + - name: HELM_RELEASE + value: "$(params.releaseName)" + - name: HELM_CHART + value: "$(params.chart)" + - name: HELM_VERSION + value: "$(params.version)" + + - name: HELM_REPO_NAME + value: "$(params.repoName)" + - name: HELM_REPO_URL + value: "$(params.repoUrl)" + - name: HELM_REPO_UPDATE + value: "$(params.updateRepo)" + + - name: HELM_REPO_USERNAME + value: "$(params.repoUsername)" + - name: HELM_REPO_PASSWORD + value: "$(params.repoPassword)" + - name: HELM_REPO_PASS_CREDS + value: "$(params.repoPassCredentials)" + - name: HELM_REPO_INSECURE + value: "$(params.repoInsecureSkipTLSVerify)" + - name: HELM_REPO_CA_FILE + value: "$(params.repoCAFile)" + - name: HELM_REPO_CERT_FILE + value: "$(params.repoCertFile)" + - name: HELM_REPO_KEY_FILE + value: "$(params.repoKeyFile)" + + - name: HELM_NAMESPACE + value: "$(params.namespace)" + - name: HELM_CREATE_NAMESPACE + value: "$(params.createNamespace)" + - name: HELM_WAIT + value: "$(params.wait)" + - name: HELM_TIMEOUT + value: "$(params.timeout)" + - name: HELM_VALUES_YAML + value: "$(params.valuesYaml)" + - name: HELM_VALUES_YAML_URL + value: "$(params.valuesYamlUrl)" + - name: HELM_EXTRA_ARGS + value: "$(params.extraArgs)" + + - name: DRY_RUN + value: $(params.dry-run) + + script: | + #!/usr/bin/env sh + set -eu + + if [ "${DRY_RUN}" = "true" ]; then + echo ">> skipping" + exit 0 + fi + + # if a GIT_URL is defined, clone the project; we will use helm chart from this + if [ -n "${GIT_URL:-}" ]; then + 
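+        # Shallow init+fetch of a single revision rather than 'git clone -b':
+        # GIT_REVISION may be a branch or tag (or, where the server allows it,
+        # a commit SHA), and only GIT_DEPTH commits are fetched.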
mkdir -p "$CHECKOUT_DIR" + rm -rf "$CHECKOUT_DIR/.git" || true + echo "Cloning $GIT_URL @ $GIT_REVISION into $CHECKOUT_DIR" + git init "$CHECKOUT_DIR" + git -C "$CHECKOUT_DIR" remote add origin "$GIT_URL" + git -C "$CHECKOUT_DIR" fetch --depth "$GIT_DEPTH" origin "$GIT_REVISION" + git -C "$CHECKOUT_DIR" checkout FETCH_HEAD + COMMIT=$(git -C "$CHECKOUT_DIR" rev-parse HEAD) + echo "Checked out commit: $COMMIT" + fi + + # Construct optional values file; values overrides url + VALUES_FLAG="" + if [ -n "${HELM_VALUES_YAML_URL:-}" ]; then + VALUES_FLAG="-f ${HELM_VALUES_YAML_URL}" + fi + + if [ -n "${HELM_VALUES_YAML:-}" ]; then + printf "%s" "${HELM_VALUES_YAML}" > /tmp/${HELM_RELEASE}-values.yaml + VALUES_FLAG="-f /tmp/${HELM_RELEASE}-values.yaml" + fi + + # Optional repo add (idempotent via --force-update) + if [ -n "${HELM_REPO_NAME:-}" ] && [ -n "${HELM_REPO_URL:-}" ]; then + REPO_ADD_FLAGS="--force-update" + [ -n "${HELM_REPO_USERNAME:-}" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --username ${HELM_REPO_USERNAME}" + [ -n "${HELM_REPO_PASSWORD:-}" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --password ${HELM_REPO_PASSWORD}" + [ "${HELM_REPO_PASS_CREDS:-false}" = "true" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --pass-credentials" + [ "${HELM_REPO_INSECURE:-false}" = "true" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --insecure-skip-tls-verify" + [ -n "${HELM_REPO_CA_FILE:-}" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --ca-file ${HELM_REPO_CA_FILE}" + [ -n "${HELM_REPO_CERT_FILE:-}" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --cert-file ${HELM_REPO_CERT_FILE}" + [ -n "${HELM_REPO_KEY_FILE:-}" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --key-file ${HELM_REPO_KEY_FILE}" + + echo "==> Adding/refreshing repo ${HELM_REPO_NAME} -> ${HELM_REPO_URL}" + # shellcheck disable=SC2086 + helm repo add bitnami https://charts.bitnami.com/bitnami + helm repo add "${HELM_REPO_NAME}" "${HELM_REPO_URL}" ${REPO_ADD_FLAGS} + # (helm repo add flags documented by Helm) # docs: https://helm.sh/docs/helm/helm_repo_add/ + + if [ "${HELM_REPO_UPDATE:-true}" = "true" ]; then + echo "==> Updating Helm repo cache" + # Update all repos for portability across Helm versions + helm repo update + fi + fi + + # Build common flags + CREATE_NS_FLAG=""; [ "${HELM_CREATE_NAMESPACE:-true}" = "true" ] && CREATE_NS_FLAG="--create-namespace" + WAIT_FLAG=""; [ "${HELM_WAIT:-true}" = "true" ] && WAIT_FLAG="--wait" + VERSION_FLAG=""; [ -n "${HELM_VERSION:-}" ] && VERSION_FLAG="--version ${HELM_VERSION}" + TIMEOUT_FLAG=""; [ -n "${HELM_TIMEOUT:-}" ] && TIMEOUT_FLAG="--timeout ${HELM_TIMEOUT}" + + # Decide final chart reference: + # - If user passed repoName and a bare chart, use repoName/chart. + # - If user passed repo/chart or oci://..., use as-is. 
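+      #   e.g. chart="nginx" with repoName="bitnami"       -> "bitnami/nginx"
+      #        chart="bitnami/nginx" or chart="oci://..."  -> used unchanged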
+ CHART_REF="${HELM_CHART}" + case "${HELM_CHART}" in + */*|oci://*) : ;; + *) if [ -n "${HELM_REPO_NAME:-}" ]; then CHART_REF="${HELM_REPO_NAME}/${HELM_CHART}"; fi ;; + esac + + if [ -n "${HELM_EXTRA_ARGS:-}" ]; then + HELM_EXTRA_ARGS=$(echo "$HELM_EXTRA_ARGS" | sed "s/DATE/$(date +%s)/g") + fi + + echo "==> helm upgrade --install ${HELM_RELEASE} ${CHART_REF} --namespace ${HELM_NAMESPACE} ${VERSION_FLAG} ${CREATE_NS_FLAG} ${WAIT_FLAG} ${TIMEOUT_FLAG} ${VALUES_FLAG} ${HELM_EXTRA_ARGS}" + # shellcheck disable=SC2086 + helm template \ + "${HELM_RELEASE}" "${CHART_REF}" \ + --namespace "${HELM_NAMESPACE}" \ + ${VERSION_FLAG} ${CREATE_NS_FLAG} ${WAIT_FLAG} ${TIMEOUT_FLAG} ${VALUES_FLAG} ${HELM_EXTRA_ARGS} \ + | kubectl --namespace "${HELM_NAMESPACE}" apply -f - + # helm upgrade --install \ + # "${HELM_RELEASE}" "${CHART_REF}" \ + # --namespace "${HELM_NAMESPACE}" \ + # ${VERSION_FLAG} ${CREATE_NS_FLAG} ${WAIT_FLAG} ${TIMEOUT_FLAG} ${VALUES_FLAG} ${HELM_EXTRA_ARGS} + From 5c6600770a0056536a80246ab33ab4931ef16fb3 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Fri, 3 Oct 2025 13:54:37 -0400 Subject: [PATCH 16/44] inital pipeline Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/pipelinerun-matrix.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml index cd64e491..9fa11494 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -12,7 +12,7 @@ spec: name: experiment params: - name: namespace - value: kalantar + value: CHANGE_ME - name: model-id value: "Qwen/Qwen3-0.6B" - name: experimentBaseUrl From abc04193e1b158e67fcc09dd6c50db7c97387a80 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Mon, 6 Oct 2025 10:53:29 -0400 Subject: [PATCH 17/44] utility to manage parallelism Signed-off-by: Michael Kalantar --- tekton-poc/README.md | 28 +++ tekton-poc/pipeline/pipelinerun-matrix.yaml | 6 + tekton-poc/utility/transform-pr-parallel.py | 256 ++++++++++++++++++++ 3 files changed, 290 insertions(+) create mode 100644 tekton-poc/utility/transform-pr-parallel.py diff --git a/tekton-poc/README.md b/tekton-poc/README.md index a75d6b62..6bacc50d 100644 --- a/tekton-poc/README.md +++ b/tekton-poc/README.md @@ -82,6 +82,34 @@ See the logs for a TaskRun: tkn tr logs -f ``` +## Managing Parallelism + +The sample `PipelineRun` (`pipeline/pipelinerun-matrix.yaml`) executes all the tasks in parallel. Depdending on the size of the matrix, this may require a large number of resources. +A _matrix_ based `Task` can be unrolled into multiple tasks to reduce the parallelism. +The utility script `utility/transform-pr-parallel.py` does this as follows: + +1. Unroll a single parameter into one `Task` per value. Each resulting Task defines a matrix over the remaining parameters. + + ```shell + python transform-pr.py pipelinerun-matrix.yaml --unroll gaiePluginConfig -o pr-unrolled.yaml + ``` + +2. Unroll multiple parameters into [their Cartesian product] Tasks. Each resulting Task defines a matrix over the remaining parameters. + + ```shell + python transform-pr.py pipelinerun-matrix.yaml --unroll gaiePluginConfig,question_len -o pr-unrolled-2.yaml + ``` + +3. Unroll all the parameters into [their Cartian product] Tasks. Allow _n_ to run at once. 
This can be done using a _barrier_ strategy or a _sliding_window_ strategy + + ```shell + # Barrier (default) + python transform-pr.py pipelinerun-matrix.yaml -n 3 -o pr-expanded-barrier.yaml + + # Sliding window + python transform-pr.py pipelinerun-matrix.yaml -n 3 --sliding-window -o pr-expanded-sliding.yaml + ``` + ## Cautions - be sure to set the namespace parameter in the pipeline run; this is where the pipeline runs and is the base of the name for each experiment diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml index 9fa11494..739e6d30 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -23,11 +23,17 @@ spec: params: - name: gaiePluginConfig value: + - "inf-sche-none.yaml" + - "inf-sche-prefix.yaml" + - "inf-sche-kv.yaml" - "inf-sche-queue.yaml" - name: question_len value: - "100" - "300" + - "1000" - name: output_len value: + - "100" - "300" + - "1000" diff --git a/tekton-poc/utility/transform-pr-parallel.py b/tekton-poc/utility/transform-pr-parallel.py new file mode 100644 index 00000000..426e0a58 --- /dev/null +++ b/tekton-poc/utility/transform-pr-parallel.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +import sys +import yaml +import itertools +import argparse +from copy import deepcopy + +def load_yaml_from_path_or_stdin(path: str): + """Load YAML from a file path or stdin if path == '-'.""" + if path == "-": + try: + return yaml.safe_load(sys.stdin) + except Exception as e: + raise ValueError(f"Failed to read YAML from stdin: {e}") + else: + try: + with open(path, "r") as f: + return yaml.safe_load(f) + except FileNotFoundError: + raise ValueError(f"Input file not found: {path}") + except Exception as e: + raise ValueError(f"Failed to read YAML from '{path}': {e}") + +def dump_yaml_to_path_or_stdout(data, path: str | None, announce_to_stderr: str | None = None): + """ + Write YAML to the given path. If path is None or '-', write to stdout with no extra noise. + If path is a real file, write there and optionally announce to stderr. + """ + if path is None or path == "-": + yaml.safe_dump(data, sys.stdout, sort_keys=False) + else: + with open(path, "w") as f: + yaml.safe_dump(data, f, sort_keys=False) + if announce_to_stderr: + print(announce_to_stderr, file=sys.stderr) + +# -------------------- EXPANSION (existing behavior) -------------------- # +def transform_matrix_to_batched_dict(original_yaml: dict, max_parallel: int, sliding_window: bool): + """ + Expand the matrix task into concrete tasks with runAfter enforcing either: + - barrier batching (default), or + - sliding-window (--sliding-window). 
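+
+    For example, with 4 matrix combinations and max_parallel=2, barrier mode
+    gives tasks 2 and 3 runAfter [task 0, task 1], while sliding-window mode
+    gives task 2 runAfter [task 0] and task 3 runAfter [task 1].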
+ """ + if max_parallel < 1: + raise ValueError("max_parallel must be >= 1") + + try: + pipeline_spec = original_yaml["spec"]["pipelineSpec"] + tasks = pipeline_spec["tasks"] + except Exception: + raise ValueError("Input YAML must contain spec.pipelineSpec.tasks") + + if not isinstance(tasks, list) or len(tasks) == 0: + raise ValueError("spec.pipelineSpec.tasks must be a non-empty list") + + base_task = deepcopy(tasks[0]) + base_name = base_task.get("name", "task") + + # matrix params + matrix_params = {} + for p in base_task.get("matrix", {}).get("params", []): + vals = p.get("value", []) + if not isinstance(vals, list): + vals = [vals] + matrix_params[p["name"]] = vals + + combos = list(itertools.product(*matrix_params.values())) if matrix_params else [tuple()] + total = len(combos) + + new_tasks = [] + for i, combo in enumerate(combos): + t = deepcopy(base_task) + t.pop("matrix", None) + t["name"] = f"{base_name}-{i}" + + t["params"] = deepcopy(base_task.get("params", [])) + [ + {"name": name, "value": value} + for name, value in zip(matrix_params.keys(), combo) + ] + + if sliding_window: + if i >= max_parallel: + t["runAfter"] = [f"{base_name}-{i - max_parallel}"] + else: + t.pop("runAfter", None) + else: + batch_index = i // max_parallel + if batch_index > 0: + prev_start = (batch_index - 1) * max_parallel + prev_end = min(batch_index * max_parallel, total) + t["runAfter"] = [f"{base_name}-{j}" for j in range(prev_start, prev_end)] + else: + t.pop("runAfter", None) + + new_tasks.append(t) + + new_pr = deepcopy(original_yaml) + new_pipeline_spec = deepcopy(pipeline_spec) + new_pipeline_spec["tasks"] = new_tasks + new_pr["spec"]["pipelineSpec"] = new_pipeline_spec + return new_pr + +# -------------------- UNROLLING (new behavior) -------------------- # +def transform_unroll_params_dict(original_yaml: dict, unroll_params: list[str]): + """ + Unroll (hoist) one or more matrix parameters into separate tasks. + + For given unroll_params (subset of the matrix param names): + - Create one task for each Cartesian product of the chosen params' values. + - In each task: + * Set the chosen params as fixed task 'params' (not matrix). + * Keep a 'matrix' of the remaining matrix params (if any). + - Do not add runAfter constraints (preserve original no-dependency behavior). + """ + if not unroll_params: + raise ValueError("unroll_params must be a non-empty list of parameter names") + + try: + pipeline_spec = original_yaml["spec"]["pipelineSpec"] + tasks = pipeline_spec["tasks"] + except Exception: + raise ValueError("Input YAML must contain spec.pipelineSpec.tasks") + + if not isinstance(tasks, list) or len(tasks) == 0: + raise ValueError("spec.pipelineSpec.tasks must be a non-empty list") + + base_task = deepcopy(tasks[0]) + base_name = base_task.get("name", "task") + + # Load matrix params preserving order as a list of (name, values) + matrix_params_list = [] + for p in base_task.get("matrix", {}).get("params", []): + vals = p.get("value", []) + if not isinstance(vals, list): + vals = [vals] + matrix_params_list.append((p["name"], vals)) + + if not matrix_params_list: + raise ValueError("Base task has no matrix to unroll") + + # Validate unroll params are present in matrix + matrix_names = [name for name, _ in matrix_params_list] + unknown = [n for n in unroll_params if n not in matrix_names] + if unknown: + raise ValueError(f"Unroll params not found in matrix: {unknown}. 
Available: {matrix_names}") + + # Split into "chosen" vs "remaining" + chosen = [(name, vals) for name, vals in matrix_params_list if name in unroll_params] + remaining = [(name, vals) for name, vals in matrix_params_list if name not in unroll_params] + + # Cartesian product over chosen + chosen_names = [name for name, _ in chosen] + chosen_values_lists = [vals for _, vals in chosen] + chosen_combos = list(itertools.product(*chosen_values_lists)) if chosen else [tuple()] + + new_tasks = [] + for i, combo in enumerate(chosen_combos): + t = deepcopy(base_task) + t["name"] = f"{base_name}-{i}" + + # Remove matrix entirely; we will rebuild it only with remaining params + t.pop("matrix", None) + + # Merge original params plus fixed chosen params for this task + t["params"] = deepcopy(base_task.get("params", [])) + [ + {"name": name, "value": value} + for name, value in zip(chosen_names, combo) + ] + + # Rebuild matrix from the remaining params (if any) + if remaining: + t["matrix"] = { + "params": [{"name": name, "value": vals} for name, vals in remaining] + } + else: + # Nothing remains; ensure no stray runAfter or matrix fields + t.pop("matrix", None) + + # Preserve lack of dependencies (no runAfter) unless the base had them explicitly + if "runAfter" in t: + # Typically matrix tasks don't carry runAfter; remove to keep parallelism by default + t.pop("runAfter", None) + + new_tasks.append(t) + + # Replace tasks with our new set + new_pr = deepcopy(original_yaml) + new_pipeline_spec = deepcopy(pipeline_spec) + new_pipeline_spec["tasks"] = new_tasks + new_pr["spec"]["pipelineSpec"] = new_pipeline_spec + return new_pr + +def main(): + parser = argparse.ArgumentParser( + description=( + "Tekton PipelineRun matrix transformer.\n" + "Default: expand the matrix to concrete tasks with barrier batching or sliding-window.\n" + "Use --unroll to split specified matrix params into separate tasks while keeping a reduced matrix." + ) + ) + parser.add_argument("input", help="Input PipelineRun YAML file or '-' for stdin") + + # Mutually exclusive: either unroll OR expand + mode_group = parser.add_mutually_exclusive_group() + mode_group.add_argument( + "--unroll", metavar="PARAMS", + help="Comma-separated matrix parameter names to hoist into tasks (e.g., 'gaiePluginConfig' or 'p1,p2')." + ) + mode_group.add_argument( + "--sliding-window", action="store_true", + help="(Expand mode) Use sliding-window scheduling (each task i depends on i-n). Default is barrier batching." + ) + + # Expansion options (used only if NOT --unroll) + parser.add_argument( + "-n", "--max-parallel", type=int, default=1, + help="(Expand mode) Maximum number of tasks to run in parallel. Default: 1" + ) + + parser.add_argument( + "-o", "--output", default=None, + help="Output file path. Use '-' or omit to write to stdout." 
+ ) + + args = parser.parse_args() + + try: + original = load_yaml_from_path_or_stdin(args.input) + + if args.unroll: + unroll_params = [s.strip() for s in args.unroll.split(",") if s.strip()] + transformed = transform_unroll_params_dict(original_yaml=original, unroll_params=unroll_params) + mode_desc = f"unroll={unroll_params}" + else: + transformed = transform_matrix_to_batched_dict( + original_yaml=original, + max_parallel=args.max_parallel, + sliding_window=args.sliding_window, + ) + mode_desc = "sliding-window" if args.sliding_window else "barrier" + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + announce = None + if args.output not in (None, "-"): + if args.unroll: + announce = f"✅ Transformed PipelineRun saved to '{args.output}' ({mode_desc})" + else: + announce = f"✅ Transformed PipelineRun saved to '{args.output}' (mode={mode_desc}, max_parallel={args.max_parallel})" + + dump_yaml_to_path_or_stdout(transformed, args.output, announce_to_stderr=announce) + +if __name__ == "__main__": + main() \ No newline at end of file From 46d469fe030313f8dfc128966ce729992dd6976b Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Wed, 8 Oct 2025 14:45:42 -0400 Subject: [PATCH 18/44] change workload steps Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/experiment-task.yaml | 291 ++++-- tekton-poc/pipeline/experiment-taskrun.yaml | 25 - .../pipeline/pipelinerun-matrix-subset.yaml | 81 ++ tekton-poc/pipeline/pipelinerun-matrix.yaml | 13 +- .../pipeline/pipelinerun-sequential-1.yaml | 841 ++++++++++++++++ .../pipelinerun-sequential-4-sliding.yaml | 835 ++++++++++++++++ .../pipeline/pipelinerun-sequential-4.yaml | 931 ++++++++++++++++++ ...un-sequential-unroll-gaiePluginConfig.yaml | 119 +++ tekton-poc/pipeline/stepactions.yaml | 6 +- 9 files changed, 3022 insertions(+), 120 deletions(-) delete mode 100644 tekton-poc/pipeline/experiment-taskrun.yaml create mode 100644 tekton-poc/pipeline/pipelinerun-matrix-subset.yaml create mode 100644 tekton-poc/pipeline/pipelinerun-sequential-1.yaml create mode 100644 tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml create mode 100644 tekton-poc/pipeline/pipelinerun-sequential-4.yaml create mode 100644 tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml index 64a311c3..2f651b89 100644 --- a/tekton-poc/pipeline/experiment-task.yaml +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -6,8 +6,10 @@ spec: description: > Runs an llm-d-benchmark experiment. 
- params: + workspaces: + - name: data + params: - name: question_len type: string - name: output_len @@ -30,16 +32,6 @@ spec: type: string default: "experiment" - - name: workspace-pvc-name - type: string - default: workspace-pvc - - name: workspace-pvc-size - type: string - default: 20Gi - - name: workspace-storage-class - type: string - default: ocs-storagecluster-cephfs - - name: model-pvc-name type: string default: model-pvc @@ -104,6 +96,15 @@ spec: type: string default: 900 + - name: llmdbenchImageRegistry + default: "quay.io" + - name: llmdbenchImageRepo + default: "namasluk" + - name: llmdbenchImageName + default: "llm-d-benchmark" + - name: llmdbenchImageTag + default: "251002.1" + - name: harnessName type: string default: inference-perf @@ -113,10 +114,20 @@ spec: - name: stackType type: string default: lld-d - - name: experimentIDBase + - name: pipelineUID type: string default: experiment + - name: bucket + type: string + default: "cloud-object-storage-cos-standard-ere" + - name: prefix + type: string + default: "results" + - name: endpoint + type: string + default: "https://s3.us-east.cloud-object-storage.appdomain.cloud" + - name: dry-run type: string default: "false" @@ -202,22 +213,6 @@ spec: #!/bin/sh echo "âŗ TBD: Wait for download job to complete" - # TBD use tekton notion of workspace ?? - - name: create-workspace-pvc - ref: - name: create-rwx-pvc - params: - - name: name - value: $(params.workspace-pvc-name) - - name: namespace - value: $(params.namespace)-$(context.taskRun.name) - - name: size - value: $(params.workspace-pvc-size) - - name: storage-class - value: $(params.workspace-storage-class) - - name: dry-run - value: $(params.dry-run) - - name: gateway ref: name: helm-upgrade-install @@ -246,7 +241,7 @@ spec: name: helm-upgrade-install params: - name: releaseName - value: $(params.experimentName)-gaie + value: $(params.experimentName)-gaie-NAMESPACE_HASH - name: chart value: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool - name: version @@ -283,6 +278,11 @@ spec: value: 15m - name: valuesYamlUrl value: "$(params.experimentBaseUrl)/ms-values.yaml" + - name: extraArgs + value: > + --set routing.inferencePool.name=$(params.experimentName)-gaie-NAMESPACE_HASH + --set routing.httpRoute.rules[0].backendRefs[0].name=$(params.experimentName)-gaie-NAMESPACE_HASH + --set routing.httpRoute.rules[1].backendRefs[0].name=$(params.experimentName)-gaie-NAMESPACE_HASH - name: dry-run value: $(params.dry-run) @@ -333,94 +333,199 @@ spec: # echo "✅ prefill pods serving model ${MODEL_ID} ready" - name: workload - ref: - name: helm-upgrade-install - params: - # Location of helm chart - - name: git_url - value: "https://github.com/kalantar/llm-d-benchmark" - - name: git_revision - value: "tekton-poc" - - name: checkout_dir - value: "/tmp/llm-d-benchmark" - - # Helm arguments - - name: releaseName - value: $(params.experimentName)-harness - - name: chart - value: /tmp/llm-d-benchmark/charts/harness - - name: namespace - value: $(params.namespace)-$(context.taskRun.name) - - name: timeout - value: 15m - # - name: valuesYamlUrl - # value: "/tmp/llm-d-benchmark/charts/harness/values.yaml" - - name: extraArgs - value: > - --set harness.image.registry=quay.io - --set harness.image.repository=namasluk - --set harness.image.name=llm-d-benchmark - --set harness.image.tag=251002.1 - --set experiment.profile.name=$(params.harnessProfile) - --set experiment.profile.shared_prefix.question_len=$(params.question_len) - --set 
experiment.profile.shared_prefix.output_len=$(params.output_len) - --set experiment.identifier=experiment-DATE - --set stack.model=$(params.model-id) - --set stack.name=$(context.taskRun.name) - --set stack.endpointUrl='http://experiment-gateway-inference-gateway:80' - - - name: dry-run - value: $(params.dry-run) + image: $(params.llmdbenchImageRegistry)/$(params.llmdbenchImageRepo)/$(params.llmdbenchImageName):$(params.llmdbenchImageTag) + env: + - name: LLMDBENCH_RUN_EXPERIMENT_LAUNCHER + value: "1" + - name: LLMDBENCH_RUN_EXPERIMENT_ANALYZE_LOCALLY + value: "0" + - name: LLMDBENCH_RUN_EXPERIMENT_HARNESS + value: "$(params.harnessName)-llm-d-benchmark.sh" + - name: LLMDBENCH_RUN_EXPERIMENT_ANALYZER + value: "$(params.harnessName)-analyze_results.sh" + - name: LLMDBENCH_RUN_EXPERIMENT_HARNESS_WORKLOAD_NAME + value: "$(params.harnessProfile)" + - name: LLMDBENCH_HARNESS_NAME + value: "$(params.harnessName)" + - name: LLMDBENCH_HARNESS_NAMESPACE + value: "$(params.namespace)-$(context.taskRun.name)" + - name: LLMDBENCH_HARNESS_STACK_TYPE + value: "llm-d" + - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL + value: "http://experiment-gateway-inference-gateway.$(params.namespace)-$(context.taskRun.name).svc.cluster.local:80" + - name: LLMDBENCH_DEPLOY_METHODS + value: "modelservice" + - name: LLMDBENCH_MAGIC_ENVAR + value: "harness_pod" + + - name: LLMDBENCH_LLMD_IMAGE_REGISTRY + value: "$(params.llmdbenchImageRegistry)" + - name: LLMDBENCH_LLMD_IMAGE_REPO + value: "$(params.llmdbenchImageRepo)" + - name: LLMDBENCH_LLMD_IMAGE_NAME + value: "$(params.llmdbenchImageName)" + - name: LLMDBENCH_LLMD_IMAGE_TAG + value: "$(params.llmdbenchImageTag)" + + # TBD add_env_vars_to_pod $LLMDBENCH_CONTROL_ENV_VAR_LIST_TO_POD + - name: LLMDBENCH_DEPLOY_CURRENT_MODEL + value: "$(params.model-id)" + - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS + value: "0" + - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS + value: "2" + - name: LLMDBENCH_VLLM_COMMON_AFFINITY + value: "nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3" + - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM + value: "4" + - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM + value: "1" + - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM + value: "1" + - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM + value: "1" + + - name: HF_TOKEN_SECRET + value: "hf-secret" + - name: HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + + computeResources: + requests: + memory: "32Gi" + cpu: "16" + limits: + memory: "32Gi" + cpu: "16" - - name: wait-for-workload - image: alpine/kubectl:1.34.1 - script : | - #!/bin/sh + script: | + #!/bin/bash + + export EXPERIMENT_ID="experiment-$(echo -n $(params.pipelineUID) | cut -c1-8)" + export LLMDBENCH_RUN_EXPERIMENT_ID="${EXPERIMENT_ID}" + export LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR="$(workspaces.data.path)/$(params.harnessName)_${EXPERIMENT_ID}_$(context.taskRun.name)" + export LLMDBENCH_CONTROL_WORK_DIR="$(workspaces.data.path)/$(params.harnessName)_${EXPERIMENT_ID}_$(context.taskRun.name)" + export LLMDBENCH_HARNESS_STACK_NAME=$(echo "$(params.model-id)" | tr '[:upper:]' '[:lower:]' | sed 's/[./]/-/g') + export LLMDBENCH_DEPLOY_CURRENT_MODELID="${LLMDBENCH_HARNESS_STACK_NAME}" + export LLMDBENCH_DEPLOY_CURRENT_TOKENIZER="$(params.model-id)" + + export QUESTION_LEN=$(params.question_len) + export OUTPUT_LEN=$(params.output_len) + + get_profiles() { + git init llm-d-benchmark + cd llm-d-benchmark + git remote add origin 
https://github.com/llm-d/llm-d-benchmark.git + git config core.sparseCheckout true + echo "workload/profiles/" >> .git/info/sparse-checkout + git pull origin main + } if [ "$(params.dry-run)" = "true" ]; then echo ">> skipping" exit 0 fi - NAMESPACE="$(params.namespace)-$(context.taskRun.name)" - HARNESS_NAME="$(params.harnessName)" - - echo "âŗ Waiting for pod ${HARNESS_NAME}-launcher to complete..." - - while true; do - STATUS=$(kubectl --namespace ${NAMESPACE} get pod ${HARNESS_NAME}-launcher -o jsonpath='{.status.phase}') - if [ "$STATUS" = "Succeeded" ] || [ "$STATUS" = "Failed" ]; then - echo "Pod completed with status: $STATUS" - break - fi - echo "âŗ Still waiting for pod to complete..." - sleep 5 + get_profiles + + echo "creating CONTROL directories" + mkdir -p ${LLMDBENCH_CONTROL_WORK_DIR}/setup + rm -f ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands + touch ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands + + workload=$(echo $(params.harnessProfile) | sed 's^\.yaml^^g' ) + echo "workload = $workload" + workload_template_list=$(find workload/profiles/ -name "${workload}.yaml.in") + echo "workload_template_list = $workload_template_list" + + for workload_template_full_path in $workload_template_list; do + echo "PROCESSING $workload_template_full_path" + workload_template_type=$(echo ${workload_template_full_path} | rev | cut -d '/' -f 2 | rev) + echo "workload_template_type = $workload_template_type" + workload_template_file_name=$(echo ${workload_template_full_path} | rev | cut -d '/' -f 1 | rev | sed -e "s^\.yaml.in$^^g") + echo "workload_template_file_name = $workload_template_file_name" + ## + workload_output_file=${LLMDBENCH_CONTROL_WORK_DIR}/shared_prefix_synthetic_short.yaml + # workload_output_file=${LLMDBENCH_CONTROL_WORK_DIR}/$workload_template_type/$workload_template_file_name + echo "workload_output_file = $workload_output_file" + ## + mkdir -p ${LLMDBENCH_CONTROL_WORK_DIR}/$workload_template_type + + echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL^${LLMDBENCH_DEPLOY_CURRENT_MODEL}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands + echo "s^REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL^${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands + echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_TOKENIZER^${LLMDBENCH_DEPLOY_CURRENT_TOKENIZER}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands + echo "s^question_len: .*^question_len: ${QUESTION_LEN}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands + echo "s^output_len: .*^output_len: ${OUTPUT_LEN}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands + echo "s^ path: .*^ path: ${LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands + + echo "------" + cat ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands + echo "------" + echo "workload_output_file=$workload_output_file" + sed -f ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands $workload_template_full_path > $workload_output_file + + cat $workload_output_file done - echo "✅ workload completed" + llm-d-benchmark.sh - name: upload-results - image: alpine:3.20 - script : | - #!/bin/sh - echo "🚚 TBD: Upload results" + image: amazon/aws-cli:2.31.9 + workingDir: $(workspaces.data.path) + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: ibm-cos-secret + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: ibm-cos-secret + key: AWS_SECRET_ACCESS_KEY + - name: AWS_EC2_METADATA_DISABLED + value: "true" + script: | + 
#!/usr/bin/env sh + set -euo pipefail + + dnf install tar gzip -y + + EXPERIMENT_ID="experiment-$(echo -n $(params.pipelineUID) | cut -c1-8)" + EXPERIMENT_RESULTS_FOLDER="$(params.harnessName)_${EXPERIMENT_ID}_$(context.taskRun.name)" + ARCHIVE_NAME="${EXPERIMENT_RESULTS_FOLDER}.tar.gz" + + tar -czf ${ARCHIVE_NAME} -C "$(workspaces.data.path)" ${EXPERIMENT_RESULTS_FOLDER} + + aws s3 cp ${ARCHIVE_NAME} "s3://$(params.bucket)/${ARCHIVE_NAME}" \ + --endpoint-url "$(params.endpoint)" \ + --content-type "application/x-tar" \ + --content-encoding "gzip" \ + --no-progress + # --recursive \ + + rm -rf ${ARCHIVE_NAME} + + echo "✅ Uploaded results to ${ARCHIVE_NAME}" - name: delete-namespace image: alpine/helm:3.14.0 script : | #!/bin/sh + NAMESPACE="$(params.namespace)-$(context.taskRun.name)" + if [ "$(params.dry-run)" = "true" ]; then echo ">> skipping" exit 0 fi - NAMESPACE="$(params.namespace)-$(context.taskRun.name)" - - # helm delete --namespace ${NAMESPACE} $(params.experimentName)-harness # kubectl delete namespace ${NAMESPACE} - echo "✅ workload pod deleted" + echo "✅ workload namespace deleted" - name: log-completion image: alpine:3.20 diff --git a/tekton-poc/pipeline/experiment-taskrun.yaml b/tekton-poc/pipeline/experiment-taskrun.yaml deleted file mode 100644 index 86fca127..00000000 --- a/tekton-poc/pipeline/experiment-taskrun.yaml +++ /dev/null @@ -1,25 +0,0 @@ -apiVersion: tekton.dev/v1 -kind: TaskRun -metadata: - name: experiment-run -spec: - serviceAccountName: helm-installer - taskRef: - name: experiment - params: - - name: namespace - value: kalantar - - name: model-id - value: "Qwen/Qwen3-0.6B" - - name: experimentBaseUrl - value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - - name: harnessProfile - value: shared_prefix_synthetic.yaml - - - name: gaiePluginConfig - value: "inf-sche-queue.yaml" - - name: question_len - value: 100 - - name: output_len - value: 300 - diff --git a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml new file mode 100644 index 00000000..244d8c72 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml @@ -0,0 +1,81 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: "Qwen/Qwen3-0.6B" + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + matrix: + params: + - name: gaiePluginConfig + value: + - "inf-sche-none.yaml" + - "inf-sche-prefix.yaml" + - "inf-sche-kv.yaml" + - "inf-sche-queue.yaml" + - name: question_len + value: + - "100" + - "300" + - "1000" + - name: output_len + value: + - "100" + - "300" + - "1000" + include: + - name: combo-1 + params: + - name: gaiePluginConfig + value: "inf-sche-none.yaml" + - name: question_len + value: "100" + - name: output_len + value: "100" + - name: combo-2 + params: + - name: gaiePluginConfig + value: "inf-sche-prefix.yaml" + - name: question_len + value: "300" + - name: output_len + value: "300" 
+ - name: combo-3 + params: + - name: gaiePluginConfig + value: "inf-sche-kv.yaml" + - name: question_len + value: "1000" + - name: output_len + value: "100" + - name: combo-4 + params: + - name: gaiePluginConfig + value: "inf-sche-queue.yaml" + - name: question_len + value: "300" + - name: output_len + value: "1000" \ No newline at end of file diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml index 739e6d30..73c6ef5f 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -5,11 +5,20 @@ metadata: spec: taskRunTemplate: serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc pipelineSpec: + workspaces: + - name: data tasks: - name: run-experiment taskRef: name: experiment + workspaces: + - name: data + workspace: data params: - name: namespace value: CHANGE_ME @@ -18,7 +27,9 @@ spec: - name: experimentBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile - value: shared_prefix_synthetic.yaml + value: shared_prefix_synthetic_short.yaml + - name: pipelineUID + value: "$(context.pipelineRun.uid)" matrix: params: - name: gaiePluginConfig diff --git a/tekton-poc/pipeline/pipelinerun-sequential-1.yaml b/tekton-poc/pipeline/pipelinerun-sequential-1.yaml new file mode 100644 index 00000000..a4b77783 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-sequential-1.yaml @@ -0,0 +1,841 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment-0 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + - name: run-experiment-1 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-0 + - name: run-experiment-2 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - 
run-experiment-1 + - name: run-experiment-3 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-2 + - name: run-experiment-4 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-3 + - name: run-experiment-5 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-4 + - name: run-experiment-6 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-5 + - name: run-experiment-7 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-6 + - name: run-experiment-8 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-7 + - name: run-experiment-9 + taskRef: + name: 
experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-8 + - name: run-experiment-10 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-9 + - name: run-experiment-11 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-10 + - name: run-experiment-12 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-11 + - name: run-experiment-13 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-12 + - name: run-experiment-14 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-13 + - name: run-experiment-15 + taskRef: + name: experiment + workspaces: + - name: data + 
workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-14 + - name: run-experiment-16 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-15 + - name: run-experiment-17 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-16 + - name: run-experiment-18 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-17 + - name: run-experiment-19 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-18 + - name: run-experiment-20 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-19 + - name: run-experiment-21 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + 
value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-20 + - name: run-experiment-22 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-21 + - name: run-experiment-23 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-22 + - name: run-experiment-24 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-23 + - name: run-experiment-25 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-24 + - name: run-experiment-26 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-25 + - name: run-experiment-27 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - 
name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-26 + - name: run-experiment-28 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-27 + - name: run-experiment-29 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-28 + - name: run-experiment-30 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-29 + - name: run-experiment-31 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-30 + - name: run-experiment-32 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-31 + - name: run-experiment-33 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-32 + - name: run-experiment-34 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-33 + - name: run-experiment-35 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-34 diff --git a/tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml b/tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml new file mode 100644 index 00000000..76f815b6 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml @@ -0,0 +1,835 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment-0 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + - name: run-experiment-1 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + - name: run-experiment-2 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: 
harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + - name: run-experiment-3 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + - name: run-experiment-4 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-0 + - name: run-experiment-5 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-1 + - name: run-experiment-6 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-2 + - name: run-experiment-7 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-3 + - name: run-experiment-8 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len 
+ value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-4 + - name: run-experiment-9 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-5 + - name: run-experiment-10 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-6 + - name: run-experiment-11 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-7 + - name: run-experiment-12 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-8 + - name: run-experiment-13 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-9 + - name: run-experiment-14 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: 
'1000' + runAfter: + - run-experiment-10 + - name: run-experiment-15 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-11 + - name: run-experiment-16 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-12 + - name: run-experiment-17 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-13 + - name: run-experiment-18 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-14 + - name: run-experiment-19 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-15 + - name: run-experiment-20 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-16 + - name: 
run-experiment-21 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-17 + - name: run-experiment-22 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-18 + - name: run-experiment-23 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-19 + - name: run-experiment-24 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-20 + - name: run-experiment-25 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-21 + - name: run-experiment-26 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-22 + - name: run-experiment-27 + taskRef: + name: experiment + workspaces: + 
- name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-23 + - name: run-experiment-28 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-24 + - name: run-experiment-29 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-25 + - name: run-experiment-30 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-26 + - name: run-experiment-31 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-27 + - name: run-experiment-32 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-28 + - name: run-experiment-33 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - 
name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-29 + - name: run-experiment-34 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-30 + - name: run-experiment-35 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-31 diff --git a/tekton-poc/pipeline/pipelinerun-sequential-4.yaml b/tekton-poc/pipeline/pipelinerun-sequential-4.yaml new file mode 100644 index 00000000..988117a1 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-sequential-4.yaml @@ -0,0 +1,931 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment-0 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + - name: run-experiment-1 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + - name: run-experiment-2 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + - name: run-experiment-3 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + - name: run-experiment-4 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-0 + - run-experiment-1 + - run-experiment-2 + - run-experiment-3 + - name: run-experiment-5 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-0 + - run-experiment-1 + - run-experiment-2 + - run-experiment-3 + - name: run-experiment-6 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-0 + - run-experiment-1 + - run-experiment-2 + - run-experiment-3 + - name: run-experiment-7 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-0 + - run-experiment-1 + - run-experiment-2 + - run-experiment-3 + - name: run-experiment-8 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + 
params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-4 + - run-experiment-5 + - run-experiment-6 + - run-experiment-7 + - name: run-experiment-9 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-4 + - run-experiment-5 + - run-experiment-6 + - run-experiment-7 + - name: run-experiment-10 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-4 + - run-experiment-5 + - run-experiment-6 + - run-experiment-7 + - name: run-experiment-11 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-4 + - run-experiment-5 + - run-experiment-6 + - run-experiment-7 + - name: run-experiment-12 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-8 + - run-experiment-9 + - run-experiment-10 + - run-experiment-11 + - name: run-experiment-13 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - 
name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-8 + - run-experiment-9 + - run-experiment-10 + - run-experiment-11 + - name: run-experiment-14 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-8 + - run-experiment-9 + - run-experiment-10 + - run-experiment-11 + - name: run-experiment-15 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-8 + - run-experiment-9 + - run-experiment-10 + - run-experiment-11 + - name: run-experiment-16 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-12 + - run-experiment-13 + - run-experiment-14 + - run-experiment-15 + - name: run-experiment-17 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-12 + - run-experiment-13 + - run-experiment-14 + - run-experiment-15 + - name: run-experiment-18 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-12 + - run-experiment-13 + - run-experiment-14 + - run-experiment-15 + - name: run-experiment-19 + taskRef: + name: experiment + workspaces: + - 
name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-12 + - run-experiment-13 + - run-experiment-14 + - run-experiment-15 + - name: run-experiment-20 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-16 + - run-experiment-17 + - run-experiment-18 + - run-experiment-19 + - name: run-experiment-21 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-16 + - run-experiment-17 + - run-experiment-18 + - run-experiment-19 + - name: run-experiment-22 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-16 + - run-experiment-17 + - run-experiment-18 + - run-experiment-19 + - name: run-experiment-23 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-16 + - run-experiment-17 + - run-experiment-18 + - run-experiment-19 + - name: run-experiment-24 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: 
shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-20 + - run-experiment-21 + - run-experiment-22 + - run-experiment-23 + - name: run-experiment-25 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-20 + - run-experiment-21 + - run-experiment-22 + - run-experiment-23 + - name: run-experiment-26 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-20 + - run-experiment-21 + - run-experiment-22 + - run-experiment-23 + - name: run-experiment-27 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-20 + - run-experiment-21 + - run-experiment-22 + - run-experiment-23 + - name: run-experiment-28 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-24 + - run-experiment-25 + - run-experiment-26 + - run-experiment-27 + - name: run-experiment-29 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-24 + - run-experiment-25 + - run-experiment-26 + - run-experiment-27 + - name: run-experiment-30 + taskRef: + name: 
experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-24 + - run-experiment-25 + - run-experiment-26 + - run-experiment-27 + - name: run-experiment-31 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-24 + - run-experiment-25 + - run-experiment-26 + - run-experiment-27 + - name: run-experiment-32 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-28 + - run-experiment-29 + - run-experiment-30 + - run-experiment-31 + - name: run-experiment-33 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-28 + - run-experiment-29 + - run-experiment-30 + - run-experiment-31 + - name: run-experiment-34 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-28 + - run-experiment-29 + - run-experiment-30 + - run-experiment-31 + - name: run-experiment-35 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-28 + - run-experiment-29 + - run-experiment-30 + - run-experiment-31 diff --git a/tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml b/tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml new file mode 100644 index 00000000..5c36a680 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml @@ -0,0 +1,119 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment-0 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + matrix: + params: + - name: question_len + value: &id001 + - '100' + - '300' + - '1000' + - name: output_len + value: &id002 + - '100' + - '300' + - '1000' + - name: run-experiment-1 + taskRef: + name: experiment + runAfter: + - run-experiment-0 + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + matrix: + params: + - name: question_len + value: *id001 + - name: output_len + value: *id002 + - name: run-experiment-2 + taskRef: + name: experiment + runAfter: + - run-experiment-1 + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + matrix: + params: + - name: question_len + value: *id001 + - name: output_len + value: *id002 + - name: run-experiment-3 + taskRef: + name: experiment + runAfter: + - run-experiment-2 + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + matrix: + params: + - name: question_len + value: *id001 + - name: output_len + value: *id002 diff --git a/tekton-poc/pipeline/stepactions.yaml b/tekton-poc/pipeline/stepactions.yaml index 
7a135024..60d28385 100644 --- a/tekton-poc/pipeline/stepactions.yaml +++ b/tekton-poc/pipeline/stepactions.yaml @@ -236,6 +236,10 @@ spec: exit 0 fi + SHA256CMD=$(type -p gsha256sum || type -p sha256sum) + NAMESPACE_HASH=$(echo -n "$HELM_NAMESPACE" | $SHA256CMD | awk '{print $1}' | cut -c1-8) + HELM_RELEASE=$(echo "$HELM_RELEASE" | sed "s/NAMESPACE_HASH/$NAMESPACE_HASH/g") + # if a GIT_URL is defined, clone the project; we will use helm chart from this if [ -n "${GIT_URL:-}" ]; then mkdir -p "$CHECKOUT_DIR" @@ -300,7 +304,7 @@ spec: esac if [ -n "${HELM_EXTRA_ARGS:-}" ]; then - HELM_EXTRA_ARGS=$(echo "$HELM_EXTRA_ARGS" | sed "s/DATE/$(date +%s)/g") + HELM_EXTRA_ARGS=$(echo "$HELM_EXTRA_ARGS" | sed "s/NAMESPACE_HASH/$NAMESPACE_HASH/g") fi echo "==> helm upgrade --install ${HELM_RELEASE} ${CHART_REF} --namespace ${HELM_NAMESPACE} ${VERSION_FLAG} ${CREATE_NS_FLAG} ${WAIT_FLAG} ${TIMEOUT_FLAG} ${VALUES_FLAG} ${HELM_EXTRA_ARGS}" From 59b329aac2e35c8f0b77024fee0d77bb164e0fd1 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Wed, 8 Oct 2025 14:57:03 -0400 Subject: [PATCH 19/44] update readme Signed-off-by: Michael Kalantar --- tekton-poc/README.md | 25 +++++++++++++------ ... => pipelinerun-sequential-4-barrier.yaml} | 0 2 files changed, 18 insertions(+), 7 deletions(-) rename tekton-poc/pipeline/{pipelinerun-sequential-4.yaml => pipelinerun-sequential-4-barrier.yaml} (100%) diff --git a/tekton-poc/README.md b/tekton-poc/README.md index 6bacc50d..677fa3af 100644 --- a/tekton-poc/README.md +++ b/tekton-poc/README.md @@ -37,8 +37,8 @@ A single Task measures performance over a single set of values from the factor/v 4. Download the model from HuggingFace to a PVC 5. Deploy the model 6. Run the workload for a single set of parameters -7. Upload the results to external storage (not yet implemented)\ -8. Delete the experiment namespace +7. Upload the results to external storage (s3) +8. Delete the experiment namespace (not yet implemented) A PipelineRun is created that embeds a Pipeline containing one Task with a matrix of values for a set of factors. An example is `pipelinerun-matrix.yaml`. @@ -60,12 +60,15 @@ A PipelineRun is created that embeds a Pipeline containing one Task with a matri ```shell kubectl apply -f pipeline/roles.yaml ``` -4. Deploy the steps and tasks: + +4. Create a RWX PVC `workspace-pvc` for storing execution results. This PVC is shared between all tasks. + +5. Deploy the steps and tasks: ```shell kubectl apply -f pipeline/stepactions.yaml kubectl apply -f pipeline/experiment-task.yaml ``` -5. Run experiments (set the parameter `namespace` to $NAMESPACE): +6. Run experiments (set the parameter `namespace` to $NAMESPACE): ```shell kubectl apply -f pipeline/pipelinerun-matrix.yaml ``` @@ -84,9 +87,17 @@ tkn tr logs -f ## Managing Parallelism -The sample `PipelineRun` (`pipeline/pipelinerun-matrix.yaml`) executes all the tasks in parallel. Depdending on the size of the matrix, this may require a large number of resources. -A _matrix_ based `Task` can be unrolled into multiple tasks to reduce the parallelism. -The utility script `utility/transform-pr-parallel.py` does this as follows: +The default PipelineSpec (in `pipeline/pipelinerun-matrix.yaml`) executes all the tasks in parallel. It can be modified in a number of ways to reduce the amount of parallel execution (at the expense of time). + +Some examples are provided: + +- `pipeline/pipelinerun-matrix-subset.yaml`: Uses `matrix.include` to list an explicit set of combinations to execute. 
+- `pipeline/pipelinerun-sequential-1.yaml`: Executes 1 task at a time. Each task depends on the previous one. +- `pipeline/pipelinerun-sequential-4-barrier.yaml`: Executes 4 tasks at a time. When all 4 complete, the next 4 start. +- `pipeline/pipelinerun-sequential-4-sliding.yaml`: Executes 4 tasks at a time. When one task completes another starts. +- `pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml`: Creates one task for each value of one dimention of the matrix. Each is executed in sequence. However, for other dimensions, parallel execution takes place. + +The utility script `utility/transform-pr-parallel.py` can be used to transform a default `PipelineRun` into alternatives as follows: 1. Unroll a single parameter into one `Task` per value. Each resulting Task defines a matrix over the remaining parameters. diff --git a/tekton-poc/pipeline/pipelinerun-sequential-4.yaml b/tekton-poc/pipeline/pipelinerun-sequential-4-barrier.yaml similarity index 100% rename from tekton-poc/pipeline/pipelinerun-sequential-4.yaml rename to tekton-poc/pipeline/pipelinerun-sequential-4-barrier.yaml From 8dfadd87e9b034bfbd4a9095b03fdec401172917 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Wed, 8 Oct 2025 15:44:06 -0400 Subject: [PATCH 20/44] pin image version Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/experiment-task.yaml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml index 2f651b89..8108f7a5 100644 --- a/tekton-poc/pipeline/experiment-task.yaml +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -140,7 +140,7 @@ spec: echo "🔄 Starting sweep step ..." - name: prepare-namespace - image: quay.io/openshift/origin-cli:latest + image: quay.io/openshift/origin-cli:4.21 script: | #!/bin/sh @@ -155,7 +155,6 @@ spec: kubectl create namespace ${NAMESPACE} \ --dry-run=client -o yaml | kubectl apply -f - - # HF_TOKEN=$( HF_TOKEN=$( kubectl get secret hf-secret \ --namespace "$(context.taskRun.namespace)" \ @@ -163,7 +162,7 @@ spec: | tr -d '\n' \ | base64 -d ) - # kubectl --namespace $(context.taskRun.namespace) get secret hf-secret -o jsonpath='{.data.HF_TOKEN}' | tr -d '\n' | base64 -d) + kubectl create secret generic hf-secret \ --namespace ${NAMESPACE} \ --from-literal="HF_TOKEN=${HF_TOKEN}" \ @@ -311,6 +310,7 @@ spec: --timeout=${MODEL_START_TIMEOUT}s echo "✅ (decode) pods serving model ${MODEL_ID} created" + # TBD check if any prefill pods and wait if so # kubectl --namespace ${NAMESPACE} \ # wait pod \ # -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=prefill \ @@ -325,6 +325,7 @@ spec: --timeout=${MODEL_START_TIMEOUT}s echo "✅ (decode) pods serving model ${MODEL_ID} ready" + # TBD check if any prefill pods and wait if so # kubectl --namespace ${NAMESPACE} \ # wait pod \ # -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=prefill \ From ed015a8bf21b78e72a86b7d8ec6e09761ff26fa3 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 9 Oct 2025 09:45:05 -0400 Subject: [PATCH 21/44] update roles.yaml Signed-off-by: Michael Kalantar --- tekton-poc/README.md | 2 +- tekton-poc/pipeline/roles.yaml | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/tekton-poc/README.md b/tekton-poc/README.md index 677fa3af..dc5c58df 100644 --- a/tekton-poc/README.md +++ b/tekton-poc/README.md @@ -56,7 +56,7 @@ A PipelineRun is created that embeds a Pipeline containing one Task with a matri --from-literal="HF_TOKEN=${HF_TOKEN}" \ --dry-run=client -o yaml | kubectl apply -f - 
``` -3. Give the task needed permissions +3. Give the task needed permissions (edit to set namespace) ```shell kubectl apply -f pipeline/roles.yaml ``` diff --git a/tekton-poc/pipeline/roles.yaml b/tekton-poc/pipeline/roles.yaml index 68a8aa2d..5bf06b1a 100644 --- a/tekton-poc/pipeline/roles.yaml +++ b/tekton-poc/pipeline/roles.yaml @@ -2,7 +2,6 @@ apiVersion: v1 kind: ServiceAccount metadata: name: helm-installer - namespace: kalantar --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -55,7 +54,7 @@ roleRef: subjects: - kind: ServiceAccount name: helm-installer - namespace: kalantar + namespace: CHANGE_ME --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding @@ -73,7 +72,6 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: helm-access - namespace: kalantar rules: - apiGroups: [""] resources: ["secrets", "configmaps", "services", "pods", "namespaces", "serviceaccounts", "persistentvolumeclaims"] @@ -111,11 +109,10 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: helm-access-binding - namespace: kalantar subjects: - kind: ServiceAccount name: helm-installer - namespace: kalantar + namespace: CHANGE_ME roleRef: kind: Role name: helm-access From 0fb4271ea3cd39b3e77220df4b40aba24f78c80b Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 9 Oct 2025 10:30:20 -0400 Subject: [PATCH 22/44] update readme Signed-off-by: Michael Kalantar --- tekton-poc/README.md | 68 ++++++++++++++++++++++++++++++---- tekton-poc/pipeline/roles.yaml | 8 +++- 2 files changed, 66 insertions(+), 10 deletions(-) diff --git a/tekton-poc/README.md b/tekton-poc/README.md index dc5c58df..769c03f5 100644 --- a/tekton-poc/README.md +++ b/tekton-poc/README.md @@ -42,13 +42,20 @@ A single Task measures performance over a single set of values from the factor/v A PipelineRun is created that embeds a Pipeline containing one Task with a matrix of values for a set of factors. An example is `pipelinerun-matrix.yaml`. -## Use +## Usage -1. Create a namespace, for example: $NAMESPACE and set to current context: +### Setup + +1. Create a namespace where the Tekton pipeline will execute. ```shell + export $NAMESPACE=your_namespace kubectl create ns $NAMESPACE + ``` + For convenience, set the current context: + ```shell kubectl config set-context --current --namespace $NAMESPACE ``` + 2. Deploy a secret `hf-secret` containing your HuggingFace token in the namespace. ```shell kubectl create secret generic hf-secret \ @@ -56,35 +63,80 @@ A PipelineRun is created that embeds a Pipeline containing one Task with a matri --from-literal="HF_TOKEN=${HF_TOKEN}" \ --dry-run=client -o yaml | kubectl apply -f - ``` + 3. Give the task needed permissions (edit to set namespace) ```shell - kubectl apply -f pipeline/roles.yaml + envsubst '$NAMESPACE' < pipeline/roles.yaml | kubectl apply -f - + ``` + +4. Create a RWX PVC `workspace-pvc` for storing execution results. This PVC is shared between all tasks. For example: + ```shell + cat < -f ``` +Describe a `TaskRun`: + +```shell +tkn tr describe +``` + +### Cleanup + +Delete the `PipelineRun`: + +```shell +tkn pr delete -f +``` + +**Note**: The current implementation does not remove the namespaces created by each sweep step. Manually delete them to release all their resources. If you leave them, subsequent executions of the pipeline will attempt to reuse the resources. + ## Managing Parallelism The default PipelineSpec (in `pipeline/pipelinerun-matrix.yaml`) executes all the tasks in parallel. 
It can be modified in a number of ways to reduce the amount of parallel execution (at the expense of time). diff --git a/tekton-poc/pipeline/roles.yaml b/tekton-poc/pipeline/roles.yaml index 5bf06b1a..b49148df 100644 --- a/tekton-poc/pipeline/roles.yaml +++ b/tekton-poc/pipeline/roles.yaml @@ -2,6 +2,7 @@ apiVersion: v1 kind: ServiceAccount metadata: name: helm-installer + namespace: ${NAMESPACE} --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRole @@ -54,12 +55,13 @@ roleRef: subjects: - kind: ServiceAccount name: helm-installer - namespace: CHANGE_ME + namespace: ${NAMESPACE} --- apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: helm-installer-restricted-scc + namespace: ${NAMESPACE} subjects: - kind: ServiceAccount name: helm-installer @@ -72,6 +74,7 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: helm-access + namespace: ${NAMESPACE} rules: - apiGroups: [""] resources: ["secrets", "configmaps", "services", "pods", "namespaces", "serviceaccounts", "persistentvolumeclaims"] @@ -109,10 +112,11 @@ apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: name: helm-access-binding + namespace: ${NAMESPACE} subjects: - kind: ServiceAccount name: helm-installer - namespace: CHANGE_ME + namespace: ${NAMESPACE} roleRef: kind: Role name: helm-access From 6535c56366efe44a8a26932bcf5f29becb077dc3 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 9 Oct 2025 10:32:48 -0400 Subject: [PATCH 23/44] update readme Signed-off-by: Michael Kalantar --- tekton-poc/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tekton-poc/README.md b/tekton-poc/README.md index 769c03f5..8d4464c7 100644 --- a/tekton-poc/README.md +++ b/tekton-poc/README.md @@ -64,7 +64,7 @@ A PipelineRun is created that embeds a Pipeline containing one Task with a matri --dry-run=client -o yaml | kubectl apply -f - ``` -3. Give the task needed permissions (edit to set namespace) +3. 
Give the task needed permissions ```shell envsubst '$NAMESPACE' < pipeline/roles.yaml | kubectl apply -f - ``` From d349a0177670213af76d782422d0d4ea4199227c Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 9 Oct 2025 13:29:59 -0400 Subject: [PATCH 24/44] remove hardcoded param Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/experiment-task.yaml | 2 +- tekton-poc/pipeline/roles.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml index 8108f7a5..3257f4a9 100644 --- a/tekton-poc/pipeline/experiment-task.yaml +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -449,7 +449,7 @@ spec: workload_template_file_name=$(echo ${workload_template_full_path} | rev | cut -d '/' -f 1 | rev | sed -e "s^\.yaml.in$^^g") echo "workload_template_file_name = $workload_template_file_name" ## - workload_output_file=${LLMDBENCH_CONTROL_WORK_DIR}/shared_prefix_synthetic_short.yaml + workload_output_file=${LLMDBENCH_CONTROL_WORK_DIR}/${workload_template_file_name}.yaml # workload_output_file=${LLMDBENCH_CONTROL_WORK_DIR}/$workload_template_type/$workload_template_file_name echo "workload_output_file = $workload_output_file" ## diff --git a/tekton-poc/pipeline/roles.yaml b/tekton-poc/pipeline/roles.yaml index b49148df..5f447233 100644 --- a/tekton-poc/pipeline/roles.yaml +++ b/tekton-poc/pipeline/roles.yaml @@ -47,7 +47,7 @@ rules: apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding metadata: - name: helm-installer-clusterrolebinding + name: helm-installer-crb-${NAMESPACE} roleRef: apiGroup: rbac.authorization.k8s.io kind: ClusterRole From 28fabf124176d1e95bfe5acd6160edb263939efb Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 9 Oct 2025 15:45:36 -0400 Subject: [PATCH 25/44] expose s3 config Signed-off-by: Michael Kalantar --- tekton-poc/README.md | 5 ++ tekton-poc/pipeline/experiment-task.yaml | 28 ++++---- .../pipeline/pipelinerun-matrix-subset.yaml | 68 +++++++++---------- tekton-poc/pipeline/pipelinerun-matrix.yaml | 45 +++++++++--- 4 files changed, 89 insertions(+), 57 deletions(-) diff --git a/tekton-poc/README.md b/tekton-poc/README.md index 8d4464c7..a449fa2b 100644 --- a/tekton-poc/README.md +++ b/tekton-poc/README.md @@ -178,3 +178,8 @@ The utility script `utility/transform-pr-parallel.py` can be used to transform a - be sure to set the namespace parameter in the pipeline run; this is where the pipeline runs and is the base of the name for each experiment - the upload of data is not yet implemented - there are hardcoded assumptions/values about the use case in several places; these will be removed as more use cases are explored + + +# Issues + +- document set up s3 keys diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml index 3257f4a9..df3cbf4c 100644 --- a/tekton-poc/pipeline/experiment-task.yaml +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -14,6 +14,8 @@ spec: type: string - name: output_len type: string + - name: gaiePluginConfig + type: string - name: namespace type: string @@ -118,15 +120,13 @@ spec: type: string default: experiment - - name: bucket + - name: s3-keys type: string - default: "cloud-object-storage-cos-standard-ere" - - name: prefix + default: "s3-keys" + - name: s3-bucket type: string - default: "results" - - name: endpoint + - name: s3-endpoint type: string - default: "https://s3.us-east.cloud-object-storage.appdomain.cloud" - name: dry-run type: string @@ -137,7 +137,10 @@ spec: image: 
alpine:3.20 script: | #!/bin/sh - echo "🔄 Starting sweep step ..." + echo "🔄 Starting sweep step for ..." + echo " gaiePluginConfig = $(params.gaiePluginConfig)" + echo " question_len = $(params.question_len)" + echo " output_len = $(params.output_len)" - name: prepare-namespace image: quay.io/openshift/origin-cli:4.21 @@ -480,12 +483,12 @@ spec: - name: AWS_ACCESS_KEY_ID valueFrom: secretKeyRef: - name: ibm-cos-secret + name: $(params.s3-keys) key: AWS_ACCESS_KEY_ID - name: AWS_SECRET_ACCESS_KEY valueFrom: secretKeyRef: - name: ibm-cos-secret + name: $(params.s3-keys) key: AWS_SECRET_ACCESS_KEY - name: AWS_EC2_METADATA_DISABLED value: "true" @@ -499,10 +502,11 @@ spec: EXPERIMENT_RESULTS_FOLDER="$(params.harnessName)_${EXPERIMENT_ID}_$(context.taskRun.name)" ARCHIVE_NAME="${EXPERIMENT_RESULTS_FOLDER}.tar.gz" - tar -czf ${ARCHIVE_NAME} -C "$(workspaces.data.path)" ${EXPERIMENT_RESULTS_FOLDER} + tar -czf ${ARCHIVE_NAME} \ + -C "$(workspaces.data.path)" ${EXPERIMENT_RESULTS_FOLDER} - aws s3 cp ${ARCHIVE_NAME} "s3://$(params.bucket)/${ARCHIVE_NAME}" \ - --endpoint-url "$(params.endpoint)" \ + aws s3 cp ${ARCHIVE_NAME} "s3://$(params.s3-bucket)/${ARCHIVE_NAME}" \ + --endpoint-url "$(params.s3-endpoint)" \ --content-type "application/x-tar" \ --content-encoding "gzip" \ --no-progress diff --git a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml index 244d8c72..3a10d432 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml @@ -9,6 +9,24 @@ spec: - name: data persistentVolumeClaim: claimName: workspace-pvc + params: + - name: namespace + value: kalantar + - name: model-id + value: "Qwen/Qwen3-0.6B" + + # Harness / Workload + - name: harnessProfile + value: shared_prefix_synthetic_short.yaml + + # Output Location + - name: s3-keys + value: ibm-cos-secret + - name: s3-bucket + value: "cloud-object-storage-cos-standard-ere" + - name: s3-endpoint + value: "https://s3.us-east.cloud-object-storage.appdomain.cloud" + pipelineSpec: workspaces: - name: data @@ -21,31 +39,25 @@ spec: workspace: data params: - name: namespace - value: kalantar + value: $(params.namespace) - name: model-id - value: "Qwen/Qwen3-0.6B" + value: $(params.model-id) - name: experimentBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + + - name: s3-keys + value: $(params.s3-keys) + - name: s3-bucket + value: $(params.s3-bucket) + - name: s3-endpoint + value: $(params.s3-endpoint) + - name: harnessProfile - value: shared_prefix_synthetic.yaml + value: $(params.harnessProfile) + + - name: pipelineUID + value: "$(context.pipelineRun.uid)" matrix: - params: - - name: gaiePluginConfig - value: - - "inf-sche-none.yaml" - - "inf-sche-prefix.yaml" - - "inf-sche-kv.yaml" - - "inf-sche-queue.yaml" - - name: question_len - value: - - "100" - - "300" - - "1000" - - name: output_len - value: - - "100" - - "300" - - "1000" include: - name: combo-1 params: @@ -63,19 +75,3 @@ spec: value: "300" - name: output_len value: "300" - - name: combo-3 - params: - - name: gaiePluginConfig - value: "inf-sche-kv.yaml" - - name: question_len - value: "1000" - - name: output_len - value: "100" - - name: combo-4 - params: - - name: gaiePluginConfig - value: "inf-sche-queue.yaml" - - name: question_len - value: "300" - - name: output_len - value: "1000" \ No newline at end of file diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml 
b/tekton-poc/pipeline/pipelinerun-matrix.yaml index 73c6ef5f..34320b82 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -9,6 +9,24 @@ spec: - name: data persistentVolumeClaim: claimName: workspace-pvc + params: + - name: namespace + value: kalantar + - name: model-id + value: "Qwen/Qwen3-0.6B" + + # Harness / Workload + - name: harnessProfile + value: shared_prefix_synthetic_short.yaml + + # Output Location + - name: s3-keys + value: ibm-cos-secret + - name: s3-bucket + value: "cloud-object-storage-cos-standard-ere" + - name: s3-endpoint + value: "https://s3.us-east.cloud-object-storage.appdomain.cloud" + pipelineSpec: workspaces: - name: data @@ -21,30 +39,39 @@ spec: workspace: data params: - name: namespace - value: CHANGE_ME + value: $(params.namespace) - name: model-id - value: "Qwen/Qwen3-0.6B" + value: $(params.model-id) - name: experimentBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + + - name: s3-keys + value: $(params.s3-keys) + - name: s3-bucket + value: $(params.s3-bucket) + - name: s3-endpoint + value: $(params.s3-endpoint) + - name: harnessProfile - value: shared_prefix_synthetic_short.yaml + value: $(params.harnessProfile) + - name: pipelineUID value: "$(context.pipelineRun.uid)" matrix: params: - name: gaiePluginConfig value: - - "inf-sche-none.yaml" - - "inf-sche-prefix.yaml" - - "inf-sche-kv.yaml" + # - "inf-sche-none.yaml" + # - "inf-sche-prefix.yaml" + # - "inf-sche-kv.yaml" - "inf-sche-queue.yaml" - name: question_len value: - - "100" - - "300" + # - "100" + # - "300" - "1000" - name: output_len value: - - "100" + # - "100" - "300" - "1000" From ec1a648a2cfda33f0ff29c22d29d8417d8085b7c Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Thu, 9 Oct 2025 15:56:31 -0400 Subject: [PATCH 26/44] document s3 Signed-off-by: Michael Kalantar --- tekton-poc/README.md | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tekton-poc/README.md b/tekton-poc/README.md index a449fa2b..1f8ed491 100644 --- a/tekton-poc/README.md +++ b/tekton-poc/README.md @@ -44,6 +44,12 @@ A PipelineRun is created that embeds a Pipeline containing one Task with a matri ## Usage +### Requirements + +1. HF token +2. s3 bucket and necessary keys +3. + ### Setup 1. Create a namespace where the Tekton pipeline will execute. @@ -56,7 +62,7 @@ A PipelineRun is created that embeds a Pipeline containing one Task with a matri kubectl config set-context --current --namespace $NAMESPACE ``` -2. Deploy a secret `hf-secret` containing your HuggingFace token in the namespace. +2. Create a secret `hf-secret` containing your HuggingFace token in the namespace. ```shell kubectl create secret generic hf-secret \ --namespace ${NAMESPACE} \ @@ -64,12 +70,14 @@ A PipelineRun is created that embeds a Pipeline containing one Task with a matri --dry-run=client -o yaml | kubectl apply -f - ``` -3. Give the task needed permissions +3. Create a secret containing your s3 credentials `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. + +4. Give the task needed permissions ```shell envsubst '$NAMESPACE' < pipeline/roles.yaml | kubectl apply -f - ``` -4. Create a RWX PVC `workspace-pvc` for storing execution results. This PVC is shared between all tasks. For example: +5. Create a RWX PVC `workspace-pvc` for storing execution results. This PVC is shared between all tasks. 
For example: ```shell cat < Date: Fri, 10 Oct 2025 08:39:47 -0400 Subject: [PATCH 27/44] clarify use of namespace Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/experiment-task.yaml | 25 +++++++++---------- .../pipeline/pipelinerun-matrix-subset.yaml | 6 ++--- tekton-poc/pipeline/pipelinerun-matrix.yaml | 6 ++--- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml index df3cbf4c..d79ce3da 100644 --- a/tekton-poc/pipeline/experiment-task.yaml +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -17,10 +17,9 @@ spec: - name: gaiePluginConfig type: string - - name: namespace + - name: targetNamespacePrefix type: string - default: kalantar-llmd - description: Target namespace + default: llmdbench - name: model-id type: string @@ -147,7 +146,7 @@ spec: script: | #!/bin/sh - NAMESPACE="$(params.namespace)-$(context.taskRun.name)" + NAMESPACE="$(params.targetNamespacePrefix)-$(context.taskRun.name)" DRY_RUN="$(params.dry-run)" if [ "${DRY_RUN}" = "true" ]; then @@ -193,7 +192,7 @@ spec: - name: chart value: /tmp/llm-d-benchmark/charts/model-download - name: namespace - value: $(params.namespace)-$(context.taskRun.name) + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) - name: timeout value: 15m # - name: valuesYamlUrl @@ -229,7 +228,7 @@ spec: value: https://llm-d-incubation.github.io/llm-d-infra/ - name: namespace - value: $(params.namespace)-$(context.taskRun.name) + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) - name: timeout value: 15m - name: valuesYamlUrl @@ -250,7 +249,7 @@ spec: value: $(params.gaieChartVersion) - name: namespace - value: $(params.namespace)-$(context.taskRun.name) + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) - name: timeout value: 15m - name: valuesYamlUrl @@ -275,7 +274,7 @@ spec: value: https://llm-d-incubation.github.io/llm-d-modelservice/ - name: namespace - value: $(params.namespace)-$(context.taskRun.name) + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) - name: timeout value: 15m - name: valuesYamlUrl @@ -298,7 +297,7 @@ spec: echo ">> skipping" exit 0 fi - NAMESPACE="$(params.namespace)-$(context.taskRun.name)" + NAMESPACE="$(params.targetNamespacePrefix)-$(context.taskRun.name)" MODEL_ID="$(params.model-id)" MODEL_LABEL=$(echo "$MODEL_ID" | tr '[:upper:]' '[:lower:]' | sed 's/[./]/-/g') MODEL_START_TIMEOUT="$(params.modelWaitTimeout)" @@ -352,11 +351,11 @@ spec: - name: LLMDBENCH_HARNESS_NAME value: "$(params.harnessName)" - name: LLMDBENCH_HARNESS_NAMESPACE - value: "$(params.namespace)-$(context.taskRun.name)" + value: "$(params.targetNamespacePrefix)-$(context.taskRun.name)" - name: LLMDBENCH_HARNESS_STACK_TYPE value: "llm-d" - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL - value: "http://experiment-gateway-inference-gateway.$(params.namespace)-$(context.taskRun.name).svc.cluster.local:80" + value: "http://experiment-gateway-inference-gateway.$(params.targetNamespacePrefix)-$(context.taskRun.name).svc.cluster.local:80" - name: LLMDBENCH_DEPLOY_METHODS value: "modelservice" - name: LLMDBENCH_MAGIC_ENVAR @@ -521,7 +520,7 @@ spec: script : | #!/bin/sh - NAMESPACE="$(params.namespace)-$(context.taskRun.name)" + NAMESPACE="$(params.targetNamespacePrefix)-$(context.taskRun.name)" if [ "$(params.dry-run)" = "true" ]; then echo ">> skipping" @@ -530,7 +529,7 @@ spec: # kubectl delete namespace ${NAMESPACE} - echo "✅ workload namespace deleted" + echo "✅ workload namespace ${NAMESPACE} deleted" - name: 
log-completion image: alpine:3.20 diff --git a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml index 3a10d432..8700a029 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml @@ -10,7 +10,7 @@ spec: persistentVolumeClaim: claimName: workspace-pvc params: - - name: namespace + - name: targetNamespacePrefix value: kalantar - name: model-id value: "Qwen/Qwen3-0.6B" @@ -38,8 +38,8 @@ spec: - name: data workspace: data params: - - name: namespace - value: $(params.namespace) + - name: targetNamespacePrefix + value: $(params.targetNamespacePrefix) - name: model-id value: $(params.model-id) - name: experimentBaseUrl diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml index 34320b82..290d688e 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -10,7 +10,7 @@ spec: persistentVolumeClaim: claimName: workspace-pvc params: - - name: namespace + - name: targetNamespacePrefix value: kalantar - name: model-id value: "Qwen/Qwen3-0.6B" @@ -38,8 +38,8 @@ spec: - name: data workspace: data params: - - name: namespace - value: $(params.namespace) + - name: targetNamespacePrefix + value: $(params.targetNamespacePrefix) - name: model-id value: $(params.model-id) - name: experimentBaseUrl From afb5655226f33d1882a67f9da8fb539509dfbe6c Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Fri, 10 Oct 2025 11:53:12 -0400 Subject: [PATCH 28/44] change image for s3 upload Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/experiment-task.yaml | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml index d79ce3da..b6720374 100644 --- a/tekton-poc/pipeline/experiment-task.yaml +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -476,7 +476,12 @@ spec: llm-d-benchmark.sh - name: upload-results - image: amazon/aws-cli:2.31.9 + image: ubuntu:24.04 + # Tried amazon/aws-cli:2.31.9 but latest tar available via dnf install tar -u is 1.34. + # Had errors "file changed as we read it". It may be caused by the way tar identifes + # file changes in v 1.34 (ctime). Recommended solution to move to 1.35. See https://stackoverflow.com/a/77765876. + # and tar release notes https://lists.gnu.org/archive/html/info-gnu/2023-07/msg00005.html) + # A smaller image is probably desirable. A restriction is that AWS CLI v2 requires glibc. 
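+      # (Untested alternative, noted for reference: GNU tar accepts --warning=no-file-changed to
+      # silence that warning, but tar 1.34 may still exit non-zero when a file changes mid-read,
+      # so an image with tar >= 1.35 remains the safer fix.)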
workingDir: $(workspaces.data.path) env: - name: AWS_ACCESS_KEY_ID @@ -493,14 +498,24 @@ spec: value: "true" script: | #!/usr/bin/env sh - set -euo pipefail - dnf install tar gzip -y + apt-get update && \ + apt-get install -y --no-install-recommends ca-certificates curl unzip tar gzip && \ + rm -rf /var/lib/apt/lists/* + + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscliv2.zip && \ + unzip /tmp/awscliv2.zip -d /tmp && \ + /tmp/aws/install && \ + rm -rf /tmp/aws /tmp/awscliv2.zip + + tar --version && gzip --version && aws --version EXPERIMENT_ID="experiment-$(echo -n $(params.pipelineUID) | cut -c1-8)" EXPERIMENT_RESULTS_FOLDER="$(params.harnessName)_${EXPERIMENT_ID}_$(context.taskRun.name)" ARCHIVE_NAME="${EXPERIMENT_RESULTS_FOLDER}.tar.gz" + tar --version && gzip --version && aws --version + tar -czf ${ARCHIVE_NAME} \ -C "$(workspaces.data.path)" ${EXPERIMENT_RESULTS_FOLDER} From 78de7de8b68e5536c0f3260781ca2397879583a7 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Fri, 10 Oct 2025 12:14:01 -0400 Subject: [PATCH 29/44] prevent using kalantar ns Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/pipelinerun-matrix-subset.yaml | 3 ++- tekton-poc/pipeline/pipelinerun-matrix.yaml | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml index 8700a029..9d102b9e 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml @@ -11,7 +11,8 @@ spec: claimName: workspace-pvc params: - name: targetNamespacePrefix - value: kalantar + # This can be anything. + value: $(params.targetNamespacePrefix) - name: model-id value: "Qwen/Qwen3-0.6B" diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml index 290d688e..0e3af8b2 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -11,7 +11,8 @@ spec: claimName: workspace-pvc params: - name: targetNamespacePrefix - value: kalantar + # This can be anything. + value: $(params.targetNamespacePrefix) - name: model-id value: "Qwen/Qwen3-0.6B" From 17bee4b6c5bf0062e28ad2bc11b4f20e5f8c3cf9 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Fri, 10 Oct 2025 12:16:32 -0400 Subject: [PATCH 30/44] prevent using kalantar ns Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/pipelinerun-matrix-subset.yaml | 2 +- tekton-poc/pipeline/pipelinerun-matrix.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml index 9d102b9e..aa6e715c 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml @@ -12,7 +12,7 @@ spec: params: - name: targetNamespacePrefix # This can be anything. - value: $(params.targetNamespacePrefix) + value: $(context.pipelineRun.namespace) - name: model-id value: "Qwen/Qwen3-0.6B" diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml index 0e3af8b2..6db84cf4 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -12,7 +12,7 @@ spec: params: - name: targetNamespacePrefix # This can be anything. 
- value: $(params.targetNamespacePrefix) + value: $(context.pipelineRun.namespace) - name: model-id value: "Qwen/Qwen3-0.6B" From f918279b46f956efa2d6983569e013342300ab19 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Fri, 10 Oct 2025 17:10:54 -0400 Subject: [PATCH 31/44] delete experiment namespaces Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/experiment-task.yaml | 22 +++++++++++-------- .../pipeline/pipelinerun-matrix-subset.yaml | 6 +++++ tekton-poc/pipeline/pipelinerun-matrix.yaml | 6 +++++ 3 files changed, 25 insertions(+), 9 deletions(-) diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml index b6720374..e16bf6d1 100644 --- a/tekton-poc/pipeline/experiment-task.yaml +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -127,6 +127,9 @@ spec: - name: s3-endpoint type: string + - name: debug + type: string + default: "false" - name: dry-run type: string default: "false" @@ -477,10 +480,10 @@ spec: - name: upload-results image: ubuntu:24.04 - # Tried amazon/aws-cli:2.31.9 but latest tar available via dnf install tar -u is 1.34. - # Had errors "file changed as we read it". It may be caused by the way tar identifes - # file changes in v 1.34 (ctime). Recommended solution to move to 1.35. See https://stackoverflow.com/a/77765876. - # and tar release notes https://lists.gnu.org/archive/html/info-gnu/2023-07/msg00005.html) + # Tried amazon/aws-cli:2.31.9 but latest tar available via `dnf install tar -y` is 1.34. + # There were sporadic errors "file changed as we read it". It may be caused by the way + # tar identifes file changes in v 1.34 (via ctime). A recommended solution to move to 1.35. + # See https://stackoverflow.com/a/77765876 and tar release notes https://lists.gnu.org/archive/html/info-gnu/2023-07/msg00005.html) # A smaller image is probably desirable. A restriction is that AWS CLI v2 requires glibc. 
workingDir: $(workspaces.data.path) env: @@ -531,19 +534,20 @@ spec: echo "✅ Uploaded results to ${ARCHIVE_NAME}" - name: delete-namespace - image: alpine/helm:3.14.0 + image: alpine/kubectl:1.34.1 script : | #!/bin/sh NAMESPACE="$(params.targetNamespacePrefix)-$(context.taskRun.name)" + DEBUG="$(params.debug)" - if [ "$(params.dry-run)" = "true" ]; then - echo ">> skipping" + if [ "$(params.debug)" = "true" ]; then + echo "âš ī¸ DEBUG=true; leaving namespace ${NAMESPACE} for inspection" + echo "âš ī¸ Manually clean up resources with \"kubectl delete namespace ${NAMESPACE}\"" exit 0 fi - # kubectl delete namespace ${NAMESPACE} - + kubectl delete namespace ${NAMESPACE} echo "✅ workload namespace ${NAMESPACE} deleted" - name: log-completion diff --git a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml index aa6e715c..d374a269 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml @@ -28,6 +28,10 @@ spec: - name: s3-endpoint value: "https://s3.us-east.cloud-object-storage.appdomain.cloud" + # Control + - name: debug + value: false + pipelineSpec: workspaces: - name: data @@ -56,6 +60,8 @@ spec: - name: harnessProfile value: $(params.harnessProfile) + - name: debug + value: "$(params.debug)" - name: pipelineUID value: "$(context.pipelineRun.uid)" matrix: diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml index 6db84cf4..64aa21ad 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -28,6 +28,10 @@ spec: - name: s3-endpoint value: "https://s3.us-east.cloud-object-storage.appdomain.cloud" + # Control + - name: debug + value: false + pipelineSpec: workspaces: - name: data @@ -56,6 +60,8 @@ spec: - name: harnessProfile value: $(params.harnessProfile) + - name: debug + value: "$(params.debug)" - name: pipelineUID value: "$(context.pipelineRun.uid)" matrix: From c5a5e3845d16a7e5670bd8ea6c31c927cec11ec6 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Tue, 14 Oct 2025 13:59:03 -0400 Subject: [PATCH 32/44] pd yaml Signed-off-by: Michael Kalantar --- .../pd-disaggregation/gaie-values.yaml | 36 +++ .../pd-disaggregation/gateway-values.yaml | 8 + .../examples/pd-disaggregation/ms-values.yaml | 264 ++++++++++++++++++ 3 files changed, 308 insertions(+) create mode 100644 tekton-poc/examples/pd-disaggregation/gaie-values.yaml create mode 100644 tekton-poc/examples/pd-disaggregation/gateway-values.yaml create mode 100644 tekton-poc/examples/pd-disaggregation/ms-values.yaml diff --git a/tekton-poc/examples/pd-disaggregation/gaie-values.yaml b/tekton-poc/examples/pd-disaggregation/gaie-values.yaml new file mode 100644 index 00000000..2b039b76 --- /dev/null +++ b/tekton-poc/examples/pd-disaggregation/gaie-values.yaml @@ -0,0 +1,36 @@ +inferenceExtension: + replicas: 1 + image: + name: llm-d-inference-scheduler + hub: ghcr.io/llm-d + tag: v0.2.1 + pullPolicy: Always + extProcPort: 9002 + extraContainerPorts: + - name: zmq + containerPort: 5557 + protocol: TCP + extraServicePorts: + - name: zmq + port: 5557 + targetPort: 5557 + protocol: TCP + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token + key: HF_TOKEN + pluginsConfigFile: "plugins-v2.yaml" + +inferencePool: + targetPortNumber: 8000 + modelServerType: vllm + apiVersion: "inference.networking.x-k8s.io/v1alpha2" + modelServers: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: 
meta-lla-1b4505f6-instruct +provider: + name: none + diff --git a/tekton-poc/examples/pd-disaggregation/gateway-values.yaml b/tekton-poc/examples/pd-disaggregation/gateway-values.yaml new file mode 100644 index 00000000..b22f8140 --- /dev/null +++ b/tekton-poc/examples/pd-disaggregation/gateway-values.yaml @@ -0,0 +1,8 @@ +gateway: + gatewayClassName: kgateway + service: + type: NodePort + destinationRule: + host: gaie-inference-scheduling-epp.kalantar-is.svc.cluster.local + gatewayParameters: + enabled: true diff --git a/tekton-poc/examples/pd-disaggregation/ms-values.yaml b/tekton-poc/examples/pd-disaggregation/ms-values.yaml new file mode 100644 index 00000000..cb81e79a --- /dev/null +++ b/tekton-poc/examples/pd-disaggregation/ms-values.yaml @@ -0,0 +1,264 @@ +fullnameOverride: meta-lla-1b4505f6-instruct +multinode: false + +modelArtifacts: + uri: pvc://model-pvc/models/meta-llama/Llama-3.1-8B-Instruct + size: 300Gi + authSecretName: "hf-secret" + name: meta-llama/Llama-3.1-8B-Instruct + +routing: + servicePort: 8000 + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: infra-nam-release-inference-gateway + proxy: + image: "ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0" + secure: false + connector: nixlv2 + debugLevel: 3 + inferenceModel: + create: true + inferencePool: + create: false + name: meta-lla-1b4505f6-instruct-gaie + httpRoute: + create: true + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: meta-lla-1b4505f6-instruct-gaie + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + matches: + - path: + type: PathPrefix + value: /meta-llama-llama-3-1-8b-instruct/ + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: meta-lla-1b4505f6-instruct-gaie + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + + epp: + create: false + +decode: + create: true + replicas: 3 + acceleratorTypes: + labelKey: nvidia.com/gpu.product + labelValues: + - NVIDIA-H100-80GB-HBM3 + parallelism: + data: 1 + tensor: 4 + annotations: + deployed-by: nick + modelservice: llm-d-benchmark + podAnnotations: + deployed-by: nick + modelservice: llm-d-benchmark + k8s.v1.cni.cncf.io/networks: multi-nic-compute + #no____config + containers: + - name: "vllm" + mountModelVolume: true + image: "ghcr.io/llm-d/llm-d:v0.2.0" + modelCommand: vllmServe + + args: + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16000" + - "--tensor-parallel-size" + - "4" + env: + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: "rc,sm,cuda_ipc,cuda_copy,tcp" + - name: UCX_SOCKADDR_TLS_PRIORITY + value: "tcp" + - name: UCX_NET_DEVICES + value: mlx5_1:1 + - name: NCCL_IB_HCA + value: mlx5_1 + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "1" + resources: + limits: + memory: 128Gi + cpu: "32" + + nvidia.com/gpu: "4" + rdma/roce_gdr: "1" + requests: + memory: 128Gi + cpu: "32" + + nvidia.com/gpu: "4" + rdma/roce_gdr: "1" + extraConfig: + startupProbe: + httpGet: + path: /health + port: 8200 + 
failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: 8200 + failureThreshold: 3 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: 8200 + failureThreshold: 3 + periodSeconds: 5 + #no____config + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 16Gi + +prefill: + create: true + replicas: 1 + acceleratorTypes: + labelKey: nvidia.com/gpu.product + labelValues: + - NVIDIA-H100-80GB-HBM3 + parallelism: + data: 1 + tensor: 4 + annotations: + deployed-by: nick + modelservice: llm-d-benchmark + podAnnotations: + deployed-by: nick + modelservice: llm-d-benchmark + k8s.v1.cni.cncf.io/networks: multi-nic-compute + #no____config + containers: + - name: "vllm" + mountModelVolume: true + image: "ghcr.io/llm-d/llm-d:v0.2.0" + modelCommand: vllmServe + + args: + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16000" + - "--tensor-parallel-size" + - "4" + env: + - name: VLLM_IS_PREFILL + value: "1" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: "rc,sm,cuda_ipc,cuda_copy,tcp" + - name: UCX_SOCKADDR_TLS_PRIORITY + value: "tcp" + - name: UCX_NET_DEVICES + value: mlx5_1:1 + - name: NCCL_IB_HCA + value: mlx5_1 + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "1" + resources: + limits: + memory: 128Gi + cpu: "32" + + nvidia.com/gpu: "4" + rdma/roce_gdr: "1" + requests: + memory: 128Gi + cpu: "32" + + nvidia.com/gpu: "4" + rdma/roce_gdr: "1" + extraConfig: + startupProbe: + httpGet: + path: /health + port: 8000 + failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: 8000 + failureThreshold: 3 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: 8000 + failureThreshold: 3 + periodSeconds: 5 + #no____config + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 16Gi + From 5f4e42bb5070fef65cbf9460bc3189ecfc0d42eb Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Tue, 14 Oct 2025 14:26:50 -0400 Subject: [PATCH 33/44] update secret name Signed-off-by: Michael Kalantar --- tekton-poc/examples/pd-disaggregation/gaie-values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tekton-poc/examples/pd-disaggregation/gaie-values.yaml b/tekton-poc/examples/pd-disaggregation/gaie-values.yaml index 2b039b76..fe75e584 100644 --- a/tekton-poc/examples/pd-disaggregation/gaie-values.yaml +++ b/tekton-poc/examples/pd-disaggregation/gaie-values.yaml @@ -19,7 +19,7 @@ inferenceExtension: - name: HF_TOKEN valueFrom: secretKeyRef: - name: hf-token + name: hf-secret key: HF_TOKEN pluginsConfigFile: "plugins-v2.yaml" From c3754c7ce20d5a4070cc41e250ff266afc17eca9 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Tue, 14 Oct 2025 14:41:19 -0400 Subject: [PATCH 34/44] update fullnameoverride Signed-off-by: Michael Kalantar --- tekton-poc/examples/pd-disaggregation/ms-values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tekton-poc/examples/pd-disaggregation/ms-values.yaml 
b/tekton-poc/examples/pd-disaggregation/ms-values.yaml index cb81e79a..c5e725ff 100644 --- a/tekton-poc/examples/pd-disaggregation/ms-values.yaml +++ b/tekton-poc/examples/pd-disaggregation/ms-values.yaml @@ -1,4 +1,4 @@ -fullnameOverride: meta-lla-1b4505f6-instruct +fullnameOverride: meta-llama-llama-3-1-8b-instruct multinode: false modelArtifacts: From 3fcbb206e6d150329ed78d1abcc80b27cfd40851 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Tue, 14 Oct 2025 16:13:20 -0400 Subject: [PATCH 35/44] fix tensor Signed-off-by: Michael Kalantar --- tekton-poc/examples/pd-disaggregation/ms-values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tekton-poc/examples/pd-disaggregation/ms-values.yaml b/tekton-poc/examples/pd-disaggregation/ms-values.yaml index c5e725ff..387f7cc6 100644 --- a/tekton-poc/examples/pd-disaggregation/ms-values.yaml +++ b/tekton-poc/examples/pd-disaggregation/ms-values.yaml @@ -92,7 +92,7 @@ decode: - "--max-model-len" - "16000" - "--tensor-parallel-size" - - "4" + - "1" env: - name: VLLM_NIXL_SIDE_CHANNEL_HOST valueFrom: @@ -193,7 +193,7 @@ prefill: - "--max-model-len" - "16000" - "--tensor-parallel-size" - - "4" + - "1" env: - name: VLLM_IS_PREFILL value: "1" From b09aa28aa3c89be1d4b69070735f239fe079eb46 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Tue, 14 Oct 2025 16:42:33 -0400 Subject: [PATCH 36/44] gateway name Signed-off-by: Michael Kalantar --- tekton-poc/examples/pd-disaggregation/ms-values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tekton-poc/examples/pd-disaggregation/ms-values.yaml b/tekton-poc/examples/pd-disaggregation/ms-values.yaml index 387f7cc6..6f7cdce1 100644 --- a/tekton-poc/examples/pd-disaggregation/ms-values.yaml +++ b/tekton-poc/examples/pd-disaggregation/ms-values.yaml @@ -12,7 +12,7 @@ routing: parentRefs: - group: gateway.networking.k8s.io kind: Gateway - name: infra-nam-release-inference-gateway + name: experiment-gateway-inference-gateway proxy: image: "ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0" secure: false From b60f4472abaf1e8f098ea33c9f18d9364ceb2399 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Tue, 14 Oct 2025 17:15:04 -0400 Subject: [PATCH 37/44] desitation name Signed-off-by: Michael Kalantar --- tekton-poc/examples/pd-disaggregation/gateway-values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tekton-poc/examples/pd-disaggregation/gateway-values.yaml b/tekton-poc/examples/pd-disaggregation/gateway-values.yaml index b22f8140..3f836050 100644 --- a/tekton-poc/examples/pd-disaggregation/gateway-values.yaml +++ b/tekton-poc/examples/pd-disaggregation/gateway-values.yaml @@ -3,6 +3,6 @@ gateway: service: type: NodePort destinationRule: - host: gaie-inference-scheduling-epp.kalantar-is.svc.cluster.local + host: experiment-gaie-685a862b-epp.kalantar-is.svc.cluster.local gatewayParameters: enabled: true From a3407aed99bb1ef3a1b8d55e36386cb30ab38ac9 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Tue, 14 Oct 2025 17:29:49 -0400 Subject: [PATCH 38/44] label Signed-off-by: Michael Kalantar --- tekton-poc/examples/pd-disaggregation/gaie-values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tekton-poc/examples/pd-disaggregation/gaie-values.yaml b/tekton-poc/examples/pd-disaggregation/gaie-values.yaml index fe75e584..b860acef 100644 --- a/tekton-poc/examples/pd-disaggregation/gaie-values.yaml +++ b/tekton-poc/examples/pd-disaggregation/gaie-values.yaml @@ -30,7 +30,7 @@ inferencePool: modelServers: matchLabels: 
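+      # The llm-d.ai/model value below must match the label carried by the serving pods;
+      # in these examples it tracks the fullnameOverride set in ms-values.yaml.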
llm-d.ai/inferenceServing: "true" - llm-d.ai/model: meta-lla-1b4505f6-instruct + llm-d.ai/model: meta-llama-llama-3-1-8b-instruct provider: name: none From 2af0868d5f04d407ebdd4de4f35e36eac1b9b59a Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Tue, 14 Oct 2025 18:07:58 -0400 Subject: [PATCH 39/44] progress towards pd scenario Signed-off-by: Michael Kalantar --- tekton-poc/README.md | 21 +- tekton-poc/pipeline/experiment-task.yaml | 338 +++++++-------- tekton-poc/pipeline/pd-disaggregation-pr.yaml | 150 +++++++ .../pipeline/pipelinerun-matrix-subset.yaml | 62 ++- tekton-poc/pipeline/steps/inference-perf.yaml | 366 ++++++++++++++++ .../pipeline/{ => steps}/stepactions.yaml | 15 + tekton-poc/pipeline/steps/treatment.yaml | 117 ++++++ tekton-poc/pipeline/steps/vllm-benchmark.yaml | 392 ++++++++++++++++++ 8 files changed, 1253 insertions(+), 208 deletions(-) create mode 100644 tekton-poc/pipeline/pd-disaggregation-pr.yaml create mode 100644 tekton-poc/pipeline/steps/inference-perf.yaml rename tekton-poc/pipeline/{ => steps}/stepactions.yaml (96%) create mode 100644 tekton-poc/pipeline/steps/treatment.yaml create mode 100644 tekton-poc/pipeline/steps/vllm-benchmark.yaml diff --git a/tekton-poc/README.md b/tekton-poc/README.md index 1f8ed491..a1f5b8fc 100644 --- a/tekton-poc/README.md +++ b/tekton-poc/README.md @@ -155,7 +155,7 @@ tkn pr delete -f The default PipelineSpec (in `pipeline/pipelinerun-matrix.yaml`) executes all the tasks in parallel. It can be modified in a number of ways to reduce the amount of parallel execution (at the expense of time). -Some examples are provided: +Some examples are provided (**Note** examples need to be updated): - `pipeline/pipelinerun-matrix-subset.yaml`: Uses `matrix.include` to list an explicit set of combinations to execute. - `pipeline/pipelinerun-sequential-1.yaml`: Executes 1 task at a time. Each task depends on the previous one. @@ -194,6 +194,23 @@ The utility script `utility/transform-pr-parallel.py` can be used to transform a - there are hardcoded assumptions/values about the use case in several places; these will be removed as more use cases are explored -# Issues +# To Do +- modify script to handle unroll better +- modify script to handle unroll and n together +- single experiment namespace (possibly different from tekton ns) +- use more stepActions +- incorporate memory planner (Jing) +- PD example (Nick) + - [IN PROGRESS] deployment of the pd scenario + - [DONE] enabling multiple harnesses (inference-perf and vllm-benchmark) + - [DONE] making factors/treatments general (they are hardcoded) + - [NOT STARTED] use capacity planner to determine whether or not to continue + - [IN PROGRESS] move step implementations to stepactions + - [NOT STARTED] move from multiple namespaces to single namespace + +- can we have just one prepare-profile now that we have treatments? +- should we have a convert step independent of the analysis step? +- eventually one for analysis based on analysis of converted results +- need to wait for model download diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml index e16bf6d1..0bd8ea84 100644 --- a/tekton-poc/pipeline/experiment-task.yaml +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -10,12 +10,15 @@ spec: - name: data params: - - name: question_len + - name: factorMapping type: string - - name: output_len - type: string - - name: gaiePluginConfig + description: | + JSON string mapping factor to path in source yaml file sorted by purpose. 
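+        Illustrative shape only (the exact schema is whatever the analyze-treatment step expects), e.g.:
+          {"gaie": {"gaiePluginConfig": "inferenceExtension.pluginsConfigFile"},
+           "workload": {"question_len": "question_len"}}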
+ - name: treatment type: string + description: | + JSON string of factors and values for one treatment. + Includes both infrastructure and workload factors. - name: targetNamespacePrefix type: string @@ -134,15 +137,73 @@ spec: type: string default: "false" + results: + - name: treatmentAnalysisModelservice + value: $(steps.analyze-modelservice-factors.results.treatmentAnalysis) + - name: treatmentAnalysisGaie + value: $(steps.analyze-gaie-factors.results.treatmentAnalysis) + - name: treatmentAnalysisWorkload + value: $(steps.analyze-workload-factors.results.treatmentAnalysis) + steps: - name: log-start image: alpine:3.20 script: | #!/bin/sh echo "🔄 Starting sweep step for ..." - echo " gaiePluginConfig = $(params.gaiePluginConfig)" - echo " question_len = $(params.question_len)" - echo " output_len = $(params.output_len)" + printf "%s" "$(params.treatment)" + + - name: analyze-modelservice-factors + ref: + name: analyze-treatment + params: + - name: factorType + value: modelservice + - name: factorMapping + value: $(params.factorMapping) + - name: treatment + value: $(params.treatment) + + - name: analyze-gaie-factors + ref: + name: analyze-treatment + params: + - name: factorType + value: gaie + - name: factorMapping + value: $(params.factorMapping) + - name: treatment + value: $(params.treatment) + + - name: analyze-workload-factors + ref: + name: analyze-treatment + params: + - name: factorType + value: workload + - name: factorMapping + value: $(params.factorMapping) + - name: treatment + value: $(params.treatment) + + - name: display-treatment-analysis + image: alpine:3.20 + env: + - name: MODELSERVICE_SET_ARGS + value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" + - name: GAIE_SET_ARGS + value: "$(steps.analyze-gaie-factors.results.treatmentAnalysis)" + - name: WORKLOAD_SET_ARGS + value: "$(steps.analyze-workload-factors.results.treatmentAnalysis)" + + script: | + #!/bin/sh + apk add --no-cache jq yq-go >/dev/null + jq --version + + echo "helm upgrade --install ... $(echo ${MODELSERVICE_SET_ARGS} | jq '.setArgs')" + echo "helm upgrade --install ... 
$(echo ${GAIE_SET_ARGS} | jq '.setArgs')" + echo "$(echo ${WORKLOAD_SET_ARGS} | jq '.updates')" - name: prepare-namespace image: quay.io/openshift/origin-cli:4.21 @@ -257,8 +318,8 @@ spec: value: 15m - name: valuesYamlUrl value: "$(params.experimentBaseUrl)/gaie-values.yaml" - - name: extraArgs - value: "--set inferenceExtension.pluginsConfigFile=$(params.gaiePluginConfig)" + - name: treatmentAnalysis + value: "$(steps.analyze-gaie-factors.results.treatmentAnalysis)" - name: dry-run value: $(params.dry-run) @@ -287,6 +348,8 @@ spec: --set routing.inferencePool.name=$(params.experimentName)-gaie-NAMESPACE_HASH --set routing.httpRoute.rules[0].backendRefs[0].name=$(params.experimentName)-gaie-NAMESPACE_HASH --set routing.httpRoute.rules[1].backendRefs[0].name=$(params.experimentName)-gaie-NAMESPACE_HASH + - name: treatmentAnalysis + value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" - name: dry-run value: $(params.dry-run) @@ -338,67 +401,33 @@ spec: # --timeout=${MODEL_START_TIMEOUT}s # echo "✅ prefill pods serving model ${MODEL_ID} ready" - - name: workload - image: $(params.llmdbenchImageRegistry)/$(params.llmdbenchImageRepo)/$(params.llmdbenchImageName):$(params.llmdbenchImageTag) - env: - - name: LLMDBENCH_RUN_EXPERIMENT_LAUNCHER - value: "1" - - name: LLMDBENCH_RUN_EXPERIMENT_ANALYZE_LOCALLY - value: "0" - - name: LLMDBENCH_RUN_EXPERIMENT_HARNESS - value: "$(params.harnessName)-llm-d-benchmark.sh" - - name: LLMDBENCH_RUN_EXPERIMENT_ANALYZER - value: "$(params.harnessName)-analyze_results.sh" - - name: LLMDBENCH_RUN_EXPERIMENT_HARNESS_WORKLOAD_NAME - value: "$(params.harnessProfile)" - - name: LLMDBENCH_HARNESS_NAME - value: "$(params.harnessName)" - - name: LLMDBENCH_HARNESS_NAMESPACE - value: "$(params.targetNamespacePrefix)-$(context.taskRun.name)" - - name: LLMDBENCH_HARNESS_STACK_TYPE - value: "llm-d" - - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL - value: "http://experiment-gateway-inference-gateway.$(params.targetNamespacePrefix)-$(context.taskRun.name).svc.cluster.local:80" - - name: LLMDBENCH_DEPLOY_METHODS - value: "modelservice" - - name: LLMDBENCH_MAGIC_ENVAR - value: "harness_pod" - - - name: LLMDBENCH_LLMD_IMAGE_REGISTRY - value: "$(params.llmdbenchImageRegistry)" - - name: LLMDBENCH_LLMD_IMAGE_REPO - value: "$(params.llmdbenchImageRepo)" - - name: LLMDBENCH_LLMD_IMAGE_NAME - value: "$(params.llmdbenchImageName)" - - name: LLMDBENCH_LLMD_IMAGE_TAG - value: "$(params.llmdbenchImageTag)" - - # TBD add_env_vars_to_pod $LLMDBENCH_CONTROL_ENV_VAR_LIST_TO_POD - - name: LLMDBENCH_DEPLOY_CURRENT_MODEL - value: "$(params.model-id)" - - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS - value: "0" - - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS - value: "2" - - name: LLMDBENCH_VLLM_COMMON_AFFINITY - value: "nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3" - - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM - value: "4" - - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM - value: "1" - - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM - value: "1" - - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM - value: "1" - - - name: HF_TOKEN_SECRET - value: "hf-secret" - - name: HUGGING_FACE_HUB_TOKEN - valueFrom: - secretKeyRef: - name: hf-secret - key: HF_TOKEN + - name: inference-perf-prepare-profile + ref: + name: inference-perf-prepare-profile + params: + - name: harnessName + value: $(params.harnessName) + - name: harnessProfile + value: $(params.harnessProfile) + - name: treatmentAnalysis + value: 
$(steps.analyze-workload-factors.results.treatmentAnalysis) + - name: model-id + value: $(params.model-id) + - name: namespace + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) + - name: pipelineUID + value: $(params.pipelineUID) + - name: inference-perf-run + ref: + name: inference-perf-run + params: + - name: harnessName + value: $(params.harnessName) + - name: harnessProfile + value: $(params.harnessProfile) + - name: pipelineUID + value: $(params.pipelineUID) computeResources: requests: memory: "32Gi" @@ -407,131 +436,58 @@ spec: memory: "32Gi" cpu: "16" - script: | - #!/bin/bash - - export EXPERIMENT_ID="experiment-$(echo -n $(params.pipelineUID) | cut -c1-8)" - export LLMDBENCH_RUN_EXPERIMENT_ID="${EXPERIMENT_ID}" - export LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR="$(workspaces.data.path)/$(params.harnessName)_${EXPERIMENT_ID}_$(context.taskRun.name)" - export LLMDBENCH_CONTROL_WORK_DIR="$(workspaces.data.path)/$(params.harnessName)_${EXPERIMENT_ID}_$(context.taskRun.name)" - export LLMDBENCH_HARNESS_STACK_NAME=$(echo "$(params.model-id)" | tr '[:upper:]' '[:lower:]' | sed 's/[./]/-/g') - export LLMDBENCH_DEPLOY_CURRENT_MODELID="${LLMDBENCH_HARNESS_STACK_NAME}" - export LLMDBENCH_DEPLOY_CURRENT_TOKENIZER="$(params.model-id)" - - export QUESTION_LEN=$(params.question_len) - export OUTPUT_LEN=$(params.output_len) - - get_profiles() { - git init llm-d-benchmark - cd llm-d-benchmark - git remote add origin https://github.com/llm-d/llm-d-benchmark.git - git config core.sparseCheckout true - echo "workload/profiles/" >> .git/info/sparse-checkout - git pull origin main - } - - if [ "$(params.dry-run)" = "true" ]; then - echo ">> skipping" - exit 0 - fi - - get_profiles - - echo "creating CONTROL directories" - mkdir -p ${LLMDBENCH_CONTROL_WORK_DIR}/setup - rm -f ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands - touch ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands - - workload=$(echo $(params.harnessProfile) | sed 's^\.yaml^^g' ) - echo "workload = $workload" - workload_template_list=$(find workload/profiles/ -name "${workload}.yaml.in") - echo "workload_template_list = $workload_template_list" - - for workload_template_full_path in $workload_template_list; do - echo "PROCESSING $workload_template_full_path" - workload_template_type=$(echo ${workload_template_full_path} | rev | cut -d '/' -f 2 | rev) - echo "workload_template_type = $workload_template_type" - workload_template_file_name=$(echo ${workload_template_full_path} | rev | cut -d '/' -f 1 | rev | sed -e "s^\.yaml.in$^^g") - echo "workload_template_file_name = $workload_template_file_name" - ## - workload_output_file=${LLMDBENCH_CONTROL_WORK_DIR}/${workload_template_file_name}.yaml - # workload_output_file=${LLMDBENCH_CONTROL_WORK_DIR}/$workload_template_type/$workload_template_file_name - echo "workload_output_file = $workload_output_file" - ## - mkdir -p ${LLMDBENCH_CONTROL_WORK_DIR}/$workload_template_type - - echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL^${LLMDBENCH_DEPLOY_CURRENT_MODEL}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands - echo "s^REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL^${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands - echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_TOKENIZER^${LLMDBENCH_DEPLOY_CURRENT_TOKENIZER}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands - echo "s^question_len: .*^question_len: ${QUESTION_LEN}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands - echo "s^output_len: .*^output_len: ${OUTPUT_LEN}^g" >> 
${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands - echo "s^ path: .*^ path: ${LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR}^g" >> ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands - - echo "------" - cat ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands - echo "------" - echo "workload_output_file=$workload_output_file" - sed -f ${LLMDBENCH_CONTROL_WORK_DIR}/setup/sed-commands $workload_template_full_path > $workload_output_file - - cat $workload_output_file - done - - llm-d-benchmark.sh - - - name: upload-results - image: ubuntu:24.04 - # Tried amazon/aws-cli:2.31.9 but latest tar available via `dnf install tar -y` is 1.34. - # There were sporadic errors "file changed as we read it". It may be caused by the way - # tar identifes file changes in v 1.34 (via ctime). A recommended solution to move to 1.35. - # See https://stackoverflow.com/a/77765876 and tar release notes https://lists.gnu.org/archive/html/info-gnu/2023-07/msg00005.html) - # A smaller image is probably desirable. A restriction is that AWS CLI v2 requires glibc. - workingDir: $(workspaces.data.path) - env: - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: $(params.s3-keys) - key: AWS_ACCESS_KEY_ID - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: $(params.s3-keys) - key: AWS_SECRET_ACCESS_KEY - - name: AWS_EC2_METADATA_DISABLED - value: "true" - script: | - #!/usr/bin/env sh - - apt-get update && \ - apt-get install -y --no-install-recommends ca-certificates curl unzip tar gzip && \ - rm -rf /var/lib/apt/lists/* - - curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscliv2.zip && \ - unzip /tmp/awscliv2.zip -d /tmp && \ - /tmp/aws/install && \ - rm -rf /tmp/aws /tmp/awscliv2.zip - - tar --version && gzip --version && aws --version - - EXPERIMENT_ID="experiment-$(echo -n $(params.pipelineUID) | cut -c1-8)" - EXPERIMENT_RESULTS_FOLDER="$(params.harnessName)_${EXPERIMENT_ID}_$(context.taskRun.name)" - ARCHIVE_NAME="${EXPERIMENT_RESULTS_FOLDER}.tar.gz" - - tar --version && gzip --version && aws --version + - name: inference-perf-analyze-results + ref: + name: inference-perf-analyze-results + params: + - name: harnessName + value: $(params.harnessName) + - name: pipelineUID + value: $(params.pipelineUID) - tar -czf ${ARCHIVE_NAME} \ - -C "$(workspaces.data.path)" ${EXPERIMENT_RESULTS_FOLDER} + - name: vllm-benchmark-prepare-profile + ref: + name: vllm-benchmark-prepare-profile + params: + - name: harnessName + value: $(params.harnessName) + - name: harnessProfile + value: $(params.harnessProfile) + - name: treatmentAnalysis + value: $(steps.analyze-workload-factors.results.treatmentAnalysis) + - name: model-id + value: $(params.model-id) + - name: namespace + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) + - name: pipelineUID + value: $(params.pipelineUID) - aws s3 cp ${ARCHIVE_NAME} "s3://$(params.s3-bucket)/${ARCHIVE_NAME}" \ - --endpoint-url "$(params.s3-endpoint)" \ - --content-type "application/x-tar" \ - --content-encoding "gzip" \ - --no-progress - # --recursive \ + - name: vllm-benchmark-run + ref: + name: vllm-benchmark-run + params: + - name: harnessName + value: $(params.harnessName) + - name: harnessProfile + value: $(params.harnessProfile) + - name: pipelineUID + value: $(params.pipelineUID) + computeResources: + requests: + memory: "32Gi" + cpu: "16" + limits: + memory: "32Gi" + cpu: "16" - rm -rf ${ARCHIVE_NAME} - - echo "✅ Uploaded results to ${ARCHIVE_NAME}" + - name: vllm-benchmark-analyze-results + ref: + name: vllm-benchmark-analyze-results + 
params: + - name: harnessName + value: $(params.harnessName) + - name: pipelineUID + value: $(params.pipelineUID) - name: delete-namespace image: alpine/kubectl:1.34.1 diff --git a/tekton-poc/pipeline/pd-disaggregation-pr.yaml b/tekton-poc/pipeline/pd-disaggregation-pr.yaml new file mode 100644 index 00000000..cc694459 --- /dev/null +++ b/tekton-poc/pipeline/pd-disaggregation-pr.yaml @@ -0,0 +1,150 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: pd +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + params: + - name: targetNamespacePrefix + # This can be anything. + value: $(context.pipelineRun.namespace) + - name: model-id + value: "meta-llama/Llama-3.1-8B-Instruct" + + # Harness / Workload + - name: harnessName + value: vllm-benchmark + - name: harnessProfile + value: random_concurrent.yaml + + # Output Location + - name: s3-keys + value: ibm-cos-secret + - name: s3-bucket + value: "cloud-object-storage-cos-standard-ere" + - name: s3-endpoint + value: "https://s3.us-east.cloud-object-storage.appdomain.cloud" + + # Control + - name: debug + value: true + + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: targetNamespacePrefix + value: $(params.targetNamespacePrefix) + - name: model-id + value: $(params.model-id) + - name: experimentBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/pd-disaggregation/ + + - name: s3-keys + value: $(params.s3-keys) + - name: s3-bucket + value: $(params.s3-bucket) + - name: s3-endpoint + value: $(params.s3-endpoint) + + - name: harnessName + value: $(params.harnessName) + - name: harnessProfile + value: $(params.harnessProfile) + + - name: factorMapping + value: | + { + "modelservice": { + "prefillReplicas": "prefill.replicas", + "prefillTensorParallelism": "prefill.parallelism.tensor", + "decodeReplicas": "decode.replicas", + "decodeTensorParallelism": "decode.parallelism.tensor" + }, + "gaie": { + "gaiePluginConfig": "inferenceExtension.pluginsConfigFile" + }, + "workload": { + "max-concurrency": "max-concurrency", + "num_prompts": "num-prompts", + "question_len": "data.shared_prefix.question_len", + "output_len": "data.shared_prefix.output_len" + } + } + + - name: debug + value: "$(params.debug)" + - name: pipelineUID + value: "$(context.pipelineRun.uid)" + + matrix: + include: + - name: combo-0 + params: + - name: treatment + value: | + { + "prefillReplicas": 1, + "prefillTensorParallelism": 1, + "decodeReplicas": 1, + "decodeTensorParallelism": 1, + "max-concurrency": 1, + "num-prompts": 10 + } + # - name: combo-1 + # params: + # - name: treatment + # value: | + # { + # "prefillReplicas": 1, + # "prefillTensorParallelism": 2, + # "decodeReplicas": 1, + # "decodeTensorParallelism": 1, + # "max-concurrency": 1, + # "num-prompts": 10 + # } + + # params: + # - name: max-concurrency + # value: + # - "1" + # # - "8" + # # - "32" + # # - "64" + # # - "128" + # # - "256" + # - name: num-prompts + # value: + # - "10" + # # - "80" + # # - "320" + # # - "640" + # # - "1280" + # # - "2560" + +# LLMDBENCH_VLLM_COMMON_REPLICAS: "2,4" +# decode.replicas +# LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM: "8" +# decode.parallelism.tensor + +# LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS: "2,4,6,8" +# prefill.replicas +# LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM: 
"1,2" +# prefill.parallelism.tensor +# LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS: "1,2,4" +# decode.replicas +# LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM: "2,4,8" +# decodeTensorParallelism +# decode.parallelism.tensor \ No newline at end of file diff --git a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml index d374a269..11c73b85 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml @@ -17,6 +17,8 @@ spec: value: "Qwen/Qwen3-0.6B" # Harness / Workload + - name: harnessName + value: inference-perf - name: harnessProfile value: shared_prefix_synthetic_short.yaml @@ -30,7 +32,7 @@ spec: # Control - name: debug - value: false + value: true pipelineSpec: workspaces: @@ -57,28 +59,58 @@ spec: - name: s3-endpoint value: $(params.s3-endpoint) + - name: harnessName + value: $(params.harnessName) - name: harnessProfile value: $(params.harnessProfile) + - name: factorMapping + value: | + { + "modelservice": { + "prefillReplicas": "prefill.replicas", + "prefillTensorParallelism": "prefill.parallelism.tensor", + "decodeReplicas": "decode.replicas", + "decodeTensorParallelism": "decode.parallelism.tensor" + }, + "gaie": { + "gaiePluginConfig": "inferenceExtension.pluginsConfigFile" + }, + "workload": { + "max-concurrency": "max-concurrency", + "num_prompts": "num-prompts", + "question_len": "data.shared_prefix.question_len", + "output_len": "data.shared_prefix.output_len" + } + } + + - name: max-concurrency + value: "1" + - name: num-prompts + value: "10" + - name: debug value: "$(params.debug)" - name: pipelineUID value: "$(context.pipelineRun.uid)" matrix: include: - - name: combo-1 + - name: combo-0 params: - - name: gaiePluginConfig - value: "inf-sche-none.yaml" - - name: question_len - value: "100" - - name: output_len - value: "100" - - name: combo-2 + - name: treatment + value: | + { + "gaiePluginConfig": "inf-sche-queue.yaml", + "question_len": 100, + "output_len": 100 + } + - name: combo-1 params: - - name: gaiePluginConfig - value: "inf-sche-prefix.yaml" - - name: question_len - value: "300" - - name: output_len - value: "300" + - name: treatment + value: | + { + "gaiePluginConfig": "inf-sche-prefix.yaml", + "question_len": 300, + "output_len": 300 + } + diff --git a/tekton-poc/pipeline/steps/inference-perf.yaml b/tekton-poc/pipeline/steps/inference-perf.yaml new file mode 100644 index 00000000..50f91936 --- /dev/null +++ b/tekton-poc/pipeline/steps/inference-perf.yaml @@ -0,0 +1,366 @@ +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: inference-perf-prepare-profile +spec: + params: + - name: harnessName + - name: harnessProfile + - name: model-id + - name: namespace + - name: treatmentAnalysis + - name: pipelineUID + env: + - name: REQUESTED_HARNESS_NAME + value: "$(params.harnessName)" + - name: MY_HARNESS_NAME + value: "inference-perf" + - name: HARNESS_PROFILE + value: "$(params.harnessProfile)" + + - name: TREATMENT_ANALYSIS + value: "$(params.treatmentAnalysis)" + + - name: LLMDBENCH_DEPLOY_CURRENT_MODEL + value: "$(params.model-id)" + - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL + value: "http://experiment-gateway-inference-gateway.$(params.namespace).svc.cluster.local:80" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + 
image: python:3.12.9-slim-bookworm + script: | + #!/bin/bash + + if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then + echo "Requested harness not ${MY_HARNESS_NAME}, skipping" + exit 0 + fi + + # TBD is this necessary or is it already there? + apt-get update + apt-get install -y --no-install-recommends curl ca-certificates jq + curl -fsSL https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 \ + -o /usr/local/bin/yq + chmod +x /usr/local/bin/yq + jq --version + yq --version + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L48-L54 + # https://github.com/llm-d/llm-d-benchmark/blob/main/setup/run.sh + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + CONTROL_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + RUN_DIR=$(pwd) + + echo "🔄 Installing required tools" + apt-get update + apt-get install -y \ + wget \ + && apt-get clean && rm -rf /var/cache/apt + + # Ensure all folders created + mkdir -p $RESULTS_DIR + mkdir -p $CONTROL_DIR/setup + rm -rf $CONTROL_DIR/setup/sed-commands + touch $CONTROL_DIR/setup/sed-commands + mkdir -p ${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/templates + + cd ${RUN_DIR}/vllm-benchmark/ + + # Define constants: input profile template name and location; final profile name and location + workload=$(echo ${HARNESS_PROFILE} | sed 's^\.yaml^^g' ) + workload_template=${workload}.yaml.in + workload_template_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/templates/${workload_template} + workload_profile=${workload}.yaml + workload_profile_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/${workload_profile} + + echo "🔄 Prepare workload profile" + # Fetch profile template from llmd-benchmark + wget -O ${workload_template_path} \ + --quiet \ + https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/workload/profiles/${MY_HARNESS_NAME}/${workload_template} + + # Apply treatment to profile template to produce final profile + echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL^${LLMDBENCH_DEPLOY_CURRENT_MODEL}^g" >> ${CONTROL_DIR}/setup/sed-commands + echo "s^REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL^${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL}^g" >> ${CONTROL_DIR}/setup/sed-commands + echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_TOKENIZER^${LLMDBENCH_DEPLOY_CURRENT_TOKENIZER}^g" >> ${CONTROL_DIR}/setup/sed-commands + echo "---------- sed-commands" + cat ${CONTROL_DIR}/setup/sed-commands + echo "----------" + sed -f ${CONTROL_DIR}/setup/sed-commands ${workload_template_path} > ${workload_profile_path} + + # TBD eliminate the TARGET_FILE env variable + TARGET_FILE=${workload_profile_path} + echo "${TREATMENT_ANALYSIS}" | jq '.updates' > /tmp/updates.json + echo ">>> /tmp/updates.json" + cat /tmp/updates.json + + if [ ! -f "$TARGET_FILE" ]; then + echo "ERROR: File not found: $TARGET_FILE" >&2 + exit 1 + fi + + # Apply updates to JSON or YAML + if [ "$(jq 'length' /tmp/updates.json)" -gt 0 ]; then + ext="${TARGET_FILE##*.}" + tmp="${TARGET_FILE}.tmp" + + # TBD eliminate the json path (copilot generated this); profiles are yaml files + if [ "$ext" = "json" ]; then + jq --slurpfile upds /tmp/updates.json ' + reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) + ' "$TARGET_FILE" > "$tmp" + mv "$tmp" "$TARGET_FILE" + else + # YAML path: YAML → JSON → apply → YAML + yq -o=json '.' 
"$TARGET_FILE" \ + | jq --slurpfile upds /tmp/updates.json ' + reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) + ' \ + | yq -P > "$tmp" + mv "$tmp" "$TARGET_FILE" + fi + fi + + echo "---------- workload profile" + cat ${workload_profile_path} + echo "----------" + echo "✅ workload profile ready" +--- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: inference-perf-run +spec: + params: + - name: harnessName + - name: harnessProfile + - name: pipelineUID + env: + - name: REQUESTED_HARNESS_NAME + value: "$(params.harnessName)" + - name: MY_HARNESS_NAME + value: "inference-perf" + - name: HARNESS_PROFILE + value: "$(params.harnessProfile)" + + - name: GIT_REPO_URL + value: "https://github.com/kubernetes-sigs/inference-perf.git" + - name: GIT_REVISION + value: "main" + - name: GIT_COMMIT + value: "1ccc48b6bb9c9abb61558b719041fb000b265e59" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/usr/bin/env bash + + # https://github.com/llm-d/llm-d-benchmark/blob/main/workload/harnesses/inference-perf-llm-d-benchmark.sh + + if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then + echo "Requested harness not ${MY_HARNESS_NAME}, skipping" + exit 0 + fi + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L56-L62 + # https://github.com/llm-d/llm-d-benchmark/blob/main/setup/run.sh + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + CONTROL_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + RUN_DIR=$(pwd) + + # TODO figure out which are actually needed for each step + echo "🔄 Installing required tools" + apt-get update + apt-get install -y \ + git \ + pip \ + yq \ + && apt-get clean && rm -rf /var/cache/apt + + echo "🔄 Cloning and installing harness: ${MY_HARNESS_NAME}" + git clone --branch ${GIT_REVISION} ${GIT_REPO_URL} + cd inference-perf + git checkout ${GIT_COMMIT} + pip install . + + # profile name and location + workload=$(echo ${HARNESS_PROFILE} | sed 's^\.yaml^^g' ) + workload_profile=${workload}.yaml + workload_profile_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/${workload_profile} + + # update .storage.local_storage.path in profile + pushd "$RESULTS_DIR" + yq '.storage["local_storage"]["path"] = '\"${RESULTS_DIR}\" <"${workload_profile_path}" -y >${workload_profile} + + # run inference-perf + inference-perf --config_file "$(realpath ./${workload_profile})" > >(tee -a ${RESULTS_DIR}/stdout.log) 2> >(tee -a ${RESULTS_DIR}/stderr.log >&2) + export LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC=$? + + # If benchmark harness returned with an error, exit here + if [[ $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC -ne 0 ]]; then + echo "❌ Harness returned with error $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC" + exit $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC + fi + echo "✅ Harness completed successfully." 
+--- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: inference-perf-analyze-results +spec: + params: + - name: harnessName + - name: pipelineUID + env: + - name: REQUESTED_HARNESS_NAME + value: "$(params.harnessName)" + - name: MY_HARNESS_NAME + value: "inference-perf" + + - name: GIT_REPO_URL + value: "https://github.com/kubernetes-sigs/inference-perf.git" + - name: GIT_REVISION + value: "main" + - name: GIT_COMMIT + value: "1ccc48b6bb9c9abb61558b719041fb000b265e59" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + +# https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/usr/bin/env bash + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + + if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then + echo "Requested harness not ${MY_HARNESS_NAME}, skipping" + exit 0 + fi + + echo "🔄 Installing requirements" + apt-get update + apt-get install -y \ + git \ + pip \ + && apt-get clean && rm -rf /var/cache/apt + + git clone --branch ${GIT_REVISION} ${GIT_REPO_URL} + cd inference-perf + git checkout ${GIT_COMMIT} + pip install . + + cat < requirements-analysis.txt + matplotlib>=3.7.0 + numpy>=2.3.1 + seaborn>=0.12.0 + pandas>=2.2.3 + pydantic>=2.11.7 + PyYAML>=6.0.2 + scipy>=1.16.0 + requests>=2.32.5 + EOF + + cat requirements-analysis.txt + pip --version + pip install --no-cache-dir \ + --disable-pip-version-check \ + --upgrade \ + -r ./requirements-analysis.txt \ + --root-user-action=ignore + pip list + + # Download covert python from llm-d-benchmark + # TBD: should the python be embedded in the step? A separate step perhaps. + export ROOT_DIR=workload/report + export BRANCH=main + + cat < >(tee -a $RESULTS_DIR/stderr.log >&2) + # Report errors but don't quit + export RUN_EXPERIMENT_CONVERT_RC=$? + if [[ $RUN_EXPERIMENT_CONVERT_RC -ne 0 ]]; then + echo "./convert.py returned with error $RUN_EXPERIMENT_CONVERT_RC converting: $result" + fi + done + + # Define function to call analysis so can call multiple times + # https://github.com/llm-d/llm-d-benchmark/blob/main/analysis/inference-perf-analyze_results.sh + analyze_results () { + mkdir -p $RESULTS_DIR/analysis + sleep 60 + tm=$(date) + inference-perf --analyze "$RESULTS_DIR" + ec=$? + find $RESULTS_DIR -type f -newermt "${tm}" -exec mv -t "$RESULTS_DIR"/analysis {} + + return $ec + } + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/llm-d-benchmark.sh#L63-L74 + echo "🔄 Running analysis" + # Try to run analysis twice then give up + analyze_results + ec=$? 
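+      # NOTE: ec keeps the exit code of the first attempt; the retry below is
+      # best-effort and does not update it (see the exit at the end of this script).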
+ if [[ $ec -ne 0 ]]; then + echo "execution of analyzer failed, wating 120 seconds and trying again" + sleep 120 + set -x + analyze_results + fi + # Return with error code of first iteration of experiment analyzer + # TBD modify this message depending on success + echo "✅ Results analyzed and reports generated" + exit $ec diff --git a/tekton-poc/pipeline/stepactions.yaml b/tekton-poc/pipeline/steps/stepactions.yaml similarity index 96% rename from tekton-poc/pipeline/stepactions.yaml rename to tekton-poc/pipeline/steps/stepactions.yaml index 60d28385..282bfbff 100644 --- a/tekton-poc/pipeline/stepactions.yaml +++ b/tekton-poc/pipeline/steps/stepactions.yaml @@ -165,6 +165,9 @@ spec: - name: extraArgs type: string default: "" + - name: treatmentAnalysis + type: string + default: "" - name: dry-run type: string @@ -224,6 +227,9 @@ spec: - name: HELM_EXTRA_ARGS value: "$(params.extraArgs)" + - name: TREATMENT_ANALYSIS + value: "$(params.treatmentAnalysis)" + - name: DRY_RUN value: $(params.dry-run) @@ -236,6 +242,11 @@ spec: exit 0 fi + apk add --no-cache jq >/dev/null + + echo ">>> helm step: treatment" + printf "%s" "${TREATMENT_ANALYSIS}" + SHA256CMD=$(type -p gsha256sum || type -p sha256sum) NAMESPACE_HASH=$(echo -n "$HELM_NAMESPACE" | $SHA256CMD | awk '{print $1}' | cut -c1-8) HELM_RELEASE=$(echo "$HELM_RELEASE" | sed "s/NAMESPACE_HASH/$NAMESPACE_HASH/g") @@ -303,6 +314,10 @@ spec: *) if [ -n "${HELM_REPO_NAME:-}" ]; then CHART_REF="${HELM_REPO_NAME}/${HELM_CHART}"; fi ;; esac + if [ -n "${TREATMENT_ANALYSIS:-}" ]; then + HELM_EXTRA_ARGS="${HELM_EXTRA_ARGS} $(echo ${TREATMENT_ANALYSIS} | jq -r '.setArgs')" + fi + if [ -n "${HELM_EXTRA_ARGS:-}" ]; then HELM_EXTRA_ARGS=$(echo "$HELM_EXTRA_ARGS" | sed "s/NAMESPACE_HASH/$NAMESPACE_HASH/g") fi diff --git a/tekton-poc/pipeline/steps/treatment.yaml b/tekton-poc/pipeline/steps/treatment.yaml new file mode 100644 index 00000000..b371836f --- /dev/null +++ b/tekton-poc/pipeline/steps/treatment.yaml @@ -0,0 +1,117 @@ +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: analyze-treatment +spec: + description: | + Produce '--set/--set-string path=value' flags for factorType and + apply values into a JSON/YAML file. Works with flat or nested treatment. + image: alpine:3.20 + # Pass params via env (StepAction scripts cannot use $(params.*) directly). + # We'll read these envs inside the script. 
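+  # Illustrative example (values mirror the sample PipelineRuns; not authoritative):
+  #   factorType    = gaie
+  #   factorMapping = {"gaie": {"gaiePluginConfig": "inferenceExtension.pluginsConfigFile"}}
+  #   treatment     = {"gaiePluginConfig": "inf-sche-queue.yaml", "question_len": 100}
+  # would produce a treatmentAnalysis result roughly like:
+  #   {"updates": [{"path": ["inferenceExtension", "pluginsConfigFile"], "value": "inf-sche-queue.yaml"}],
+  #    "setArgs": "--set-string inferenceExtension.pluginsConfigFile=inf-sche-queue.yaml"}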
+ params: + - name: factorType + type: string + - name: factorMapping + type: string + description: JSON mapping + - name: treatment + type: string + description: JSON values (flat or nested by key) + # - name: file + # type: string + # description: Target file path (relative to workdir or absolute) + # - name: workdir + # type: string + # description: Working directory (usually the bound workspace path) + # default: /workspace + results: + - name: treatmentAnalysis + description: Space-separated '--set/--set-string path=value' tokens + # workingDir: $(params.workdir) + env: + - name: SELECTOR + value: $(params.factorType) + - name: MAP_JSON + value: $(params.factorMapping) + - name: VAL_JSON + value: $(params.treatment) + # - name: TARGET_FILE + # value: $(params.file) + script: | + #!/bin/sh + set -eu + apk add --no-cache jq yq >/dev/null + # jq --version + # yq --version + + # echo "$SELECTOR" + # echo "$MAP_JSON" + # echo "$VAL_JSON" + + # Build updates + flags (uses $val for type checks — fixed version) + jq -r -n \ + --arg root "$SELECTOR" \ + --argjson map "$MAP_JSON" \ + --argjson vals "$VAL_JSON" ' + ($map[$root] // {}) as $m + | if ($m | type) != "object" then + error("Key not found in mapping: " + $root) + else + (if ($vals[$root] | type) == "object" then $vals[$root] else $vals end) as $v + | { + updates: [ + $m | to_entries[] + | select($v[.key] != null) + | { path: (.value | split(".")), value: $v[.key] } + ], + setArgs: ( + [ $m | to_entries[] + | select($v[.key] != null) + | ( $v[.key] ) as $val + | if ( ($val | type) == "string" ) then + "--set-string \(.value)=\($val)" + else + "--set \(.value)=\( if ( ($val|type)=="object" or ($val|type)=="array") then ($val|tojson) else ($val|tostring) end )" + end + ] | join(" ") + ) + } + end + ' > /tmp/out.json + + # FLAGS=$(jq -r '.setArgs' /tmp/out.json) + # jq '.updates' /tmp/out.json > /tmp/updates.json + + # if [ ! -f "$TARGET_FILE" ]; then + # echo "ERROR: File not found: $TARGET_FILE" >&2 + # # still write empty result + # printf "" > "$(step.results.treatmentAnalysis.path)" + # exit 1 + # fi + + # # Apply updates to JSON or YAML + # if [ "$(jq 'length' /tmp/updates.json)" -gt 0 ]; then + # ext="${TARGET_FILE##*.}" + # tmp="${TARGET_FILE}.tmp" + + # if [ "$ext" = "json" ]; then + # jq --slurpfile upds /tmp/updates.json ' + # reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) + # ' "$TARGET_FILE" > "$tmp" + # mv "$tmp" "$TARGET_FILE" + # else + # # YAML path: YAML → JSON → apply → YAML + # yq -o=json '.' "$TARGET_FILE" \ + # | jq --slurpfile upds /tmp/updates.json ' + # reduce $upds[0][] as $u (. 
; setpath($u.path; $u.value)) + # ' \ + # | yq -P > "$tmp" + # mv "$tmp" "$TARGET_FILE" + # fi + # fi + + # printf "%s" "$(cat /tmp/out.json)" + # Emit flags as a step-scoped result + # printf "%s" "$FLAGS" > "$(step.results.treatmentAnalysis.path)" + printf "%s" "$(cat /tmp/out.json)" > "$(step.results.treatmentAnalysis.path)" diff --git a/tekton-poc/pipeline/steps/vllm-benchmark.yaml b/tekton-poc/pipeline/steps/vllm-benchmark.yaml new file mode 100644 index 00000000..8daa7aeb --- /dev/null +++ b/tekton-poc/pipeline/steps/vllm-benchmark.yaml @@ -0,0 +1,392 @@ +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: vllm-benchmark-prepare-profile +spec: + params: + - name: harnessName + - name: harnessProfile + - name: model-id + - name: namespace + - name: treatmentAnalysis + - name: pipelineUID + env: + - name: REQUESTED_HARNESS_NAME + value: "$(params.harnessName)" + - name: MY_HARNESS_NAME + value: "vllm-benchmark" + - name: HARNESS_PROFILE + value: "$(params.harnessProfile)" + + - name: TREATMENT_ANALYSIS + value: "$(params.treatmentAnalysis)" + + - name: LLMDBENCH_DEPLOY_CURRENT_MODEL + value: "$(params.model-id)" + - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL + value: "http://experiment-gateway-inference-gateway.$(params.namespace).svc.cluster.local:80" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/bin/bash + + if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then + echo "Requested harness not ${MY_HARNESS_NAME}, skipping" + exit 0 + fi + + # TBD is this necessary or is it already there? 
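+      # (The python:*-slim-bookworm base image does not appear to ship curl, jq, or yq,
+      # so the explicit install below is still needed.)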
+ apt-get update + apt-get install -y --no-install-recommends curl ca-certificates jq + curl -fsSL https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 \ + -o /usr/local/bin/yq + chmod +x /usr/local/bin/yq + jq --version + yq --version + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L56-L62 + # https://github.com/llm-d/llm-d-benchmark/blob/main/setup/run.sh + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + CONTROL_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + RUN_DIR=$(pwd) + + echo "🔄 Installing required tools" + apt-get update + apt-get install -y \ + wget \ + && apt-get clean && rm -rf /var/cache/apt + + # Ensure all folders created + mkdir -p $RESULTS_DIR + mkdir -p $CONTROL_DIR/setup + rm -rf $CONTROL_DIR/setup/sed-commands + touch $CONTROL_DIR/setup/sed-commands + mkdir -p ${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/templates + + cd ${RUN_DIR}/vllm-benchmark/ + + # Define constants: input profile template name and location; final profile name and location + workload=$(echo ${HARNESS_PROFILE} | sed 's^\.yaml^^g' ) + workload_template=${workload}.yaml.in + workload_template_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/templates/${workload_template} + workload_profile=${workload}.yaml + workload_profile_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/${workload_profile} + + echo "🔄 Prepare workload profile" + # Fetch profile template from llmd-benchmark + wget -O ${workload_template_path} \ + --quiet \ + https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/workload/profiles/${MY_HARNESS_NAME}/${workload_template} + + # Apply treatment to profile template to produce final profile + echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL^${LLMDBENCH_DEPLOY_CURRENT_MODEL}^g" >> ${CONTROL_DIR}/setup/sed-commands + echo "s^REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL^${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL}^g" >> ${CONTROL_DIR}/setup/sed-commands + echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_TOKENIZER^${LLMDBENCH_DEPLOY_CURRENT_TOKENIZER}^g" >> ${CONTROL_DIR}/setup/sed-commands + echo "---------- sed-commands" + cat ${CONTROL_DIR}/setup/sed-commands + echo "----------" + sed -f ${CONTROL_DIR}/setup/sed-commands ${workload_template_path} > ${workload_profile_path} + + # TBD eliminate the TARGET_FILE env variable + TARGET_FILE=${workload_profile_path} + echo "${TREATMENT_ANALYSIS}" | jq '.updates' > /tmp/updates.json + echo ">>> /tmp/updates.json" + cat /tmp/updates.json + + if [ ! -f "$TARGET_FILE" ]; then + echo "ERROR: File not found: $TARGET_FILE" >&2 + exit 1 + fi + + # Apply updates to JSON or YAML + if [ "$(jq 'length' /tmp/updates.json)" -gt 0 ]; then + ext="${TARGET_FILE##*.}" + tmp="${TARGET_FILE}.tmp" + + # TBD eliminate the json path (copilot generated this); profiles are yaml files + if [ "$ext" = "json" ]; then + jq --slurpfile upds /tmp/updates.json ' + reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) + ' "$TARGET_FILE" > "$tmp" + mv "$tmp" "$TARGET_FILE" + else + # YAML path: YAML → JSON → apply → YAML + yq -o=json '.' "$TARGET_FILE" \ + | jq --slurpfile upds /tmp/updates.json ' + reduce $upds[0][] as $u (. 
; setpath($u.path; $u.value)) + ' \ + | yq -P > "$tmp" + mv "$tmp" "$TARGET_FILE" + fi + fi + + echo "---------- workload profile" + cat ${workload_profile_path} + echo "----------" + echo "✅ workload profile ready" +--- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: vllm-benchmark-run +spec: + params: + - name: harnessName + - name: harnessProfile + - name: pipelineUID + env: + - name: REQUESTED_HARNESS_NAME + value: "$(params.harnessName)" + - name: MY_HARNESS_NAME + value: "vllm-benchmark" + - name: HARNESS_PROFILE + value: "$(params.harnessProfile)" + + - name: GIT_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: GIT_REVISION + value: "main" + - name: GIT_COMMIT + value: "b6381ced9c52271f799a8348fcc98c5f40528cdf" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/bin/bash + + # https://github.com/llm-d/llm-d-benchmark/blob/main/workload/harnesses/vllm-benchmark-llm-d-benchmark.sh + + if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then + echo "Requested harness not ${MY_HARNESS_NAME}, skipping" + exit 0 + fi + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L56-L62 + # https://github.com/llm-d/llm-d-benchmark/blob/main/setup/run.sh + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + CONTROL_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + RUN_DIR=$(pwd) + + # TODO figure out which are actually needed for each step + echo "🔄 Installing required tools" + apt-get update + apt-get install -y \ + git \ + gpg \ + pip \ + yq \ + && apt-get clean && rm -rf /var/cache/apt + + echo "🔄 Cloning and installing harness: ${MY_HARNESS_NAME}" + git clone --branch ${GIT_REVISION} ${GIT_REPO_URL} + cd vllm + git checkout ${GIT_COMMIT} + cd .. + mv -f vllm vllm-benchmark + + # TBD pin versions + cat < requirements-vllm-benchmark.txt + aiohttp + datasets + numpy + pandas + pillow + tqdm + transformers + EOF + + cat requirements-vllm-benchmark.txt + pip --version + pip install --no-cache-dir \ + --disable-pip-version-check \ + --upgrade \ + -r ./requirements-vllm-benchmark.txt \ + --root-user-action=ignore + pip list + + # profile name and location + workload=$(echo ${HARNESS_PROFILE} | sed 's^\.yaml^^g' ) + workload_profile=${workload}.yaml + workload_profile_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/${workload_profile} + + # run vllm-benchmark + cp ${workload_profile_path} ${workload_profile} + en=$(cat ${workload_profile} | yq -r .executable) + + echo "pwd = $(pwd)" + echo "RUN_DIR=$RUN_DIR" + echo "running - ${RUN_DIR}/vllm-benchmark/benchmarks/${en}" + ls -l ${RUN_DIR}/vllm-benchmark/benchmarks + python ${RUN_DIR}/vllm-benchmark/benchmarks/${en} --$(cat ${workload_profile} | grep -v "^executable" | yq -r 'to_entries | map("\(.key)=\(.value)") | join(" --")' | sed -e 's^=none ^^g' -e 's^=none$^^g') --seed $(date +%s) --save-result > >(tee -a $RESULTS_DIR/stdout.log) 2> >(tee -a $RESULTS_DIR/stderr.log >&2) + export LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC=$? 
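+      # Illustrative expansion of the invocation above (profile values are examples only):
+      #   a profile like {executable: benchmark_serving.py, model: <model-id>, num-prompts: 10}
+      #   becomes: python .../benchmarks/benchmark_serving.py --model=<model-id> --num-prompts=10 --seed <epoch> --save-result
+      #   (keys whose value is "none" are passed as bare flags by the sed substitution above)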
+ find ${RUN_DIR}/vllm-benchmark -maxdepth 1 -mindepth 1 -name '*.json' -exec mv -t "$RESULTS_DIR"/ {} + + + # If benchmark harness returned with an error, exit here + if [[ $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC -ne 0 ]]; then + echo "❌ Harness returned with error $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC" + exit $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC + fi + echo "✅ Harness completed successfully." +--- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: vllm-benchmark-analyze-results +spec: + params: + - name: harnessName + - name: pipelineUID + env: + - name: REQUESTED_HARNESS_NAME + value: "$(params.harnessName)" + - name: MY_HARNESS_NAME + value: "vllm-benchmark" + + - name: GIT_REPO_URL + value: "https://github.com/kubernetes-sigs/inference-perf.git" + - name: GIT_REVISION + value: "main" + - name: GIT_COMMIT + value: "1ccc48b6bb9c9abb61558b719041fb000b265e59" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + +# https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/usr/bin/env bash + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + + if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then + echo "Requested harness not ${MY_HARNESS_NAME}, skipping" + exit 0 + fi + + echo "🔄 Installing requirements" + # apt-get update + # apt-get install -y \ + # git \ + # pip \ + # && apt-get clean && rm -rf /var/cache/apt + + cat < requirements-analysis.txt + matplotlib>=3.7.0 + numpy>=2.3.1 + seaborn>=0.12.0 + pandas>=2.2.3 + pydantic>=2.11.7 + PyYAML>=6.0.2 + scipy>=1.16.0 + requests>=2.32.5 + EOF + + cat requirements-analysis.txt + pip --version + pip install --no-cache-dir \ + --disable-pip-version-check \ + --upgrade \ + -r ./requirements-analysis.txt \ + --root-user-action=ignore + pip list + + # Download covert python from llm-d-benchmark + # TBD: should the python be embedded in the step? A separate step perhaps. + export ROOT_DIR=workload/report + export BRANCH=main + + cat < >(tee -a $RESULTS_DIR/stderr.log >&2) + # Report errors but don't quit + export RUN_EXPERIMENT_CONVERT_RC=$? + if [[ $RUN_EXPERIMENT_CONVERT_RC -ne 0 ]]; then + echo "./convert.py returned with error $RUN_EXPERIMENT_CONVERT_RC converting: $result" + fi + done + + # Define function to call analysis so can call multiple times + # https://github.com/llm-d/llm-d-benchmark/blob/main/analysis/vllm-benchmark-analyze_results.sh + analyze_results () { + mkdir -p $RESULTS_DIR/analysis + result_start=$(grep -nr "Result ==" $RESULTS_DIR/stdout.log | cut -d ':' -f 1) + total_file_lenght=$(cat $RESULTS_DIR/stdout.log | wc -l) + cat $RESULTS_DIR/stdout.log | sed "$result_start,$total_file_lenght!d" > $RESULTS_DIR/analysis/summary.txt + return $? + } + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/llm-d-benchmark.sh#L63-L74 + echo "🔄 Running analysis" + # Try to run analysis twice then give up + analyze_results + ec=$? 
+ if [[ $ec -ne 0 ]]; then + echo "execution of analyzer failed, wating 120 seconds and trying again" + sleep 120 + set -x + analyze_results + fi + # Return with error code of first iteration of experiment analyzer + # TBD modify this message depending on success + echo "✅ Results analyzed and reports generated" + exit $ec + From e7301928f93d4c9beb08952889870b2bf7d3ff7e Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Wed, 15 Oct 2025 13:33:20 -0400 Subject: [PATCH 40/44] remove --tensor-parallel-size Signed-off-by: Michael Kalantar --- tekton-poc/examples/pd-disaggregation/ms-values.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tekton-poc/examples/pd-disaggregation/ms-values.yaml b/tekton-poc/examples/pd-disaggregation/ms-values.yaml index 6f7cdce1..7a5be879 100644 --- a/tekton-poc/examples/pd-disaggregation/ms-values.yaml +++ b/tekton-poc/examples/pd-disaggregation/ms-values.yaml @@ -91,8 +91,6 @@ decode: - "--disable-uvicorn-access-log" - "--max-model-len" - "16000" - - "--tensor-parallel-size" - - "1" env: - name: VLLM_NIXL_SIDE_CHANNEL_HOST valueFrom: @@ -192,8 +190,6 @@ prefill: - "--disable-uvicorn-access-log" - "--max-model-len" - "16000" - - "--tensor-parallel-size" - - "1" env: - name: VLLM_IS_PREFILL value: "1" From 3bacf14c9cc7ed02b674635b7fb3b2a466861351 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Fri, 17 Oct 2025 10:26:56 -0400 Subject: [PATCH 41/44] capacity planner Signed-off-by: Michael Kalantar --- tekton-poc/README.md | 12 +- tekton-poc/pipeline/experiment-task.yaml | 425 +++++++++++++++--- tekton-poc/pipeline/pd-disaggregation-pr.yaml | 110 ++++- .../pipeline/pipelinerun-matrix-subset.yaml | 4 + tekton-poc/pipeline/pipelinerun-matrix.yaml | 17 +- tekton-poc/pipeline/steps/inference-perf.yaml | 137 ------ tekton-poc/pipeline/steps/treatment.yaml | 54 +-- tekton-poc/pipeline/steps/vllm-benchmark.yaml | 137 ------ 8 files changed, 473 insertions(+), 423 deletions(-) diff --git a/tekton-poc/README.md b/tekton-poc/README.md index a1f5b8fc..44616393 100644 --- a/tekton-poc/README.md +++ b/tekton-poc/README.md @@ -204,13 +204,19 @@ The utility script `utility/transform-pr-parallel.py` can be used to transform a - incorporate memory planner (Jing) - PD example (Nick) - [IN PROGRESS] deployment of the pd scenario + - [DONE] wait for model download + - [NOT STARTED] move from helm chart (job) to step - depends on ns change + - [NOT STARTED] debug --tensor-parallel-size argument - [DONE] enabling multiple harnesses (inference-perf and vllm-benchmark) - [DONE] making factors/treatments general (they are hardcoded) - - [NOT STARTED] use capacity planner to determine whether or not to continue + - [DONE] use capacity planner to determine whether or not to continue - [IN PROGRESS] move step implementations to stepactions - [NOT STARTED] move from multiple namespaces to single namespace -- can we have just one prepare-profile now that we have treatments? - should we have a convert step independent of the analysis step? - eventually one for analysis based on analysis of converted results -- need to wait for model download + +- wrapper to generate pipelineRun +- generate task? 
+ +- missing steps: validate accelerator configuartion (wrt to cluster) \ No newline at end of file diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml index 0bd8ea84..a2b4840f 100644 --- a/tekton-poc/pipeline/experiment-task.yaml +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -10,16 +10,17 @@ spec: - name: data params: - - name: factorMapping - type: string - description: | - JSON string mapping factor to path in source yaml file sorted by purpose. - name: treatment type: string description: | JSON string of factors and values for one treatment. Includes both infrastructure and workload factors. + - name: factorMapping + type: string + description: | + JSON string mapping factor to path in source yaml file sorted by purpose. + - name: targetNamespacePrefix type: string default: llmdbench @@ -30,6 +31,27 @@ spec: - name: inferencePort default: 8000 + # Properties needed to evaluate stack capacity (will it be able to host the model)? + - name: validateCapacity + default: "true" + - name: behaviorOnValidationFailure + default: "terminate" + + - name: maxModelLength + + - name: decodeReplicas + - name: decodeTensorParallelism + - name: decodeDataParallelism + - name: decodeNumGpus + + - name: prefillReplicas + - name: prefillTensorParallelism + - name: prefillDataParallelism + - name: prefillNumGpus + + - name: gpuType + - name: gpuMemory + - name: experimentBaseUrl type: string - name: experimentName @@ -133,6 +155,9 @@ spec: - name: debug type: string default: "false" + - name: step-upload-results + type: string + default: "true" - name: dry-run type: string default: "false" @@ -186,24 +211,192 @@ spec: - name: treatment value: $(params.treatment) - - name: display-treatment-analysis - image: alpine:3.20 - env: - - name: MODELSERVICE_SET_ARGS - value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" - - name: GAIE_SET_ARGS - value: "$(steps.analyze-gaie-factors.results.treatmentAnalysis)" - - name: WORKLOAD_SET_ARGS - value: "$(steps.analyze-workload-factors.results.treatmentAnalysis)" + # - name: display-treatment-analysis + # image: alpine:3.20 + # env: + # - name: MODELSERVICE_SET_ARGS + # value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" + # - name: GAIE_SET_ARGS + # value: "$(steps.analyze-gaie-factors.results.treatmentAnalysis)" + # - name: WORKLOAD_SET_ARGS + # value: "$(steps.analyze-workload-factors.results.treatmentAnalysis)" - script: | - #!/bin/sh - apk add --no-cache jq yq-go >/dev/null - jq --version + # script: | + # #!/bin/sh + # apk add --no-cache jq yq-go >/dev/null + # jq --version - echo "helm upgrade --install ... $(echo ${MODELSERVICE_SET_ARGS} | jq '.setArgs')" - echo "helm upgrade --install ... $(echo ${GAIE_SET_ARGS} | jq '.setArgs')" - echo "$(echo ${WORKLOAD_SET_ARGS} | jq '.updates')" + # echo "helm upgrade --install ... $(echo ${MODELSERVICE_SET_ARGS} | jq '.setArgs')" + # echo "helm upgrade --install ... 
$(echo ${GAIE_SET_ARGS} | jq '.setArgs')" + # echo "$(echo ${WORKLOAD_SET_ARGS} | jq '.updates')" + + # printf "%s" "$MODELSERVICE_SET_ARGS" + + - name: compute-decode-dp + ref: + name: + compute-value + params: + - name: name + value: "decodeDataParallelism" + - name: value + value: $(params.decodeDataParallelism) + - name: defaultValue + value: 1 + + - name: compute-decode-tp + ref: + name: + compute-value + params: + - name: name + value: "decodeTensorParallelism" + - name: value + value: $(params.decodeTensorParallelism) + - name: defaultValue + value: 1 + + - name: compute-decode-replicas + ref: + name: + compute-value + params: + - name: name + value: "decodeReplicas" + - name: value + value: $(params.decodeReplicas) + - name: defaultValue + value: 1 + + - name: compute-decode-num-gpus + ref: + name: + compute-num-gpus + params: + - name: name + value: "decodeNumGpus" + - name: value + value: $(params.decodeNumGpus) + - name: dp + value: $(steps.compute-decode-dp.results.value) + - name: tp + value: $(steps.compute-decode-tp.results.value) + + # - name: display-decode-values + # image: alpine:3.20 + # env: + # - name: REPLICAS + # value: "$(steps.compute-decode-replicas.results.value)" + # - name: TP + # value: "$(steps.compute-decode-tp.results.value)" + # - name: DP + # value: "$(steps.compute-decode-dp.results.value)" + # - name: NUM_GPUS + # value: "$(steps.compute-decode-num-gpus.results.value)" + + # script: | + # #!/bin/sh + + # echo "decodeReplicas = ${REPLICAS}" + # echo "decodeTensorParallelism = ${TP}" + # echo "decodeDataParallelism = ${DP}" + # echo "decodeNumGpus = ${NUM_GPUS}" + + - name: compute-prefill-dp + ref: + name: + compute-value + params: + - name: name + value: "prefillDataParallelism" + - name: value + value: $(params.prefillDataParallelism) + - name: defaultValue + value: 1 + + - name: compute-prefill-tp + ref: + name: + compute-value + params: + - name: name + value: "prefillTensorParallelism" + - name: value + value: $(params.prefillTensorParallelism) + - name: defaultValue + value: 1 + + - name: compute-prefill-replicas + ref: + name: + compute-value + params: + - name: name + value: "prefillReplicas" + - name: value + value: $(params.prefillReplicas) + - name: defaultValue + value: 1 + + - name: compute-prefill-num-gpus + ref: + name: + compute-num-gpus + params: + - name: name + value: "prefillNumGpus" + - name: value + value: $(params.prefillNumGpus) + - name: dp + value: $(steps.compute-prefill-dp.results.value) + - name: tp + value: $(steps.compute-prefill-tp.results.value) + + - name: check-decode-capacity + ref: + name: check-capacity + params: + - name: validateCapacity + value: $(params.validateCapacity) + - name: behaviorOnValidationFailure + value: $(params.behaviorOnValidationFailure) + - name: model + value: $(params.model-id) + - name: max_model_len + value: $(params.maxModelLength) + - name: replicas + value: $(steps.compute-decode-replicas.results.value) + - name: tp + value: $(steps.compute-decode-tp.results.value) + - name: dp + value: $(steps.compute-decode-dp.results.value) + - name: gpu_memory + value: $(params.gpuMemory) + - name: user_requested_gpu_count + value: $(steps.compute-decode-num-gpus.results.value) + + - name: check-prefill-capacity + ref: + name: check-capacity + params: + - name: validateCapacity + value: $(params.validateCapacity) + - name: behaviorOnValidationFailure + value: $(params.behaviorOnValidationFailure) + - name: model + value: $(params.model-id) + - name: max_model_len + value: $(params.maxModelLength) + - 
name: replicas + value: $(steps.compute-prefill-replicas.results.value) + - name: tp + value: $(steps.compute-prefill-tp.results.value) + - name: dp + value: $(steps.compute-prefill-dp.results.value) + - name: gpu_memory + value: $(params.gpuMemory) + - name: user_requested_gpu_count + value: $(steps.compute-prefill-num-gpus.results.value) - name: prepare-namespace image: quay.io/openshift/origin-cli:4.21 @@ -238,6 +431,9 @@ spec: oc adm policy add-scc-to-user anyuid -z helm-installer -n ${NAMESPACE} # oc adm policy add-scc-to-user privileged -z helm-installer -n ${NAMESPACE} + # TBD when move from multiple NS to single NS then can move to + # step implementation instead of kubernetes job (replacing the next 2 steps) + # Can't do yet because step executes in a different NS from target. - name: model-download ref: name: helm-upgrade-install @@ -273,10 +469,45 @@ spec: value: $(params.dry-run) - name: wait-for-download - image: alpine:3.20 + image: alpine/kubectl:1.34.1 + env: + - name: JOB_NAME + value: "llm-d-benchark-job" + - name: NAMESPACE + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) + - name: TIMEOUT + value: "300" # seconds + - name: SLEEP_INTERVAL + value: "5" # seconds script : | - #!/bin/sh - echo "âŗ TBD: Wait for download job to complete" + #!/usr/bin/env sh + + echo "âŗ Wait for model to download" + + elapsed=0 + + while [ "$elapsed" -lt "${TIMEOUT}" ]; do + status=$(kubectl get job "${JOB_NAME}" -n "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}') + if [ "$status" = "True" ]; then + echo "✅ Job succeeded" + kubectl delete job "${JOB_NAME}" -n "${NAMESPACE}" --ignore-not-found + exit 0 + fi + + status=$(kubectl get job "${JOB_NAME}" -n "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}') + if [ "$status" = "True" ]; then + echo "❌ Job failed" + kubectl delete job "${JOB_NAME}" -n "${NAMESPACE}" --ignore-not-found + exit 1 + fi + + sleep "${SLEEP_INTERVAL}" + elapsed=$((elapsed + SLEEP_INTERVAL)) + done + + echo "❌ Timed out waiting for job to complete or fail" + kubectl delete job "${JOB_NAME}" -n "${NAMESPACE}" --ignore-not-found + exit 2 - name: gateway ref: @@ -355,6 +586,11 @@ spec: value: $(params.dry-run) - name: wait-for-model + env: + - name: DECODE_REPLICAS + value: $(steps.compute-decode-replicas.results.value) + - name: PREFILL_REPLICAS + value: $(steps.compute-prefill-replicas.results.value) image: alpine/kubectl:1.34.1 script: | #!/bin/sh @@ -371,39 +607,45 @@ spec: echo "âŗ Waiting for pods serving model ${MODEL_ID} to be 'Running'" echo "Model label = ${MODEL_LABEL}" - kubectl --namespace ${NAMESPACE} \ - wait pod \ - -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=decode \ - --for=create \ - --timeout=${MODEL_START_TIMEOUT}s - echo "✅ (decode) pods serving model ${MODEL_ID} created" + if [ ${DECODE_REPLICAS} -gt 0 ]; then + kubectl --namespace ${NAMESPACE} \ + wait pod \ + -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=decode \ + --for=create \ + --timeout=${MODEL_START_TIMEOUT}s + echo "✅ (decode) pods serving model ${MODEL_ID} created" + fi - # TBD check if any prefill pods and wait if so - # kubectl --namespace ${NAMESPACE} \ - # wait pod \ - # -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=prefill \ - # --for=create \ - # --timeout=${MODEL_START_TIMEOUT}s - # echo "✅ prefill pods serving model ${MODEL_ID} created" - - kubectl --namespace ${NAMESPACE} \ + if [ ${PREFILL_REPLICAS} -gt 0 ]; then + kubectl --namespace ${NAMESPACE} \ + wait pod \ + -l 
llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=prefill \ + --for=create \ + --timeout=${MODEL_START_TIMEOUT}s + echo "✅ prefill pods serving model ${MODEL_ID} created" + fi + + if [ ${DECODE_REPLICAS} -gt 0 ]; then + kubectl --namespace ${NAMESPACE} \ + wait pod \ + -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=decode \ + --for=condition=Ready=True \ + --timeout=${MODEL_START_TIMEOUT}s + echo "✅ (decode) pods serving model ${MODEL_ID} ready" + fi + + if [ ${PREFILL_REPLICAS} -gt 0 ]; then + kubectl --namespace ${NAMESPACE} \ wait pod \ - -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=decode \ - --for=condition=Ready=True \ - --timeout=${MODEL_START_TIMEOUT}s - echo "✅ (decode) pods serving model ${MODEL_ID} ready" - - # TBD check if any prefill pods and wait if so - # kubectl --namespace ${NAMESPACE} \ - # wait pod \ - # -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=prefill \ - # --for=condition=Ready=True \ - # --timeout=${MODEL_START_TIMEOUT}s - # echo "✅ prefill pods serving model ${MODEL_ID} ready" - - - name: inference-perf-prepare-profile + -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=prefill \ + --for=condition=Ready=True \ + --timeout=${MODEL_START_TIMEOUT}s + echo "✅ prefill pods serving model ${MODEL_ID} ready" + fi + + - name: prepare-workload-profile ref: - name: inference-perf-prepare-profile + name: prepare-workload-profile params: - name: harnessName value: $(params.harnessName) @@ -445,23 +687,6 @@ spec: - name: pipelineUID value: $(params.pipelineUID) - - name: vllm-benchmark-prepare-profile - ref: - name: vllm-benchmark-prepare-profile - params: - - name: harnessName - value: $(params.harnessName) - - name: harnessProfile - value: $(params.harnessProfile) - - name: treatmentAnalysis - value: $(steps.analyze-workload-factors.results.treatmentAnalysis) - - name: model-id - value: $(params.model-id) - - name: namespace - value: $(params.targetNamespacePrefix)-$(context.taskRun.name) - - name: pipelineUID - value: $(params.pipelineUID) - - name: vllm-benchmark-run ref: name: vllm-benchmark-run @@ -489,6 +714,66 @@ spec: - name: pipelineUID value: $(params.pipelineUID) + - name: upload-results + image: ubuntu:24.04 + # Tried amazon/aws-cli:2.31.9 but latest tar available via `dnf install tar -y` is 1.34. + # There were sporadic errors "file changed as we read it". It may be caused by the way + # tar identifes file changes in v 1.34 (via ctime). A recommended solution to move to 1.35. + # See https://stackoverflow.com/a/77765876 and tar release notes https://lists.gnu.org/archive/html/info-gnu/2023-07/msg00005.html) + # A smaller image is probably desirable. A restriction is that AWS CLI v2 requires glibc. + workingDir: $(workspaces.data.path) + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: $(params.s3-keys) + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: $(params.s3-keys) + key: AWS_SECRET_ACCESS_KEY + - name: AWS_EC2_METADATA_DISABLED + value: "true" + script: | + #!/usr/bin/env sh + + if [ "$(params.step-upload-results)" = "false" ]; then + echo "Upload disabled ... skipping." 
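+            # When step-upload-results is "false", exiting 0 marks this step successful,
+            # so the upload is simply skipped without failing the rest of the task.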
+ exit 0 + fi + + apt-get update && \ + apt-get install -y --no-install-recommends ca-certificates curl unzip tar gzip && \ + rm -rf /var/lib/apt/lists/* + + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscliv2.zip && \ + unzip /tmp/awscliv2.zip -d /tmp && \ + /tmp/aws/install && \ + rm -rf /tmp/aws /tmp/awscliv2.zip + + tar --version && gzip --version && aws --version + + EXPERIMENT_ID="experiment-$(echo -n $(params.pipelineUID) | cut -c1-8)" + EXPERIMENT_RESULTS_FOLDER="$(params.harnessName)_${EXPERIMENT_ID}_$(context.taskRun.name)" + ARCHIVE_NAME="${EXPERIMENT_RESULTS_FOLDER}.tar.gz" + + tar --version && gzip --version && aws --version + + tar -czf ${ARCHIVE_NAME} \ + -C "$(workspaces.data.path)" ${EXPERIMENT_RESULTS_FOLDER} + + aws s3 cp ${ARCHIVE_NAME} "s3://$(params.s3-bucket)/${ARCHIVE_NAME}" \ + --endpoint-url "$(params.s3-endpoint)" \ + --content-type "application/x-tar" \ + --content-encoding "gzip" \ + --no-progress + # --recursive \ + + rm -rf ${ARCHIVE_NAME} + + echo "✅ Uploaded results to ${ARCHIVE_NAME}" + - name: delete-namespace image: alpine/kubectl:1.34.1 script : | diff --git a/tekton-poc/pipeline/pd-disaggregation-pr.yaml b/tekton-poc/pipeline/pd-disaggregation-pr.yaml index cc694459..b78c5891 100644 --- a/tekton-poc/pipeline/pd-disaggregation-pr.yaml +++ b/tekton-poc/pipeline/pd-disaggregation-pr.yaml @@ -15,6 +15,42 @@ spec: value: $(context.pipelineRun.namespace) - name: model-id value: "meta-llama/Llama-3.1-8B-Instruct" + # value: "meta-llama/Llama-3.1-70B-Instruct" + + # Properties needed to evaluate stack capacity (will it be able to host the model)? + - name: validateCapacity + value: true + - name: behaviorOnValidationFailure + value: terminate + - name: maxModelLength + value: 16000 + # will be set via treatment below + # - name: decodeReplicas + # - name: decodeTensorParallelism + - name: decodeDataParallelism + value: 1 + # If not set, will be set to decodeTensorParallelism * decodeDataParallelism + # - name: decodeNumGpus + + # will be set via treatment below + # - name: prefillReplicas + # - name: prefillTensorParallelism + - name: prefillDataParallelism + value: 1 + # If not set, will be set to prefillTensorParallelism * prefillDataParallelism + # - name: prefillNumGpus + + # Rely on default value + # Assume the same for prefill and decode + # - name: targetGpuMemoryUtilization + + # Required + # Assume the same for prefill and decode + # TBD - attempt to read from the cluster + - name: gpuType + value: "NVIDIA-H100-80GB-HBM3" + - name: gpuMemory + value: 80 #GB # Harness / Workload - name: harnessName @@ -37,6 +73,26 @@ spec: pipelineSpec: workspaces: - name: data + params: + - name: maxModelLength + default: "" + - name: decodeReplicas + default: "" + - name: decodeTensorParallelism + default: "" + - name: decodeDataParallelism + default: "" + - name: decodeNumGpus + default: "" + - name: prefillReplicas + default: "" + - name: prefillTensorParallelism + default: "" + - name: prefillDataParallelism + default: "" + - name: prefillNumGpus + default: "" + tasks: - name: run-experiment taskRef: @@ -45,10 +101,43 @@ spec: - name: data workspace: data params: - - name: targetNamespacePrefix - value: $(params.targetNamespacePrefix) - name: model-id value: $(params.model-id) + + # Properties needed to evaluate stack capacity (will it be able to host the model)? 
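+      # These mirror the pipeline-level capacity parameters declared above; values left
+      # empty here are resolved downstream by the compute-values / compute-num-gpus steps
+      # (treatment value if present, otherwise a default of 1, or TP x DP for GPU counts).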
+ - name: validateCapacity + value: $(params.validateCapacity) + - name: behaviorOnValidationFailure + value: $(params.behaviorOnValidationFailure) + + - name: maxModelLength + value: $(params.maxModelLength) + + - name: decodeReplicas + value: $(params.decodeReplicas) + - name: decodeTensorParallelism + value: $(params.decodeTensorParallelism) + - name: decodeDataParallelism + value: $(params.decodeDataParallelism) + - name: decodeNumGpus + value: $(params.decodeNumGpus) + + - name: prefillReplicas + value: $(params.prefillReplicas) + - name: prefillTensorParallelism + value: $(params.prefillTensorParallelism) + - name: prefillDataParallelism + value: $(params.prefillDataParallelism) + - name: prefillNumGpus + value: $(params.prefillNumGpus) + + - name: gpuType + value: $(params.gpuType) + - name: gpuMemory + value: $(params.gpuMemory) + + - name: targetNamespacePrefix + value: $(params.targetNamespacePrefix) - name: experimentBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/pd-disaggregation/ @@ -86,6 +175,8 @@ spec: - name: debug value: "$(params.debug)" + - name: step-upload-results + value: false - name: pipelineUID value: "$(context.pipelineRun.uid)" @@ -133,18 +224,3 @@ spec: # # - "640" # # - "1280" # # - "2560" - -# LLMDBENCH_VLLM_COMMON_REPLICAS: "2,4" -# decode.replicas -# LLMDBENCH_VLLM_COMMON_TENSOR_PARALLELISM: "8" -# decode.parallelism.tensor - -# LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS: "2,4,6,8" -# prefill.replicas -# LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM: "1,2" -# prefill.parallelism.tensor -# LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS: "1,2,4" -# decode.replicas -# LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM: "2,4,8" -# decodeTensorParallelism -# decode.parallelism.tensor \ No newline at end of file diff --git a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml index 11c73b85..822fea78 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml @@ -33,6 +33,8 @@ spec: # Control - name: debug value: true + - name: step-upload-results + value: false pipelineSpec: workspaces: @@ -91,6 +93,8 @@ spec: - name: debug value: "$(params.debug)" + - name: step-upload-results + value: "$(params.step-upload-results)" - name: pipelineUID value: "$(context.pipelineRun.uid)" matrix: diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml index 64aa21ad..5cc1661b 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -1,3 +1,8 @@ +##### +# This is an example of how the matrix specification works. +# It is currently out of date. +# To test, use pipelinerun-matrix-subset.yaml instead. 
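+# With every value below uncommented, the matrix fans out over the cross-product of
+# gaiePluginConfig x question_len x output_len, i.e. 4 x 3 x 3 = 36 treatment runs.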
+##### apiVersion: tekton.dev/v1 kind: PipelineRun metadata: @@ -68,17 +73,17 @@ spec: params: - name: gaiePluginConfig value: - # - "inf-sche-none.yaml" - # - "inf-sche-prefix.yaml" - # - "inf-sche-kv.yaml" + - "inf-sche-none.yaml" + - "inf-sche-prefix.yaml" + - "inf-sche-kv.yaml" - "inf-sche-queue.yaml" - name: question_len value: - # - "100" - # - "300" + - "100" + - "300" - "1000" - name: output_len value: - # - "100" + - "100" - "300" - "1000" diff --git a/tekton-poc/pipeline/steps/inference-perf.yaml b/tekton-poc/pipeline/steps/inference-perf.yaml index 50f91936..b0d82c64 100644 --- a/tekton-poc/pipeline/steps/inference-perf.yaml +++ b/tekton-poc/pipeline/steps/inference-perf.yaml @@ -1,142 +1,5 @@ apiVersion: tekton.dev/v1beta1 kind: StepAction -metadata: - name: inference-perf-prepare-profile -spec: - params: - - name: harnessName - - name: harnessProfile - - name: model-id - - name: namespace - - name: treatmentAnalysis - - name: pipelineUID - env: - - name: REQUESTED_HARNESS_NAME - value: "$(params.harnessName)" - - name: MY_HARNESS_NAME - value: "inference-perf" - - name: HARNESS_PROFILE - value: "$(params.harnessProfile)" - - - name: TREATMENT_ANALYSIS - value: "$(params.treatmentAnalysis)" - - - name: LLMDBENCH_DEPLOY_CURRENT_MODEL - value: "$(params.model-id)" - - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL - value: "http://experiment-gateway-inference-gateway.$(params.namespace).svc.cluster.local:80" - - - name: DATA_ROOT_DIR - value: $(workspaces.data.path) - - name: MY_TASK_NAME - value: $(context.taskRun.name) - - name: MY_PIPELINE_UID - value: $(params.pipelineUID) - - # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 - image: python:3.12.9-slim-bookworm - script: | - #!/bin/bash - - if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then - echo "Requested harness not ${MY_HARNESS_NAME}, skipping" - exit 0 - fi - - # TBD is this necessary or is it already there? 
- apt-get update - apt-get install -y --no-install-recommends curl ca-certificates jq - curl -fsSL https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 \ - -o /usr/local/bin/yq - chmod +x /usr/local/bin/yq - jq --version - yq --version - - # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L48-L54 - # https://github.com/llm-d/llm-d-benchmark/blob/main/setup/run.sh - - EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" - RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" - CONTROL_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" - RUN_DIR=$(pwd) - - echo "🔄 Installing required tools" - apt-get update - apt-get install -y \ - wget \ - && apt-get clean && rm -rf /var/cache/apt - - # Ensure all folders created - mkdir -p $RESULTS_DIR - mkdir -p $CONTROL_DIR/setup - rm -rf $CONTROL_DIR/setup/sed-commands - touch $CONTROL_DIR/setup/sed-commands - mkdir -p ${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/templates - - cd ${RUN_DIR}/vllm-benchmark/ - - # Define constants: input profile template name and location; final profile name and location - workload=$(echo ${HARNESS_PROFILE} | sed 's^\.yaml^^g' ) - workload_template=${workload}.yaml.in - workload_template_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/templates/${workload_template} - workload_profile=${workload}.yaml - workload_profile_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/${workload_profile} - - echo "🔄 Prepare workload profile" - # Fetch profile template from llmd-benchmark - wget -O ${workload_template_path} \ - --quiet \ - https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/workload/profiles/${MY_HARNESS_NAME}/${workload_template} - - # Apply treatment to profile template to produce final profile - echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL^${LLMDBENCH_DEPLOY_CURRENT_MODEL}^g" >> ${CONTROL_DIR}/setup/sed-commands - echo "s^REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL^${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL}^g" >> ${CONTROL_DIR}/setup/sed-commands - echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_TOKENIZER^${LLMDBENCH_DEPLOY_CURRENT_TOKENIZER}^g" >> ${CONTROL_DIR}/setup/sed-commands - echo "---------- sed-commands" - cat ${CONTROL_DIR}/setup/sed-commands - echo "----------" - sed -f ${CONTROL_DIR}/setup/sed-commands ${workload_template_path} > ${workload_profile_path} - - # TBD eliminate the TARGET_FILE env variable - TARGET_FILE=${workload_profile_path} - echo "${TREATMENT_ANALYSIS}" | jq '.updates' > /tmp/updates.json - echo ">>> /tmp/updates.json" - cat /tmp/updates.json - - if [ ! -f "$TARGET_FILE" ]; then - echo "ERROR: File not found: $TARGET_FILE" >&2 - exit 1 - fi - - # Apply updates to JSON or YAML - if [ "$(jq 'length' /tmp/updates.json)" -gt 0 ]; then - ext="${TARGET_FILE##*.}" - tmp="${TARGET_FILE}.tmp" - - # TBD eliminate the json path (copilot generated this); profiles are yaml files - if [ "$ext" = "json" ]; then - jq --slurpfile upds /tmp/updates.json ' - reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) - ' "$TARGET_FILE" > "$tmp" - mv "$tmp" "$TARGET_FILE" - else - # YAML path: YAML → JSON → apply → YAML - yq -o=json '.' "$TARGET_FILE" \ - | jq --slurpfile upds /tmp/updates.json ' - reduce $upds[0][] as $u (. 
; setpath($u.path; $u.value)) - ' \ - | yq -P > "$tmp" - mv "$tmp" "$TARGET_FILE" - fi - fi - - echo "---------- workload profile" - cat ${workload_profile_path} - echo "----------" - echo "✅ workload profile ready" ---- -apiVersion: tekton.dev/v1beta1 -kind: StepAction metadata: name: inference-perf-run spec: diff --git a/tekton-poc/pipeline/steps/treatment.yaml b/tekton-poc/pipeline/steps/treatment.yaml index b371836f..33888574 100644 --- a/tekton-poc/pipeline/steps/treatment.yaml +++ b/tekton-poc/pipeline/steps/treatment.yaml @@ -7,8 +7,6 @@ spec: Produce '--set/--set-string path=value' flags for factorType and apply values into a JSON/YAML file. Works with flat or nested treatment. image: alpine:3.20 - # Pass params via env (StepAction scripts cannot use $(params.*) directly). - # We'll read these envs inside the script. params: - name: factorType type: string @@ -18,17 +16,9 @@ spec: - name: treatment type: string description: JSON values (flat or nested by key) - # - name: file - # type: string - # description: Target file path (relative to workdir or absolute) - # - name: workdir - # type: string - # description: Working directory (usually the bound workspace path) - # default: /workspace results: - name: treatmentAnalysis description: Space-separated '--set/--set-string path=value' tokens - # workingDir: $(params.workdir) env: - name: SELECTOR value: $(params.factorType) @@ -36,18 +26,10 @@ spec: value: $(params.factorMapping) - name: VAL_JSON value: $(params.treatment) - # - name: TARGET_FILE - # value: $(params.file) script: | #!/bin/sh set -eu apk add --no-cache jq yq >/dev/null - # jq --version - # yq --version - - # echo "$SELECTOR" - # echo "$MAP_JSON" - # echo "$VAL_JSON" # Build updates + flags (uses $val for type checks — fixed version) jq -r -n \ @@ -63,7 +45,7 @@ spec: updates: [ $m | to_entries[] | select($v[.key] != null) - | { path: (.value | split(".")), value: $v[.key] } + | { name: .key, path: (.value | split(".")), value: $v[.key] } ], setArgs: ( [ $m | to_entries[] @@ -80,38 +62,4 @@ spec: end ' > /tmp/out.json - # FLAGS=$(jq -r '.setArgs' /tmp/out.json) - # jq '.updates' /tmp/out.json > /tmp/updates.json - - # if [ ! -f "$TARGET_FILE" ]; then - # echo "ERROR: File not found: $TARGET_FILE" >&2 - # # still write empty result - # printf "" > "$(step.results.treatmentAnalysis.path)" - # exit 1 - # fi - - # # Apply updates to JSON or YAML - # if [ "$(jq 'length' /tmp/updates.json)" -gt 0 ]; then - # ext="${TARGET_FILE##*.}" - # tmp="${TARGET_FILE}.tmp" - - # if [ "$ext" = "json" ]; then - # jq --slurpfile upds /tmp/updates.json ' - # reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) - # ' "$TARGET_FILE" > "$tmp" - # mv "$tmp" "$TARGET_FILE" - # else - # # YAML path: YAML → JSON → apply → YAML - # yq -o=json '.' "$TARGET_FILE" \ - # | jq --slurpfile upds /tmp/updates.json ' - # reduce $upds[0][] as $u (. 
; setpath($u.path; $u.value)) - # ' \ - # | yq -P > "$tmp" - # mv "$tmp" "$TARGET_FILE" - # fi - # fi - - # printf "%s" "$(cat /tmp/out.json)" - # Emit flags as a step-scoped result - # printf "%s" "$FLAGS" > "$(step.results.treatmentAnalysis.path)" printf "%s" "$(cat /tmp/out.json)" > "$(step.results.treatmentAnalysis.path)" diff --git a/tekton-poc/pipeline/steps/vllm-benchmark.yaml b/tekton-poc/pipeline/steps/vllm-benchmark.yaml index 8daa7aeb..aead6b96 100644 --- a/tekton-poc/pipeline/steps/vllm-benchmark.yaml +++ b/tekton-poc/pipeline/steps/vllm-benchmark.yaml @@ -1,142 +1,5 @@ apiVersion: tekton.dev/v1beta1 kind: StepAction -metadata: - name: vllm-benchmark-prepare-profile -spec: - params: - - name: harnessName - - name: harnessProfile - - name: model-id - - name: namespace - - name: treatmentAnalysis - - name: pipelineUID - env: - - name: REQUESTED_HARNESS_NAME - value: "$(params.harnessName)" - - name: MY_HARNESS_NAME - value: "vllm-benchmark" - - name: HARNESS_PROFILE - value: "$(params.harnessProfile)" - - - name: TREATMENT_ANALYSIS - value: "$(params.treatmentAnalysis)" - - - name: LLMDBENCH_DEPLOY_CURRENT_MODEL - value: "$(params.model-id)" - - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL - value: "http://experiment-gateway-inference-gateway.$(params.namespace).svc.cluster.local:80" - - - name: DATA_ROOT_DIR - value: $(workspaces.data.path) - - name: MY_TASK_NAME - value: $(context.taskRun.name) - - name: MY_PIPELINE_UID - value: $(params.pipelineUID) - - # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 - image: python:3.12.9-slim-bookworm - script: | - #!/bin/bash - - if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then - echo "Requested harness not ${MY_HARNESS_NAME}, skipping" - exit 0 - fi - - # TBD is this necessary or is it already there? 
- apt-get update - apt-get install -y --no-install-recommends curl ca-certificates jq - curl -fsSL https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 \ - -o /usr/local/bin/yq - chmod +x /usr/local/bin/yq - jq --version - yq --version - - # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L56-L62 - # https://github.com/llm-d/llm-d-benchmark/blob/main/setup/run.sh - - EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" - RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" - CONTROL_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" - RUN_DIR=$(pwd) - - echo "🔄 Installing required tools" - apt-get update - apt-get install -y \ - wget \ - && apt-get clean && rm -rf /var/cache/apt - - # Ensure all folders created - mkdir -p $RESULTS_DIR - mkdir -p $CONTROL_DIR/setup - rm -rf $CONTROL_DIR/setup/sed-commands - touch $CONTROL_DIR/setup/sed-commands - mkdir -p ${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/templates - - cd ${RUN_DIR}/vllm-benchmark/ - - # Define constants: input profile template name and location; final profile name and location - workload=$(echo ${HARNESS_PROFILE} | sed 's^\.yaml^^g' ) - workload_template=${workload}.yaml.in - workload_template_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/templates/${workload_template} - workload_profile=${workload}.yaml - workload_profile_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/${workload_profile} - - echo "🔄 Prepare workload profile" - # Fetch profile template from llmd-benchmark - wget -O ${workload_template_path} \ - --quiet \ - https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/workload/profiles/${MY_HARNESS_NAME}/${workload_template} - - # Apply treatment to profile template to produce final profile - echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL^${LLMDBENCH_DEPLOY_CURRENT_MODEL}^g" >> ${CONTROL_DIR}/setup/sed-commands - echo "s^REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL^${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL}^g" >> ${CONTROL_DIR}/setup/sed-commands - echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_TOKENIZER^${LLMDBENCH_DEPLOY_CURRENT_TOKENIZER}^g" >> ${CONTROL_DIR}/setup/sed-commands - echo "---------- sed-commands" - cat ${CONTROL_DIR}/setup/sed-commands - echo "----------" - sed -f ${CONTROL_DIR}/setup/sed-commands ${workload_template_path} > ${workload_profile_path} - - # TBD eliminate the TARGET_FILE env variable - TARGET_FILE=${workload_profile_path} - echo "${TREATMENT_ANALYSIS}" | jq '.updates' > /tmp/updates.json - echo ">>> /tmp/updates.json" - cat /tmp/updates.json - - if [ ! -f "$TARGET_FILE" ]; then - echo "ERROR: File not found: $TARGET_FILE" >&2 - exit 1 - fi - - # Apply updates to JSON or YAML - if [ "$(jq 'length' /tmp/updates.json)" -gt 0 ]; then - ext="${TARGET_FILE##*.}" - tmp="${TARGET_FILE}.tmp" - - # TBD eliminate the json path (copilot generated this); profiles are yaml files - if [ "$ext" = "json" ]; then - jq --slurpfile upds /tmp/updates.json ' - reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) - ' "$TARGET_FILE" > "$tmp" - mv "$tmp" "$TARGET_FILE" - else - # YAML path: YAML → JSON → apply → YAML - yq -o=json '.' "$TARGET_FILE" \ - | jq --slurpfile upds /tmp/updates.json ' - reduce $upds[0][] as $u (. 
; setpath($u.path; $u.value)) - ' \ - | yq -P > "$tmp" - mv "$tmp" "$TARGET_FILE" - fi - fi - - echo "---------- workload profile" - cat ${workload_profile_path} - echo "----------" - echo "✅ workload profile ready" ---- -apiVersion: tekton.dev/v1beta1 -kind: StepAction metadata: name: vllm-benchmark-run spec: From 62fc6c458b0de298b99f8a79a4017320c202867f Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Mon, 20 Oct 2025 10:08:17 -0400 Subject: [PATCH 42/44] reduce task pod requirements Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/experiment-task.yaml | 179 +++---- tekton-poc/pipeline/pd-disaggregation-pr.yaml | 15 +- .../pipeline/steps/capacity-planner.yaml | 439 ++++++++++++++++++ .../pipeline/steps/workload-profile.yaml | 131 ++++++ 4 files changed, 652 insertions(+), 112 deletions(-) create mode 100644 tekton-poc/pipeline/steps/capacity-planner.yaml create mode 100644 tekton-poc/pipeline/steps/workload-profile.yaml diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/experiment-task.yaml index a2b4840f..8bcf868a 100644 --- a/tekton-poc/pipeline/experiment-task.yaml +++ b/tekton-poc/pipeline/experiment-task.yaml @@ -232,42 +232,26 @@ spec: # printf "%s" "$MODELSERVICE_SET_ARGS" - - name: compute-decode-dp + # TBD split into individual steps to compute each value? + - name: compute-capacity-validation-values ref: name: - compute-value + compute-values params: - - name: name - value: "decodeDataParallelism" - - name: value + - name: decodeDataParallelism value: $(params.decodeDataParallelism) - - name: defaultValue - value: 1 - - - name: compute-decode-tp - ref: - name: - compute-value - params: - - name: name - value: "decodeTensorParallelism" - - name: value + - name: decodeTensorParallelism value: $(params.decodeTensorParallelism) - - name: defaultValue - value: 1 - - - name: compute-decode-replicas - ref: - name: - compute-value - params: - - name: name - value: "decodeReplicas" - - name: value + - name: decodeReplicas value: $(params.decodeReplicas) - - name: defaultValue - value: 1 + - name: prefillDataParallelism + value: $(params.prefillDataParallelism) + - name: prefillTensorParallelism + value: $(params.prefillTensorParallelism) + - name: prefillReplicas + value: $(params.prefillReplicas) + # TBD fold into compute-capacity-validation-values ? 
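+  # compute-num-gpus resolves decodeNumGpus in order: explicit parameter, then the
+  # treatment analysis, then TP x DP (e.g. decodeTensorParallelism=2 x
+  # decodeDataParallelism=1 gives 2 GPUs per decode replica).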
- name: compute-decode-num-gpus ref: name: @@ -278,66 +262,13 @@ spec: - name: value value: $(params.decodeNumGpus) - name: dp - value: $(steps.compute-decode-dp.results.value) + # value: $(steps.compute-decode-dp.results.value) + value: $(steps.compute-capacity-validation-values.results.decodeDataParallelism) - name: tp - value: $(steps.compute-decode-tp.results.value) - - # - name: display-decode-values - # image: alpine:3.20 - # env: - # - name: REPLICAS - # value: "$(steps.compute-decode-replicas.results.value)" - # - name: TP - # value: "$(steps.compute-decode-tp.results.value)" - # - name: DP - # value: "$(steps.compute-decode-dp.results.value)" - # - name: NUM_GPUS - # value: "$(steps.compute-decode-num-gpus.results.value)" - - # script: | - # #!/bin/sh - - # echo "decodeReplicas = ${REPLICAS}" - # echo "decodeTensorParallelism = ${TP}" - # echo "decodeDataParallelism = ${DP}" - # echo "decodeNumGpus = ${NUM_GPUS}" - - - name: compute-prefill-dp - ref: - name: - compute-value - params: - - name: name - value: "prefillDataParallelism" - - name: value - value: $(params.prefillDataParallelism) - - name: defaultValue - value: 1 - - - name: compute-prefill-tp - ref: - name: - compute-value - params: - - name: name - value: "prefillTensorParallelism" - - name: value - value: $(params.prefillTensorParallelism) - - name: defaultValue - value: 1 - - - name: compute-prefill-replicas - ref: - name: - compute-value - params: - - name: name - value: "prefillReplicas" - - name: value - value: $(params.prefillReplicas) - - name: defaultValue - value: 1 + # value: $(steps.compute-decode-tp.results.value) + value: $(steps.compute-capacity-validation-values.results.decodeTensorParallelism) + # TBD fold into compute-capacity-validation-values ? - name: compute-prefill-num-gpus ref: name: @@ -348,9 +279,11 @@ spec: - name: value value: $(params.prefillNumGpus) - name: dp - value: $(steps.compute-prefill-dp.results.value) + # value: $(steps.compute-prefill-dp.results.value) + value: $(steps.compute-capacity-validation-values.results.prefillDataParallelism) - name: tp - value: $(steps.compute-prefill-tp.results.value) + # value: $(steps.compute-prefill-tp.results.value) + value: $(steps.compute-capacity-validation-values.results.prefillDataParallelism) - name: check-decode-capacity ref: @@ -365,15 +298,19 @@ spec: - name: max_model_len value: $(params.maxModelLength) - name: replicas - value: $(steps.compute-decode-replicas.results.value) + value: $(steps.compute-capacity-validation-values.results.decodeReplicas) - name: tp - value: $(steps.compute-decode-tp.results.value) + value: $(steps.compute-capacity-validation-values.results.decodeTensorParallelism) - name: dp - value: $(steps.compute-decode-dp.results.value) + value: $(steps.compute-capacity-validation-values.results.decodeDataParallelism) - name: gpu_memory value: $(params.gpuMemory) - name: user_requested_gpu_count value: $(steps.compute-decode-num-gpus.results.value) + when: + - input: $(params.validateCapacity) + operator: in + values: [ "true" ] - name: check-prefill-capacity ref: @@ -388,15 +325,19 @@ spec: - name: max_model_len value: $(params.maxModelLength) - name: replicas - value: $(steps.compute-prefill-replicas.results.value) + value: $(steps.compute-capacity-validation-values.results.prefillReplicas) - name: tp - value: $(steps.compute-prefill-tp.results.value) + value: $(steps.compute-capacity-validation-values.results.prefillTensorParallelism) - name: dp - value: $(steps.compute-prefill-dp.results.value) + value: 
$(steps.compute-capacity-validation-values.results.prefillDataParallelism) - name: gpu_memory value: $(params.gpuMemory) - name: user_requested_gpu_count value: $(steps.compute-prefill-num-gpus.results.value) + when: + - input: $(params.validateCapacity) + operator: in + values: [ "true" ] - name: prepare-namespace image: quay.io/openshift/origin-cli:4.21 @@ -588,9 +529,9 @@ spec: - name: wait-for-model env: - name: DECODE_REPLICAS - value: $(steps.compute-decode-replicas.results.value) + value: $(steps.compute-capacity-validation-values.results.decodeReplicas) - name: PREFILL_REPLICAS - value: $(steps.compute-prefill-replicas.results.value) + value: $(steps.compute-capacity-validation-values.results.prefillReplicas) image: alpine/kubectl:1.34.1 script: | #!/bin/sh @@ -670,13 +611,17 @@ spec: value: $(params.harnessProfile) - name: pipelineUID value: $(params.pipelineUID) - computeResources: - requests: - memory: "32Gi" - cpu: "16" - limits: - memory: "32Gi" - cpu: "16" + when: + - input: $(params.harnessName) + operator: in + values: [ "inference-perf" ] + # computeResources: + # requests: + # memory: "32Gi" + # cpu: "16" + # limits: + # memory: "32Gi" + # cpu: "16" - name: inference-perf-analyze-results ref: @@ -686,6 +631,10 @@ spec: value: $(params.harnessName) - name: pipelineUID value: $(params.pipelineUID) + when: + - input: $(params.harnessName) + operator: in + values: [ "inference-perf" ] - name: vllm-benchmark-run ref: @@ -697,13 +646,17 @@ spec: value: $(params.harnessProfile) - name: pipelineUID value: $(params.pipelineUID) - computeResources: - requests: - memory: "32Gi" - cpu: "16" - limits: - memory: "32Gi" - cpu: "16" + when: + - input: $(params.harnessName) + operator: in + values: [ "vllm-benchmark" ] + # computeResources: + # requests: + # memory: "32Gi" + # cpu: "16" + # limits: + # memory: "32Gi" + # cpu: "16" - name: vllm-benchmark-analyze-results ref: @@ -713,6 +666,10 @@ spec: value: $(params.harnessName) - name: pipelineUID value: $(params.pipelineUID) + when: + - input: $(params.harnessName) + operator: in + values: [ "vllm-benchmark" ] - name: upload-results image: ubuntu:24.04 diff --git a/tekton-poc/pipeline/pd-disaggregation-pr.yaml b/tekton-poc/pipeline/pd-disaggregation-pr.yaml index b78c5891..d97212e0 100644 --- a/tekton-poc/pipeline/pd-disaggregation-pr.yaml +++ b/tekton-poc/pipeline/pd-disaggregation-pr.yaml @@ -5,6 +5,19 @@ metadata: spec: taskRunTemplate: serviceAccountName: helm-installer + taskRunSpecs: + - pipelineTaskName: run-experiment + computeResources: + requests: + memory: "16Gi" + cpu: "8" + # memory: "32Gi" + # cpu: "16" + limits: + memory: "16Gi" + cpu: "8" + # memory: "32Gi" + # cpu: "16" workspaces: - name: data persistentVolumeClaim: @@ -190,7 +203,7 @@ spec: "prefillReplicas": 1, "prefillTensorParallelism": 1, "decodeReplicas": 1, - "decodeTensorParallelism": 1, + "decodeTensorParallelism": 2, "max-concurrency": 1, "num-prompts": 10 } diff --git a/tekton-poc/pipeline/steps/capacity-planner.yaml b/tekton-poc/pipeline/steps/capacity-planner.yaml new file mode 100644 index 00000000..879bec20 --- /dev/null +++ b/tekton-poc/pipeline/steps/capacity-planner.yaml @@ -0,0 +1,439 @@ +# apiVersion: tekton.dev/v1beta1 +# kind: StepAction +# metadata: +# name: compute-value +# spec: +# results: +# - name: value +# params: +# - name: name +# - name: value +# - name: defaultValue +# env: +# - name: PARAMETER_NAME +# value: "$(params.name)" +# - name: PARAMETER_VALUE +# value: $(params.value) +# - name: DEFAULT_VALUE +# value: $(params.defaultValue) 
+# - name: TREATMENT_ANALYSIS +# value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" +# image: alpine:3.20 +# script: | +# #!/usr/bin/env sh + +# apk add --no-cache jq yq >/dev/null + +# echo "PARAMETER_NAME = ${PARAMETER_NAME}" +# echo "PARAMETER_VALUE = ${PARAMETER_VALUE}" +# echo "DEFAULT_VALUE = ${DEFAULT_VALUE}" + +# if [ -n "${PARAMETER_VALUE}" ]; then +# value="${PARAMETER_VALUE}" +# echo ">>> Using value from parameter: ${value}" +# else +# value=$( +# echo ${TREATMENT_ANALYSIS} \ +# | jq -r ".updates[] | select(.name == \"${PARAMETER_NAME}\") | .value" +# ) +# echo ">>> value from treatment: ${value}" +# if [ -z $value ]; then +# value=${DEFAULT_VALUE} +# echo ">>> Using default value: ${value}" +# fi +# fi + +# echo -n "${value}" > "$(step.results.value.path)" +# --- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: compute-values +spec: + results: + - name: decodeDataParallelism + - name: decodeTensorParallelism + - name: decodeReplicas + - name: prefillDataParallelism + - name: prefillTensorParallelism + - name: prefillReplicas + params: + - name: decodeDataParallelism + - name: decodeTensorParallelism + - name: decodeReplicas + - name: prefillDataParallelism + - name: prefillTensorParallelism + - name: prefillReplicas + env: + - name: DECODE_DP + value: "$(params.decodeDataParallelism)" + - name: DECODE_TP + value: "$(params.decodeTensorParallelism)" + - name: DECODE_REPLICAS + value: "$(params.decodeReplicas)" + - name: PREFILL_DP + value: "$(params.prefillDataParallelism)" + - name: PREFILL_TP + value: "$(params.prefillTensorParallelism)" + - name: PREFILL_REPLICAS + value: "$(params.prefillReplicas)" + - name: TREATMENT_ANALYSIS + value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" + image: alpine:3.20 + script: | + #!/usr/bin/env sh + + apk add --no-cache jq yq >/dev/null + + compute_value() { + _name="$1" + _value="$2" + _default="$3" + + if [ -n "${_value}" ]; then + _result="${_value}" + else + _result=$( + # echo "from treatment" + echo "${TREATMENT_ANALYSIS}" \ + | jq -r ".updates[] | select(.name == \"${_name}\") | .value" + ) + if [ -z $_result ]; then + _result="${_default}" + fi + fi + echo "${_result}" + } + + echo "input DECODE_DP = ${DECODE_DP}" + value=$(compute_value "decodeDataParallelism" "${DECODE_DP}" 1) + echo "output decodeDataParallelism = $value" + echo -n "${value}" > "$(step.results.decodeDataParallelism.path)" + + echo "input DECODE_TP = ${DECODE_TP}" + value=$(compute_value "decodeTensorParallelism" "${DECODE_TP}" 1) + echo "output decodeTensorParallelism = $value" + echo -n "${value}" > "$(step.results.decodeTensorParallelism.path)" + + echo "input DECODE_REPLICAS = ${DECODE_REPLICAS}" + value=$(compute_value "decodeReplicas" "${DECODE_REPLICAS}" 1) + echo "output decodeReplicas = $value" + echo -n "${value}" > "$(step.results.decodeReplicas.path)" + + echo "input PREFILL_DP = ${PREFILL_DP}" + value=$(compute_value "prefillDataParallelism" "${PREFILL_DP}" 1) + echo "output prefillDataParallelism = $value" + echo -n "${value}" > "$(step.results.prefillDataParallelism.path)" + + echo "input PREFILL_TP = ${PREFILL_TP}" + value=$(compute_value "prefillTensorParallelism" "${PREFILL_TP}" 1) + echo "output prefillTensorParallelism = $value" + echo -n "${value}" > "$(step.results.prefillTensorParallelism.path)" + + echo "input PREFILL_REPLICAS = ${PREFILL_REPLICAS}" + value=$(compute_value "prefillReplicas" "${PREFILL_REPLICAS}" 1) + echo "output prefillReplicas = $value" + echo -n "${value}" > 
"$(step.results.prefillReplicas.path)" +--- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: compute-num-gpus +spec: + results: + - name: value + params: + - name: name + - name: value + - name: dp + - name: tp + env: + - name: PARAMETER_NAME + value: "$(params.name)" + - name: PARAMETER_VALUE + value: $(params.value) + - name: DP + value: $(params.dp) + - name: TP + value: $(params.tp) + - name: TREATMENT_ANALYSIS + value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" + image: alpine:3.20 + script: | + #!/usr/bin/env sh + + apk add --no-cache jq yq >/dev/null + + echo "PARAMETER_NAME = ${PARAMETER_NAME}" + echo "PARAMETER_VALUE = ${PARAMETER_VALUE}" + echo "DP = ${DP}" + echo "TP = ${TP}" + + if [ -n "${PARAMETER_VALUE}" ]; then + value=${PARAMETER_VALUE} + echo ">>> Using value from parameter: ${value}" + else + value=$( + echo ${TREATMENT_ANALYSIS} \ + | jq -r ".updates[] | select(.name == \"${PARAMETER_NAME}\") | .value" + ) + echo ">>> value from treatment: ${value}" + if [ -z $value ]; then + value=$(( $TP * $DP )) + echo ">>> Using value from computation: $TP * $DP = ${value}" + fi + fi + + echo -n "${value}" > "$(step.results.value.path)" +--- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: check-capacity +spec: + params: + - name: validateCapacity + default: "true" + - name: behaviorOnValidationFailure + default: terminate # ignore + + - name: model + - name: max_model_len + - name: replicas + - name: tp + - name: dp + - name: gpu_memory + - name: user_requested_gpu_count + - name: gpu_memory_util + default: "0.95" + + - name: py + default: | + import os + import sys + from typing import Tuple + from config_explorer.capacity_planner import * + + def log_failed(msg: str, ignore_if_failed = True): + print(f'❌ {msg}') + if not ignore_if_failed: + sys.exit(1) + + def log_warning(msg): + print(f'âš ī¸ {msg}') + + def log_info(msg): + print(f'â„šī¸ {msg}') + + def get_model_info(model_name: str, hf_token: str, ignore_if_failed: bool) -> ModelInfo | None: + """ + Obtains model info from HF + """ + + try: + return get_model_info_from_hf(model_name, hf_token) + + except GatedRepoError: + log_failed("Model is gated and provided token does not, work. Please double check.", ignore_if_failed) + except HfHubHTTPError as hf_exp: + log_failed(f"Error reaching Hugging Face API: {hf_exp}", ignore_if_failed) + except Exception as e: + log_failed(f"Cannot retrieve ModelInfo: {e}", ignore_if_failed) + + return None + + def get_model_config_and_text_config(model_name: str, hf_token: str, ignore_if_failed: bool) -> Tuple[AutoConfig | None, AutoConfig | None]: + """ + Obtains model config and text config from HF + """ + + try: + config = get_model_config_from_hf(model_name, hf_token) + return config, get_text_config(config) + + except GatedRepoError: + log_failed("Model is gated and provided token does not, work. 
Please double check.", ignore_if_failed) + except HfHubHTTPError as hf_exp: + log_failed(f"Error reaching Hugging Face API: {hf_exp}", ignore_if_failed) + except Exception as e: + log_failed(f"Cannot retrieve model config: {e}", ignore_if_failed) + + return None, None + + def validate_vllm_params(): + print ("validate_vllm_params() called") + + replicas = int(os.getenv("REPLICAS")) + user_requested_gpu_count = int(os.getenv("USER_REQUESTED_GPU_COUNT")) + tp = int(os.getenv("TP")) + dp = int(os.getenv("DP")) + model = os.getenv("MODEL") + gpu_memory = int(os.getenv("GPU_MEMORY")) + max_model_len = int(os.getenv("MAX_MODEL_LEN")) + gpu_memory_util = float(os.getenv("GPU_MEMORY_UTIL")) + hf_token = os.getenv("HF_TOKEN") + ignore_if_failed = os.getenv("BEHAVIOR_ON_FAILURE") != 'terminate' + + print(f"model = {model}") + print(f"replicas = {replicas}") + print(f"user_requested_gpu_count = {user_requested_gpu_count}") + print(f"tp = {tp}") + print(f"dp = {dp}") + print(f"gpu_memory = {gpu_memory}") + print(f"max_model_len = {max_model_len}") + print(f"gpu_memory_util = {gpu_memory_util}") + print(f"ignore_if_failed = {ignore_if_failed}") + + # Sanity check on user inputs. If GPU memory cannot be determined, return False indicating that the sanity check is incomplete + skip_gpu_tests = False + if gpu_memory is None or gpu_memory == 0: + log_failure("Cannot determine accelerator memory. Please set LLMDBENCH_VLLM_COMMON_ACCELERATOR_MEMORY to enable Capacity Planner. Skipping GPU memory required checks, especially KV cache estimation.", ignore_if_failed) + skip_gpu_tests = True + + per_replica_requirement = gpus_required(tp=tp, dp=dp) + if replicas == 0: + per_replica_requirement = 0 + total_gpu_requirement = per_replica_requirement + + if total_gpu_requirement > user_requested_gpu_count: + log_failed(f"Requested {user_requested_gpu_count} GPUs but it is too low. It must be greater than TP x DP ({tp} x {dp} = {total_gpu_requirement})") + + if total_gpu_requirement < user_requested_gpu_count: + log_warning(f"For each replica, model requires {total_gpu_requirement}, but you requested {user_requested_gpu_count} for the deployment. Some GPUs will be idle.") + + model_info = get_model_info(model, hf_token, ignore_if_failed) + model_config, text_config = get_model_config_and_text_config(model, hf_token, ignore_if_failed) + if model_config is not None: + # Check if parallelism selections are valid + try: + valid_tp_values = find_possible_tp(text_config) + log_info(f"valid tp values are: {valid_tp_values}") + if tp not in valid_tp_values: + log_failed(f"TP={tp} is invalid. Please select from these options ({valid_tp_values}) for {model}.", ignore_if_failed) + else: + log_info(f"TP={tp} is valid.") + except AttributeError: + # Error: config['num_attention_heads'] not in config + log_failed(f"Cannot obtain data on the number of attention heads, cannot find valid tp values: {e}", ignore_if_failed) + + # Check if model context length is valid + valid_max_context_len = 0 + try: + # Error: config['max_positional_embeddings'] not in config + valid_max_context_len = max_context_len(model_config) + log_info(f"The max context length is {valid_max_context_len}") + except AttributeError as e: + log_failed(f"Cannot obtain data on the max context length for model: {e}", ignore_if_failed) + + if max_model_len > valid_max_context_len: + log_failed(f"Max model length = {max_model_len} exceeds the acceptable for {model}. 
Set LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN to a value below or equal to {valid_max_context_len}", ignore_if_failed) + else: + log_failed("Model config on parameter shape is not available.", ignore_if_failed) + + # Display memory info + if not skip_gpu_tests: + log_info("👉 Collecting GPU information....") + avail_gpu_memory = available_gpu_memory(gpu_memory, gpu_memory_util) + log_info(f"{gpu_memory} GB of memory per GPU, with {gpu_memory} GB x {gpu_memory_util} (gpu_memory_utilization) = {avail_gpu_memory} GB available to use.") + log_info(f"Each model replica requires {per_replica_requirement} GPUs, total available GPU memory = {avail_gpu_memory * per_replica_requirement} GB.") + + # Calculate model memory requirement + log_info("👉 Collecting model information....") + if model_info is not None: + try: + model_params = model_total_params(model_info) + log_info(f"{model} has a total of {model_params} parameters") + + model_mem_req = model_memory_req(model_info, model_config) + log_info(f"{model} requires {model_mem_req} GB of memory") + + # Estimate KV cache memory and max number of requests that can be served in worst case scenario + if not skip_gpu_tests: + log_info("👉 Estimating available KV cache....") + available_kv_cache = allocatable_kv_cache_memory( + model_info, model_config, + gpu_memory, gpu_memory_util, + tp=tp, dp=dp, + ) + log_info(f"Allocatable memory for KV cache {available_kv_cache} GB") + + if available_kv_cache < 0: + log_failed(f"There is not enough GPU memory to stand up model. Exceeds by {abs(available_kv_cache)} GB.", ignore_if_failed) + else: + kv_details = KVCacheDetail(model_info, model_config, max_model_len, batch_size=1) + log_info(f"KV cache memory for a request taking --max-model-len={max_model_len} requires {kv_details.per_request_kv_cache_gb} GB of memory") + + total_concurrent_reqs = max_concurrent_requests( + model_info, model_config, max_model_len, + gpu_memory, gpu_memory_util, + tp=tp, dp=dp, + ) + log_info(f"The vLLM server can process up to {total_concurrent_reqs} number of requests at the same time, assuming the worst case scenario that each request takes --max-model-len") + + except AttributeError as e: + # Model might not have safetensors data on parameters + log_failed(f"Does not have enough information about model to estimate model memory or KV cache: {e}", ignore_if_failed) + else: + log_failed(f"Model info on model's architecture is not available.", ignore_if_failed) + + def main(): + """Main function""" + print("main() called") + validate_vllm_params() + print("main() exiting") + + if __name__ == "__main__": + sys.exit(main()) + env: + - name: VALIDATE_CAPACITY + value: $(params.validateCapacity) + - name: BEHAVIOR_ON_FAILURE + value: $(params.behaviorOnValidationFailure) + + - name: MODEL + value: $(params.model) + + - name: REPLICAS + value: $(params.replicas) + - name: TP + value: $(params.tp) + - name: DP + value: $(params.dp) + - name: GPU_MEMORY + value: $(params.gpu_memory) + - name: USER_REQUESTED_GPU_COUNT + value: $(params.user_requested_gpu_count) + - name: MAX_MODEL_LEN + value: $(params.max_model_len) + - name: GPU_MEMORY_UTIL + value: $(params.gpu_memory_util) + + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + + - name: PY_BIN + value: "$(params.py)" + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/usr/bin/env bash + + if [ "${VALIDATE_CAPACITY}" != "true" ]; then + echo "â„šī¸ Skipping capacity validation" + 
exit 0 + fi + + # Install git so can install capacity explorer + apt-get update \ + && apt-get install -y git \ + && rm -rf /var/lib/apt/lists/* + python -m pip install --no-cache "config_explorer @ git+https://github.com/llm-d/llm-d-benchmark.git/#subdirectory=config_explorer" + + # run capacity explorer + printf "%s\n" "${PY_BIN}" | python - + + diff --git a/tekton-poc/pipeline/steps/workload-profile.yaml b/tekton-poc/pipeline/steps/workload-profile.yaml new file mode 100644 index 00000000..bc4810fe --- /dev/null +++ b/tekton-poc/pipeline/steps/workload-profile.yaml @@ -0,0 +1,131 @@ +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: prepare-workload-profile +spec: + params: + - name: harnessName + - name: harnessProfile + - name: model-id + - name: namespace + - name: treatmentAnalysis + - name: pipelineUID + env: + - name: HARNESS_NAME + value: "$(params.harnessName)" + - name: HARNESS_PROFILE + value: "$(params.harnessProfile)" + + - name: TREATMENT_ANALYSIS + value: "$(params.treatmentAnalysis)" + + - name: LLMDBENCH_DEPLOY_CURRENT_MODEL + value: "$(params.model-id)" + - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL + value: "http://experiment-gateway-inference-gateway.$(params.namespace).svc.cluster.local:80" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/bin/bash + + echo "🔄 Preparing workload profile ${HARNESS_PROFILE} for ${HARNESS_NAME}" + + # TBD is this necessary or is it already there? + apt-get update + apt-get install -y --no-install-recommends curl ca-certificates jq + curl -fsSL https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 \ + -o /usr/local/bin/yq + chmod +x /usr/local/bin/yq + jq --version + yq --version + + # https://github.com/llm-d/llm-d-benchmark/blob/main/setup/run.sh + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + CONTROL_DIR="${DATA_ROOT_DIR}/${HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + RUN_DIR=$(pwd) + + echo "🔄 Installing required tools" + apt-get update + apt-get install -y \ + wget \ + && apt-get clean && rm -rf /var/cache/apt + + # Ensure all folders created + mkdir -p $RESULTS_DIR + mkdir -p $CONTROL_DIR/setup + rm -rf $CONTROL_DIR/setup/sed-commands + touch $CONTROL_DIR/setup/sed-commands + mkdir -p ${CONTROL_DIR}/workload/profiles/${HARNESS_NAME}/templates + + cd ${RUN_DIR}/vllm-benchmark/ + + # Define constants: input profile template name and location; final profile name and location + workload=$(echo ${HARNESS_PROFILE} | sed 's^\.yaml^^g' ) + workload_template=${workload}.yaml.in + workload_template_path=${CONTROL_DIR}/workload/profiles/${HARNESS_NAME}/templates/${workload_template} + workload_profile=${workload}.yaml + workload_profile_path=${CONTROL_DIR}/workload/profiles/${HARNESS_NAME}/${workload_profile} + + echo "🔄 Prepare workload profile" + # Fetch profile template from llmd-benchmark + wget -O ${workload_template_path} \ + --quiet \ + https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/workload/profiles/${HARNESS_NAME}/${workload_template} + + # Apply treatment to profile template to produce final profile + echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL^${LLMDBENCH_DEPLOY_CURRENT_MODEL}^g" >> 
${CONTROL_DIR}/setup/sed-commands + echo "s^REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL^${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL}^g" >> ${CONTROL_DIR}/setup/sed-commands + echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_TOKENIZER^${LLMDBENCH_DEPLOY_CURRENT_TOKENIZER}^g" >> ${CONTROL_DIR}/setup/sed-commands + echo "---------- sed-commands" + cat ${CONTROL_DIR}/setup/sed-commands + echo "----------" + sed -f ${CONTROL_DIR}/setup/sed-commands ${workload_template_path} > ${workload_profile_path} + + # TBD eliminate the TARGET_FILE env variable + TARGET_FILE=${workload_profile_path} + echo "${TREATMENT_ANALYSIS}" | jq '.updates' > /tmp/updates.json + echo ">>> /tmp/updates.json" + cat /tmp/updates.json + + if [ ! -f "$TARGET_FILE" ]; then + echo "ERROR: File not found: $TARGET_FILE" >&2 + exit 1 + fi + + # Apply updates to JSON or YAML + if [ "$(jq 'length' /tmp/updates.json)" -gt 0 ]; then + ext="${TARGET_FILE##*.}" + tmp="${TARGET_FILE}.tmp" + + # TBD eliminate the json path (copilot generated this); profiles are yaml files + if [ "$ext" = "json" ]; then + jq --slurpfile upds /tmp/updates.json ' + reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) + ' "$TARGET_FILE" > "$tmp" + mv "$tmp" "$TARGET_FILE" + else + # YAML path: YAML → JSON → apply → YAML + yq -o=json '.' "$TARGET_FILE" \ + | jq --slurpfile upds /tmp/updates.json ' + reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) + ' \ + | yq -P > "$tmp" + mv "$tmp" "$TARGET_FILE" + fi + fi + + echo "---------- workload profile" + cat ${workload_profile_path} + echo "----------" + + echo "✅ workload profile ready" From 218833faa15a4a6b0f55b0872115547929926e73 Mon Sep 17 00:00:00 2001 From: Michael Kalantar Date: Mon, 20 Oct 2025 11:38:21 -0400 Subject: [PATCH 43/44] rename a few things Signed-off-by: Michael Kalantar --- tekton-poc/pipeline/pd-disaggregation-pr.yaml | 8 +-- .../pipeline/pipelinerun-matrix-subset.yaml | 2 +- tekton-poc/pipeline/pipelinerun-matrix.yaml | 2 +- .../pipeline/pipelinerun-sequential-1.yaml | 72 +++++++++---------- .../pipelinerun-sequential-4-barrier.yaml | 72 +++++++++---------- .../pipelinerun-sequential-4-sliding.yaml | 72 +++++++++---------- ...un-sequential-unroll-gaiePluginConfig.yaml | 8 +-- .../treatment.yaml} | 10 +-- 8 files changed, 123 insertions(+), 123 deletions(-) rename tekton-poc/pipeline/{experiment-task.yaml => tasks/treatment.yaml} (99%) diff --git a/tekton-poc/pipeline/pd-disaggregation-pr.yaml b/tekton-poc/pipeline/pd-disaggregation-pr.yaml index d97212e0..8be4b769 100644 --- a/tekton-poc/pipeline/pd-disaggregation-pr.yaml +++ b/tekton-poc/pipeline/pd-disaggregation-pr.yaml @@ -6,7 +6,7 @@ spec: taskRunTemplate: serviceAccountName: helm-installer taskRunSpecs: - - pipelineTaskName: run-experiment + - pipelineTaskName: treatment computeResources: requests: memory: "16Gi" @@ -107,9 +107,9 @@ spec: default: "" tasks: - - name: run-experiment + - name: treatment taskRef: - name: experiment + name: treatment workspaces: - name: data workspace: data @@ -151,7 +151,7 @@ spec: - name: targetNamespacePrefix value: $(params.targetNamespacePrefix) - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/pd-disaggregation/ - name: s3-keys diff --git a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml index 822fea78..f805f9ec 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml 
@@ -51,7 +51,7 @@ spec: value: $(params.targetNamespacePrefix) - name: model-id value: $(params.model-id) - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: s3-keys diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml index 5cc1661b..529ec2b9 100644 --- a/tekton-poc/pipeline/pipelinerun-matrix.yaml +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -52,7 +52,7 @@ spec: value: $(params.targetNamespacePrefix) - name: model-id value: $(params.model-id) - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: s3-keys diff --git a/tekton-poc/pipeline/pipelinerun-sequential-1.yaml b/tekton-poc/pipeline/pipelinerun-sequential-1.yaml index a4b77783..4a2f0089 100644 --- a/tekton-poc/pipeline/pipelinerun-sequential-1.yaml +++ b/tekton-poc/pipeline/pipelinerun-sequential-1.yaml @@ -24,7 +24,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -45,7 +45,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -68,7 +68,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -91,7 +91,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -114,7 +114,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -137,7 +137,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -160,7 +160,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -183,7 +183,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -206,7 +206,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -229,7 +229,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -252,7 +252,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -275,7 +275,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -298,7 +298,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -321,7 +321,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -344,7 +344,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -367,7 +367,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -390,7 +390,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -413,7 +413,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -436,7 +436,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -459,7 +459,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -482,7 +482,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -505,7 +505,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -528,7 +528,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -551,7 +551,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -574,7 +574,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -597,7 +597,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -620,7 +620,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -643,7 +643,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -666,7 +666,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -689,7 +689,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -712,7 +712,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -735,7 +735,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -758,7 +758,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -781,7 +781,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -804,7 +804,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -827,7 +827,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml diff --git a/tekton-poc/pipeline/pipelinerun-sequential-4-barrier.yaml b/tekton-poc/pipeline/pipelinerun-sequential-4-barrier.yaml index 988117a1..1dc4f388 100644 --- a/tekton-poc/pipeline/pipelinerun-sequential-4-barrier.yaml +++ b/tekton-poc/pipeline/pipelinerun-sequential-4-barrier.yaml @@ -24,7 +24,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -45,7 +45,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -66,7 +66,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -87,7 +87,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -108,7 +108,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -134,7 +134,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -160,7 +160,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -186,7 +186,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -212,7 +212,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -238,7 +238,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -264,7 +264,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -290,7 +290,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -316,7 +316,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -342,7 +342,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -368,7 +368,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -394,7 +394,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -420,7 +420,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -446,7 +446,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -472,7 +472,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -498,7 +498,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -524,7 +524,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -550,7 +550,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -576,7 +576,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -602,7 +602,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -628,7 +628,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -654,7 +654,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -680,7 +680,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -706,7 +706,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -732,7 +732,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -758,7 +758,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -784,7 +784,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -810,7 +810,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -836,7 +836,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -862,7 +862,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -888,7 +888,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -914,7 +914,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml diff --git a/tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml b/tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml index 
76f815b6..9a750925 100644 --- a/tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml +++ b/tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml @@ -24,7 +24,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -45,7 +45,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -66,7 +66,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -87,7 +87,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -108,7 +108,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -131,7 +131,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -154,7 +154,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -177,7 +177,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -200,7 +200,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -223,7 +223,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -246,7 +246,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -269,7 +269,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -292,7 +292,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -315,7 +315,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -338,7 +338,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -361,7 +361,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -384,7 +384,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -407,7 +407,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -430,7 +430,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -453,7 +453,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -476,7 +476,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -499,7 +499,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -522,7 +522,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -545,7 +545,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -568,7 +568,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -591,7 +591,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -614,7 +614,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -637,7 +637,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -660,7 +660,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -683,7 +683,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -706,7 +706,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -729,7 +729,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ - name: harnessProfile value: shared_prefix_synthetic.yaml @@ -752,7 +752,7 @@ spec: value: kalantar - name: model-id value: Qwen/Qwen3-0.6B - - name: experimentBaseUrl + - name: stackBaseUrl value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/
     - name: harnessProfile
       value: shared_prefix_synthetic.yaml
@@ -775,7 +775,7 @@ spec:
       value: kalantar
     - name: model-id
       value: Qwen/Qwen3-0.6B
-    - name: experimentBaseUrl
+    - name: stackBaseUrl
       value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/
     - name: harnessProfile
       value: shared_prefix_synthetic.yaml
@@ -798,7 +798,7 @@ spec:
       value: kalantar
     - name: model-id
       value: Qwen/Qwen3-0.6B
-    - name: experimentBaseUrl
+    - name: stackBaseUrl
       value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/
     - name: harnessProfile
       value: shared_prefix_synthetic.yaml
@@ -821,7 +821,7 @@ spec:
       value: kalantar
     - name: model-id
       value: Qwen/Qwen3-0.6B
-    - name: experimentBaseUrl
+    - name: stackBaseUrl
       value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/
     - name: harnessProfile
       value: shared_prefix_synthetic.yaml
diff --git a/tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml b/tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml
index 5c36a680..eae7a3f5 100644
--- a/tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml
+++ b/tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml
@@ -24,7 +24,7 @@ spec:
       value: kalantar
     - name: model-id
       value: Qwen/Qwen3-0.6B
-    - name: experimentBaseUrl
+    - name: stackBaseUrl
       value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/
     - name: harnessProfile
       value: shared_prefix_synthetic.yaml
@@ -55,7 +55,7 @@ spec:
       value: kalantar
     - name: model-id
       value: Qwen/Qwen3-0.6B
-    - name: experimentBaseUrl
+    - name: stackBaseUrl
       value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/
     - name: harnessProfile
       value: shared_prefix_synthetic.yaml
@@ -80,7 +80,7 @@ spec:
       value: kalantar
     - name: model-id
       value: Qwen/Qwen3-0.6B
-    - name: experimentBaseUrl
+    - name: stackBaseUrl
       value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/
     - name: harnessProfile
       value: shared_prefix_synthetic.yaml
@@ -105,7 +105,7 @@ spec:
       value: kalantar
     - name: model-id
       value: Qwen/Qwen3-0.6B
-    - name: experimentBaseUrl
+    - name: stackBaseUrl
       value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/
     - name: harnessProfile
       value: shared_prefix_synthetic.yaml
diff --git a/tekton-poc/pipeline/experiment-task.yaml b/tekton-poc/pipeline/tasks/treatment.yaml
similarity index 99%
rename from tekton-poc/pipeline/experiment-task.yaml
rename to tekton-poc/pipeline/tasks/treatment.yaml
index 8bcf868a..17844190 100644
--- a/tekton-poc/pipeline/experiment-task.yaml
+++ b/tekton-poc/pipeline/tasks/treatment.yaml
@@ -1,7 +1,7 @@
 apiVersion: tekton.dev/v1
 kind: Task
 metadata:
-  name: experiment
+  name: treatment
 spec:
   description: >
     Runs an llm-d-benchmark experiment.
@@ -52,7 +52,7 @@ spec:
     - name: gpuType
     - name: gpuMemory
-    - name: experimentBaseUrl
+    - name: stackBaseUrl
       type: string
     - name: experimentName
       type: string
@@ -468,7 +468,7 @@ spec:
     - name: timeout
       value: 15m
     - name: valuesYamlUrl
-      value: "$(params.experimentBaseUrl)/gateway-values.yaml"
+      value: "$(params.stackBaseUrl)/gateway-values.yaml"
     - name: dry-run
       value: $(params.dry-run)
@@ -489,7 +489,7 @@ spec:
     - name: timeout
       value: 15m
     - name: valuesYamlUrl
-      value: "$(params.experimentBaseUrl)/gaie-values.yaml"
+      value: "$(params.stackBaseUrl)/gaie-values.yaml"
     - name: treatmentAnalysis
       value: "$(steps.analyze-gaie-factors.results.treatmentAnalysis)"
@@ -514,7 +514,7 @@ spec:
     - name: timeout
       value: 15m
     - name: valuesYamlUrl
-      value: "$(params.experimentBaseUrl)/ms-values.yaml"
+      value: "$(params.stackBaseUrl)/ms-values.yaml"
     - name: extraArgs
       value: >
         --set routing.inferencePool.name=$(params.experimentName)-gaie-NAMESPACE_HASH

From 168739d8e7a1d42448c06fd5b0abde593e829aea Mon Sep 17 00:00:00 2001
From: Michael Kalantar
Date: Thu, 30 Oct 2025 12:50:48 -0400
Subject: [PATCH 44/44] update roles

Signed-off-by: Michael Kalantar
---
 tekton-poc/pipeline/roles.yaml             |  4 ++--
 tekton-poc/pipeline/steps/stepactions.yaml | 18 ++++++++++++++++++
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/tekton-poc/pipeline/roles.yaml b/tekton-poc/pipeline/roles.yaml
index 5f447233..b1ae2e54 100644
--- a/tekton-poc/pipeline/roles.yaml
+++ b/tekton-poc/pipeline/roles.yaml
@@ -28,7 +28,7 @@ rules:
     resources: ["gatewayparameters"]
     verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
   - apiGroups: ["inference.networking.x-k8s.io"]
-    resources: ["inferencepools", "inferencemodels"]
+    resources: ["inferencepools", "inferencemodels", "inferenceobjectives"]
     verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
   - apiGroups: ["authentication.k8s.io"]
     resources: ["tokenreviews"]
@@ -95,7 +95,7 @@ rules:
     resources: ["gatewayparameters"]
     verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
   - apiGroups: ["inference.networking.x-k8s.io"]
-    resources: ["inferencepools", "inferencemodels"]
+    resources: ["inferencepools", "inferencemodels", "inferenceobjectives"]
     verbs: ["get", "list", "watch", "create", "update", "patch", "delete"]
   - apiGroups: ["authentication.k8s.io"]
     resources: ["tokenreviews", "subjectaccessreviews"]
diff --git a/tekton-poc/pipeline/steps/stepactions.yaml b/tekton-poc/pipeline/steps/stepactions.yaml
index 282bfbff..fd1d07ee 100644
--- a/tekton-poc/pipeline/steps/stepactions.yaml
+++ b/tekton-poc/pipeline/steps/stepactions.yaml
@@ -165,6 +165,9 @@ spec:
     - name: extraArgs
       type: string
       default: ""
+    - name: extraValues
+      type: string
+      default: ""
     - name: treatmentAnalysis
       type: string
       default: ""
@@ -226,6 +229,8 @@ spec:
       value: "$(params.valuesYamlUrl)"
     - name: HELM_EXTRA_ARGS
       value: "$(params.extraArgs)"
+    - name: HELM_EXTRA_VALUES
+      value: "$(params.extraValues)"
     - name: TREATMENT_ANALYSIS
       value: "$(params.treatmentAnalysis)"
@@ -275,6 +280,14 @@ spec:
        VALUES_FLAG="-f /tmp/${HELM_RELEASE}-values.yaml"
      fi
+
+     if [ -n "${HELM_EXTRA_VALUES:-}" ]; then
+       echo ">>> HELM_EXTRA_VALUES"
+       printf "%s" "${HELM_EXTRA_VALUES}"
+       printf "%s" "${HELM_EXTRA_VALUES}" > /tmp/${HELM_RELEASE}-extra-values.yaml
+       VALUES_FLAG="${VALUES_FLAG} -f /tmp/${HELM_RELEASE}-extra-values.yaml"
+     fi
+
      # Optional repo add (idempotent via --force-update)
      if [ -n "${HELM_REPO_NAME:-}" ] && [ -n "${HELM_REPO_URL:-}" ]; then
        REPO_ADD_FLAGS="--force-update"
@@ -324,6 +337,11 @@ spec:
      echo "==> helm upgrade --install ${HELM_RELEASE} ${CHART_REF} --namespace ${HELM_NAMESPACE} ${VERSION_FLAG} ${CREATE_NS_FLAG} ${WAIT_FLAG} ${TIMEOUT_FLAG} ${VALUES_FLAG} ${HELM_EXTRA_ARGS}"
      # shellcheck disable=SC2086
      helm template \
        "${HELM_RELEASE}" "${CHART_REF}" \
        --namespace "${HELM_NAMESPACE}" \
        ${VERSION_FLAG} ${CREATE_NS_FLAG} ${WAIT_FLAG} ${TIMEOUT_FLAG} ${VALUES_FLAG} ${HELM_EXTRA_ARGS}
      # shellcheck disable=SC2086
      helm template \
        "${HELM_RELEASE}" "${CHART_REF}" \
        --namespace "${HELM_NAMESPACE}" \