diff --git a/charts/harness/.helmignore b/charts/harness/.helmignore new file mode 100644 index 00000000..898df488 --- /dev/null +++ b/charts/harness/.helmignore @@ -0,0 +1,24 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ + diff --git a/charts/harness/Chart.yaml b/charts/harness/Chart.yaml new file mode 100644 index 00000000..701fc7e4 --- /dev/null +++ b/charts/harness/Chart.yaml @@ -0,0 +1,40 @@ +apiVersion: v2 +name: llm-d-benchmark +description: A Helm chart for the experiment harness in llm-d-benchmark + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: "v0.0.1" + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "v0.3.0" + +maintainers: + - name: "Michael Kalantar" + email: "kalantar@us.ibm.com" + url: "https://github.com/kalantar" + +sources: + - https://github.com/llm-d/llm-d-benchmark + +# dependencies: +# - name: common +# repository: https://charts.bitnami.com/bitnami +# tags: +# - bitnami-common +# version: "2.27.0" + diff --git a/charts/harness/templates/_helpers.tpl b/charts/harness/templates/_helpers.tpl new file mode 100644 index 00000000..aa63cc97 --- /dev/null +++ b/charts/harness/templates/_helpers.tpl @@ -0,0 +1,31 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "harness.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + + +{{/* +Create chart name and version as used by the chart label. +Truncated to 63 characters because Kubernetes label values are limited to this length. +*/}} +{{- define "harness.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create common labels for the resources managed by this chart. +*/}} +{{- define "harness.labels" -}} +helm.sh/chart: {{ include "harness.chart" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{- define "harness.sanitizeString" -}} +{{- $input := . | lower | replace "." 
"-" | replace "/" "-" -}} +{{- $input -}} +{{- end -}} \ No newline at end of file diff --git a/charts/harness/templates/harness-pod.yaml b/charts/harness/templates/harness-pod.yaml new file mode 100644 index 00000000..014c265c --- /dev/null +++ b/charts/harness/templates/harness-pod.yaml @@ -0,0 +1,104 @@ +apiVersion: v1 +kind: Pod +metadata: + name: {{ .Values.harness.type }}-launcher + labels: + app: {{ .Values.harness.type }}-launcher +spec: + serviceAccountName: {{ include "harness.name" . }}-runner + containers: + - name: harness + image: "{{ .Values.harness.image.registry }}/{{ .Values.harness.image.repository }}/{{ .Values.harness.image.name }}:{{ .Values.harness.image.tag }}" + imagePullPolicy: {{ .Values.harness.image.pullPolicy }} + securityContext: + runAsUser: 0 + command: ["sh", "-c"] + args: + {{- toYaml .Values.harness.args | nindent 4 }} + env: + - name: LLMDBENCH_RUN_EXPERIMENT_LAUNCHER + value: "1" + - name: LLMDBENCH_RUN_EXPERIMENT_ANALYZE_LOCALLY + value: "0" + - name: LLMDBENCH_RUN_EXPERIMENT_HARNESS + value: "{{ .Values.harness.type }}-llm-d-benchmark.sh" + - name: LLMDBENCH_RUN_EXPERIMENT_ANALYZER + value: "{{ .Values.harness.type }}-analyze_results.sh" + - name: LLMDBENCH_RUN_EXPERIMENT_HARNESS_WORKLOAD_NAME + value: "{{ .Values.experiment.profile.name }}" + - name: LLMDBENCH_RUN_EXPERIMENT_ID + value: "{{ .Values.experiment.identifier }}" + - name: LLMDBENCH_HARNESS_NAME + value: "{{ .Values.harness.type }}" + - name: LLMDBENCH_RUN_EXPERIMENT_RESULTS_DIR + value: "/requests/{{ .Values.harness.type }}_{{ .Values.experiment.identifier }}_{{ .Values.stack.name }}" + - name: LLMDBENCH_CONTROL_WORK_DIR + value: "/requests/{{ .Values.harness.type }}_{{ .Values.experiment.identifier }}_{{ .Values.stack.name }}" + - name: LLMDBENCH_HARNESS_NAMESPACE + value: "{{ .Release.Namespace }}" + - name: LLMDBENCH_HARNESS_STACK_TYPE + value: "{{ .Values.stack.type }}" + - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL + value: "{{ .Values.stack.endpointUrl }}" + - name: LLMDBENCH_HARNESS_STACK_NAME + value: {{ include "harness.sanitizeString" .Values.stack.model | quote }} + - name: LLMDBENCH_DEPLOY_METHODS + value: "{{ .Values.stack.deployMethod }}" + - name: LLMDBENCH_MAGIC_ENVAR + value: "harness_pod" + + - name: LLMDBENCH_LLMD_IMAGE_REGISTRY + value: {{ .Values.harness.image.registry }} + - name: LLMDBENCH_LLMD_IMAGE_REPO + value: {{ .Values.harness.image.repository }} + - name: LLMDBENCH_LLMD_IMAGE_NAME + value: {{ .Values.harness.image.name }} + - name: LLMDBENCH_LLMD_IMAGE_TAG + value: {{ .Values.harness.image.tag | quote }} + {{- with .Values.harness.extraEnv }} + - name: {{ .name }} + value: "{{ .value }}" + {{- end }} + + # TBD add_env_vars_to_pod $LLMDBENCH_CONTROL_ENV_VAR_LIST_TO_POD + - name: LLMDBENCH_DEPLOY_CURRENT_MODEL + value: "{{ .Values.stack.model }}" + - name: LLMDBENCH_DEPLOY_CURRENT_MODELID + value: {{ include "harness.sanitizeString" .Values.stack.model | quote }} + - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_REPLICAS + value: "0" + - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_REPLICAS + value: "2" + - name: LLMDBENCH_VLLM_COMMON_AFFINITY + value: "nvidia.com/gpu.product:NVIDIA-H100-80GB-HBM3" + - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_TENSOR_PARALLELISM + value: "4" + - name: LLMDBENCH_VLLM_MODELSERVICE_DECODE_DATA_PARALLELISM + value: "1" + - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_TENSOR_PARALLELISM + value: "1" + - name: LLMDBENCH_VLLM_MODELSERVICE_PREFILL_DATA_PARALLELISM + value: "1" + + - name: HF_TOKEN_SECRET + value: "hf-secret" + - name: 
HUGGING_FACE_HUB_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + resources: + {{- toYaml .Values.harness.resources | nindent 6 }} + volumeMounts: + - name: results + mountPath: /requests + - name: {{ .Values.harness.type }}-profiles + mountPath: /workspace/profiles/{{ .Values.harness.type }} + volumes: + - name: results + persistentVolumeClaim: + claimName: {{ .Values.harness.resultsPVC }} + - name: {{ .Values.harness.type }}-profiles + configMap: + name: {{ .Values.harness.type }}-profiles + restartPolicy: Never \ No newline at end of file diff --git a/charts/harness/templates/harness-role.yaml b/charts/harness/templates/harness-role.yaml new file mode 100644 index 00000000..7aebcaa4 --- /dev/null +++ b/charts/harness/templates/harness-role.yaml @@ -0,0 +1,19 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: {{ include "harness.name" . }}-job-creator + labels: + {{- include "harness.labels" . | nindent 4 }} +rules: + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["create", "get", "list", "watch", "delete", "patch", "update"] + - apiGroups: [""] + resources: ["serviceaccounts"] + verbs: ["get"] + - apiGroups: [""] + resources: ["pods"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] \ No newline at end of file diff --git a/charts/harness/templates/harness-rolebinding.yaml b/charts/harness/templates/harness-rolebinding.yaml new file mode 100644 index 00000000..ec657601 --- /dev/null +++ b/charts/harness/templates/harness-rolebinding.yaml @@ -0,0 +1,27 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "harness.name" . }}-job-creator-binding + labels: + {{- include "harness.labels" . | nindent 4 }} +subjects: + - kind: ServiceAccount + name: {{ include "harness.name" . }}-runner +roleRef: + kind: Role + name: {{ include "harness.name" . }}-job-creator + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: {{ include "harness.name" . }}-restricted-scc + labels: + {{- include "harness.labels" . | nindent 4 }} +subjects: + - kind: ServiceAccount + name: {{ include "harness.name" . }}-runner +roleRef: + kind: ClusterRole + name: system:openshift:scc:restricted + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/charts/harness/templates/harness-sa.yaml b/charts/harness/templates/harness-sa.yaml new file mode 100644 index 00000000..f6a4a83f --- /dev/null +++ b/charts/harness/templates/harness-sa.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "harness.name" . }}-runner + labels: + {{- include "harness.labels" . 
| nindent 4 }} diff --git a/charts/harness/templates/inference-perf-profiles.yaml b/charts/harness/templates/inference-perf-profiles.yaml new file mode 100644 index 00000000..285107c6 --- /dev/null +++ b/charts/harness/templates/inference-perf-profiles.yaml @@ -0,0 +1,235 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: inference-perf-profiles +data: + chatbot_sharegpt.yaml: "load:\n type: constant\n stages:\n - rate: 1\n duration: + 120\n - rate: 2\n duration: 120\n - rate: 4\n duration: 120\n - rate: + 8\n duration: 120\napi:\n type: completion\n streaming: true\nserver:\n type: + vllm\n model_name: {{ .Values.stack.model }}\n base_url: {{ .Values.stack.endpointUrl }}\n + \ ignore_eos: true\ntokenizer:\n pretrained_model_name_or_path: {{ .Values.stack.model }}\ndata:\n + \ type: shareGPT\n input_distribution:\n min: 10 # min length + of the synthetic prompts\n max: 1024 # max length of the synthetic + prompts\n output_distribution:\n min: 10 # min length of the output + to be generated\n max: 1024 # max length of the output to be generated + \nreport:\n request_lifecycle:\n summary: true\n per_stage: true\n per_request: + true\nstorage:\n local_storage:\n path: /workspace" + chatbot_synthetic.yaml: |- + load: + type: constant + stages: + - rate: 1 + duration: 120 + - rate: 2 + duration: 120 + - rate: 4 + duration: 120 + - rate: 8 + duration: 120 + api: + type: completion + streaming: true + server: + type: vllm + model_name: {{ .Values.stack.model }} + base_url: {{ .Values.stack.endpointUrl }} + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: {{ .Values.stack.model }} + data: + type: random + input_distribution: + min: 10 # min length of the synthetic prompts + max: 8192 # max length of the synthetic prompts + mean: 4096 # mean length of the synthetic prompts + std: 2048 # standard deviation of the length of the synthetic prompts + total_count: 1000 # total number of prompts to generate to fit the above mentioned distribution constraints + output_distribution: + min: 10 # min length of the output to be generated + max: 2048 # max length of the output to be generated + mean: 1024 # mean length of the output to be generated + std: 512 # standard deviation of the length of the output to be generated + total_count: 1000 # total number of output lengths to generate to fit the above mentioned distribution constraints + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + storage: + local_storage: + path: /workspace + code_completion_synthetic.yaml: |- + load: + type: constant + stages: + - rate: 1 + duration: 120 + - rate: 2 + duration: 120 + - rate: 4 + duration: 120 + - rate: 8 + duration: 120 + api: + type: completion + streaming: true + server: + type: vllm + model_name: {{ .Values.stack.model }} + base_url: {{ .Values.stack.endpointUrl }} + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: {{ .Values.stack.model }} + data: + type: random + input_distribution: + min: 10 # min length of the synthetic prompts + max: 4096 # max length of the synthetic prompts + mean: 2048 # mean length of the synthetic prompts + std: 1024 # standard deviation of the length of the synthetic prompts + total_count: 1000 # total number of prompts to generate to fit the above mentioned distribution constraints + output_distribution: + min: 10 # min length of the output to be generated + max: 256 # max length of the output to be generated + mean: 128 # mean length of the output to be generated + std: 64 # standard deviation of the length of the 
output to be generated + total_count: 1000 # total number of output lengths to generate to fit the above mentioned distribution constraints + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + storage: + local_storage: + path: /workspace + sanity_random.yaml: |- + load: + type: constant + stages: + - rate: 1 + duration: 30 + api: + type: completion + streaming: true + server: + type: vllm + model_name: {{ .Values.stack.model }} + base_url: {{ .Values.stack.endpointUrl }} + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: {{ .Values.stack.model }} + data: + type: random + input_distribution: + min: 10 # min length of the synthetic prompts + max: 100 # max length of the synthetic prompts + mean: 50 # mean length of the synthetic prompts + std: 10 # standard deviation of the length of the synthetic prompts + total_count: 100 # total number of prompts to generate to fit the above mentioned distribution constraints + output_distribution: + min: 10 # min length of the output to be generated + max: 100 # max length of the output to be generated + mean: 50 # mean length of the output to be generated + std: 10 # standard deviation of the length of the output to be generated + total_count: 100 # total number of output lengths to generate to fit the above mentioned distribution constraints + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + storage: + local_storage: + path: /workspace + shared_prefix_synthetic.yaml: | + load: + type: constant + stages: + - rate: 2 + duration: 50 + - rate: 5 + duration: 50 + # - rate: 8 + # duration: 50 + # - rate: 10 + # duration: 50 + # - rate: 12 + # duration: 50 + # - rate: 15 + # duration: 50 + # - rate: 20 + # duration: 50 + api: + type: completion + streaming: true + server: + type: vllm + model_name: {{ .Values.stack.model }} + base_url: {{ .Values.stack.endpointUrl }} + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: {{ .Values.stack.model }} + data: + type: shared_prefix + shared_prefix: + # Number of distinct shared prefixes + num_groups: {{ .Values.experiment.profile.shared_prefix.num_groups }} + # Number of unique questions per shared prefix + num_prompts_per_group: {{ .Values.experiment.profile.shared_prefix.num_prompts_per_group }} + # Length of the shared prefix (in tokens) + system_prompt_len: {{ .Values.experiment.profile.shared_prefix.system_prompt_len }} + # Length of the unique question part (in tokens) + question_len: {{ .Values.experiment.profile.shared_prefix.question_len }} + # Target length for the model's generated output (in tokens) + output_len: {{ .Values.experiment.profile.shared_prefix.output_len }} + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + storage: + local_storage: + path: /workspace + summarization_synthetic.yaml: |- + load: + type: constant + stages: + - rate: 1 + duration: 120 + - rate: 2 + duration: 120 + - rate: 4 + duration: 120 + - rate: 8 + duration: 120 + api: + type: completion + streaming: true + server: + type: vllm + model_name: {{ .Values.stack.model }} + base_url: {{ .Values.stack.endpointUrl }} + ignore_eos: true + tokenizer: + pretrained_model_name_or_path: {{ .Values.stack.model }} + data: + type: random + input_distribution: + min: 10 # min length of the synthetic prompts + max: 4096 # max length of the synthetic prompts + mean: 2048 # mean length of the synthetic prompts + std: 1024 # standard deviation of the length of the synthetic prompts + total_count: 1000 # total number 
of prompts to generate to fit the above mentioned distribution constraints + output_distribution: + min: 10 # min length of the output to be generated + max: 512 # max length of the output to be generated + mean: 256 # mean length of the output to be generated + std: 128 # standard deviation of the length of the output to be generated + total_count: 1000 # total number of output lengths to generate to fit the above mentioned distribution constraints + report: + request_lifecycle: + summary: true + per_stage: true + per_request: true + storage: + local_storage: + path: /workspace diff --git a/charts/harness/values.yaml b/charts/harness/values.yaml new file mode 100644 index 00000000..faae6341 --- /dev/null +++ b/charts/harness/values.yaml @@ -0,0 +1,40 @@ +harness: + type: inference-perf + resultsPVC: workspace-pvc + image: + registry: ghcr.io + repository: llm-d + name: llm-d-benchmark + tag: v0.3.0rc2 + pullPolicy: Always + extraEnv: [] + args: ["llm-d-benchmark.sh"] + resources: + limits: + cpu: 16 + memory: 32Gi + requests: + cpu: 16 + memory: 32Gi + +stack: + type: "llm-d" + # model: + deployMethod: modelservice + # name + # endpointUrl + +experiment: + # identifier: + profile: + name: sanity_random.yaml + shared_prefix: + num_groups: 32 + num_prompts_per_group: 32 + system_prompt_len: 2048 + question_len: 256 + output_len: 256 + +nameOverride: "" +fullnameOverride: "" + diff --git a/charts/model-download/.helmignore b/charts/model-download/.helmignore new file mode 100644 index 00000000..898df488 --- /dev/null +++ b/charts/model-download/.helmignore @@ -0,0 +1,24 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ + diff --git a/charts/model-download/Chart.yaml b/charts/model-download/Chart.yaml new file mode 100644 index 00000000..04da0386 --- /dev/null +++ b/charts/model-download/Chart.yaml @@ -0,0 +1,40 @@ +apiVersion: v2 +name: llm-d-benchmark +description: A Helm chart for model download + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: "v0.0.1" + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. 
+appVersion: "v0.3.0" + +maintainers: + - name: "Michael Kalantar" + email: "kalantar@us.ibm.com" + url: "https://github.com/kalantar" + +sources: + - https://github.com/llm-d/llm-d-benchmark + +# dependencies: +# - name: common +# repository: https://charts.bitnami.com/bitnami +# tags: +# - bitnami-common +# version: "2.27.0" + diff --git a/charts/model-download/templates/_helpers.tpl b/charts/model-download/templates/_helpers.tpl new file mode 100644 index 00000000..2d518662 --- /dev/null +++ b/charts/model-download/templates/_helpers.tpl @@ -0,0 +1,31 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "download.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + + +{{/* +Create chart name and version as used by the chart label. +Truncated to 63 characters because Kubernetes label values are limited to this length. +*/}} +{{- define "download.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create common labels for the resources managed by this chart. +*/}} +{{- define "download.labels" -}} +helm.sh/chart: {{ include "download.chart" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{- define "download.sanitizeString" -}} +{{- $input := . | lower | replace "." "-" | replace "/" "-" -}} +{{- $input -}} +{{- end -}} \ No newline at end of file diff --git a/charts/model-download/templates/job.yaml b/charts/model-download/templates/job.yaml new file mode 100644 index 00000000..1ad41655 --- /dev/null +++ b/charts/model-download/templates/job.yaml @@ -0,0 +1,42 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: {{ include "download.name" . 
}}-job +spec: + template: + spec: + containers: + - name: downloader + image: python:3.10 + command: ["/bin/sh", "-c"] + args: + - > + export PATH="${PATH}:${HOME}/.local/bin"; + mkdir -p "${MOUNT_PATH}/${MODEL_PATH}"; + python -m pip install huggingface_hub; + hf auth login --token "${HF_TOKEN}"; + hf download "${HF_MODEL_ID}" --local-dir "/cache/${MODEL_PATH}" + env: + - name: MODEL_PATH + value: models/{{ required "ERROR .Values.hf_model must be set" .Values.hf_model }} + - name: HF_MODEL_ID + value: {{ .Values.hf_model }} + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: {{ .Values.hf_secret }} + key: HF_TOKEN + - name: HF_HOME + value: /tmp/huggingface + - name: HOME + value: /tmp + - name: MOUNT_PATH + value: /cache + volumeMounts: + - name: model-cache + mountPath: /cache + restartPolicy: OnFailure + volumes: + - name: model-cache + persistentVolumeClaim: + claimName: {{ .Values.pvc.name }} \ No newline at end of file diff --git a/charts/model-download/templates/pvc.yaml b/charts/model-download/templates/pvc.yaml new file mode 100644 index 00000000..3c367832 --- /dev/null +++ b/charts/model-download/templates/pvc.yaml @@ -0,0 +1,14 @@ +{{- if .Values.pvc.create }} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ .Values.pvc.name }} +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: {{ .Values.pvc.size }} + storageClassName: {{ .Values.pvc.storageClass }} + volumeMode: Filesystem +{{- end }} \ No newline at end of file diff --git a/charts/model-download/values.yaml b/charts/model-download/values.yaml new file mode 100644 index 00000000..f4ca639d --- /dev/null +++ b/charts/model-download/values.yaml @@ -0,0 +1,8 @@ +# hf_model: # required +hf_secret: hf-secret + +pvc: + name: model-pvc + create: false + size: 5Gi + storageClass: default diff --git a/tekton-poc/README.md b/tekton-poc/README.md new file mode 100644 index 00000000..44616393 --- /dev/null +++ b/tekton-poc/README.md @@ -0,0 +1,222 @@ +# Benchmarking with Tekton + +This folder contains a proof of concept for running llm-d-benchmark experiments as Tekton pipelines. + +## Tekton Basics +A **Pipeline** is a set of **Tasks**. By default, Tasks run in parallel. The execution flow can be controlled implicitly (one task consuming a result of another) or explicitly with mechanisms like `runAfter`, `when` and `finally`. +A **Task** is a sequence of **Steps**. Steps run sequentially. A Step can programmatically determine whether to execute or skip. + +To execute a **Pipeline**, create a **PipelineRun**, +an object that identifies: + - the Pipeline to execute and + - the values of any parameters + +Tekton creates a **TaskRun** for each Task in the Pipeline. +A TaskRun is an object that identifies: + - the Task and + - the values of any parameters (passed from the PipelineRun) + +The TaskRun is implemented by a Pod. +Each Step is implemented by a Container in the Pod. + +## Supported Benchmarking Use Cases + +Given a matrix of factors and values, measure the performance of a model over each combination of values. +Factors may be model deployment related, such as: model, endpoint picker configuration, parallelism, ... +Factors may also be workload related, for example: question_len, output_len, workload_profile, ... + +This proof of concept currently implements a variation of the inference-scheduling [scenario](https://github.com/llm-d/llm-d-benchmark/blob/main/scenarios/guides/inference-scheduling.sh)/[experiment](https://github.com/llm-d/llm-d-benchmark/blob/main/experiments/inference-scheduling.yaml). 
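+
+Such a factor matrix maps directly onto a Tekton `matrix` on the experiment Task. The abridged sketch below mirrors `pipeline/pipelinerun-matrix.yaml` later in this PR (the factor names and values shown here are an illustrative subset); Tekton fans the matrix out into one TaskRun per combination:
+
+```yaml
+    tasks:
+      - name: run-experiment
+        taskRef:
+          name: experiment          # the Task that deploys the stack and runs one workload
+        matrix:
+          params:
+            - name: gaiePluginConfig # deployment factor: endpoint picker (GAIE) configuration
+              value: ["inf-sche-none.yaml", "inf-sche-prefix.yaml"]
+            - name: question_len     # workload factor: question length in tokens
+              value: ["100", "300"]
+            - name: output_len       # workload factor: output length in tokens
+              value: ["100", "300"]
+```
+
+With the values in this sketch, Tekton would create 2 × 2 × 2 = 8 TaskRuns, one per combination of factor values.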
+ +## Approach + +A single Task measures performance over a single set of values from the factor/values matrix. This task implements steps: + +1. Create/prepare an experiment namespace +2. Deploy a Gateway +3. Configure GAIE +4. Download the model from HuggingFace to a PVC +5. Deploy the model +6. Run the workload for a single set of parameters +7. Upload the results to external storage (s3) +8. Delete the experiment namespace (not yet implemented) + +A PipelineRun is created that embeds a Pipeline containing one Task with a matrix of values for a set of factors. An example is `pipelinerun-matrix.yaml`. + +## Usage + +### Requirements + +1. HF token +2. s3 bucket and necessary keys +3. + +### Setup + +1. Create a namespace where the Tekton pipeline will execute. + ```shell + export NAMESPACE=your_namespace + kubectl create ns $NAMESPACE + ``` + For convenience, set the current context: + ```shell + kubectl config set-context --current --namespace $NAMESPACE + ``` + +2. Create a secret `hf-secret` containing your HuggingFace token in the namespace. + ```shell + kubectl create secret generic hf-secret \ + --namespace ${NAMESPACE} \ + --from-literal="HF_TOKEN=${HF_TOKEN}" \ + --dry-run=client -o yaml | kubectl apply -f - + ``` + +3. Create a secret containing your s3 credentials `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`. + +4. Give the task the needed permissions + ```shell + envsubst '$NAMESPACE' < pipeline/roles.yaml | kubectl apply -f - + ``` + +5. Create a RWX PVC `workspace-pvc` for storing execution results. This PVC is shared between all tasks. For example: + ```shell + cat < -f +``` + +Describe a `TaskRun`: + +```shell +tkn tr describe +``` + +### Cleanup + +Delete the `PipelineRun`: + +```shell +tkn pr delete -f +``` + +**Note**: The current implementation does not remove the namespaces created by each sweep step. Manually delete them to release all their resources. If you leave them, subsequent executions of the pipeline will attempt to reuse the resources. + +## Managing Parallelism + +The default PipelineSpec (in `pipeline/pipelinerun-matrix.yaml`) executes all the tasks in parallel. It can be modified in a number of ways to reduce the amount of parallel execution (at the expense of time). + +Some examples are provided (**Note**: the examples need to be updated): + +- `pipeline/pipelinerun-matrix-subset.yaml`: Uses `matrix.include` to list an explicit set of combinations to execute. +- `pipeline/pipelinerun-sequential-1.yaml`: Executes 1 task at a time. Each task depends on the previous one. +- `pipeline/pipelinerun-sequential-4-barrier.yaml`: Executes 4 tasks at a time. When all 4 complete, the next 4 start. +- `pipeline/pipelinerun-sequential-4-sliding.yaml`: Executes 4 tasks at a time. When one task completes, another starts. +- `pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml`: Creates one task for each value of one dimension of the matrix. Each is executed in sequence. However, for other dimensions, parallel execution takes place. + +The utility script `utility/transform-pr-parallel.py` can be used to transform a default `PipelineRun` into alternatives as follows: + +1. Unroll a single parameter into one `Task` per value. Each resulting Task defines a matrix over the remaining parameters. 
+ + ```shell + python transform-pr.py pipelinerun-matrix.yaml --unroll gaiePluginConfig -o pr-unrolled.yaml + ``` + +2. Unroll multiple parameters into [their Cartesian product] Tasks. Each resulting Task defines a matrix over the remaining parameters. + + ```shell + python transform-pr.py pipelinerun-matrix.yaml --unroll gaiePluginConfig,question_len -o pr-unrolled-2.yaml + ``` + +3. Unroll all the parameters into [their Cartesian product] Tasks. Allow _n_ to run at once. This can be done using a _barrier_ strategy or a _sliding window_ strategy. + + ```shell + # Barrier (default) + python transform-pr.py pipelinerun-matrix.yaml -n 3 -o pr-expanded-barrier.yaml + + # Sliding window + python transform-pr.py pipelinerun-matrix.yaml -n 3 --sliding-window -o pr-expanded-sliding.yaml + ``` + +## Cautions + +- be sure to set the namespace parameter in the pipeline run; this is where the pipeline runs and is the base of the name for each experiment +- the upload of data is not yet implemented +- there are hardcoded assumptions/values about the use case in several places; these will be removed as more use cases are explored + + +## To Do + +- modify script to handle unroll better +- modify script to handle unroll and n together + +- single experiment namespace (possibly different from tekton ns) +- use more stepActions +- incorporate memory planner (Jing) +- PD example (Nick) + - [IN PROGRESS] deployment of the pd scenario + - [DONE] wait for model download + - [NOT STARTED] move from helm chart (job) to step - depends on ns change + - [NOT STARTED] debug --tensor-parallel-size argument + - [DONE] enabling multiple harnesses (inference-perf and vllm-benchmark) + - [DONE] making factors/treatments general (they are hardcoded) + - [DONE] use capacity planner to determine whether or not to continue + - [IN PROGRESS] move step implementations to stepactions + - [NOT STARTED] move from multiple namespaces to single namespace + +- should we have a convert step independent of the analysis step? +- eventually one for analysis based on analysis of converted results + +- wrapper to generate pipelineRun +- generate task? + +- missing steps: validate accelerator configuration (wrt the cluster) \ No newline at end of file diff --git a/tekton-poc/examples/inference-scheduling/gaie-values.yaml b/tekton-poc/examples/inference-scheduling/gaie-values.yaml new file mode 100644 index 00000000..08d28bc6 --- /dev/null +++ b/tekton-poc/examples/inference-scheduling/gaie-values.yaml @@ -0,0 +1,150 @@ +inferenceExtension: + replicas: 1 + image: + # Either image will work, you just need to bring the correct plugins per image. 
In this example we will bring the upstream default plugin + ################### + name: llm-d-inference-scheduler + hub: ghcr.io/llm-d + tag: v0.2.1 + pullPolicy: Always + extProcPort: 9002 + extraContainerPorts: + - name: zmq + containerPort: 5557 + protocol: TCP + extraServicePorts: + - name: zmq + port: 5557 + targetPort: 5557 + protocol: TCP + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + pluginsConfigFile: "inf-sche-none.yaml" + pluginsCustomConfig: + inf-sche-none.yaml: | + # Sample EPP configuration for running without P/D with no scorers + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-none.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: prefix-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 0 + inf-sche-prefix-kv-queue.yaml: | + # Sample EPP configuration for running without P/D with prefix, kv, and queue scorers + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-prefix-kv-queue.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: prefix-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + - type: kv-cache-scorer + - type: queue-cache-scorer + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 1 + - pluginRef: kv-cache-scorer + weight: 1 + - pluginRef: queue-scorer + weight: 1 + inf-sche-prefix-kv.yaml: | + # Sample EPP configuration for running without P/D with prefix and kv scorers + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-prefix-kv.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: prefix-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + - type: kv-cache-scorer + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 1 + - pluginRef: kv-cache-scorer + weight: 1 + - pluginRef: queue-scorer + weight: 1 + inf-sche-prefix.yaml: | + # Sample EPP configuration for running without P/D with prefix scorer with weight of 1 + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-prefix.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: prefix-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: prefix-cache-scorer + weight: 1 + inf-sche-queue.yaml: | + # Sample EPP configuration for running without P/D with no scorers + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-queue.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: queue-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + 
schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: queue-scorer + weight: 1 + inf-sche-kv.yaml: | + # Sample EPP configuration for running without P/D with no scorers + # https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/setup/presets/gaie/inf-sche-kv.yaml + apiVersion: inference.networking.x-k8s.io/v1alpha1 + kind: EndpointPickerConfig + plugins: + - type: kv-cache-scorer + - type: decode-filter + - type: max-score-picker + - type: single-profile-handler + schedulingProfiles: + - name: default + plugins: + - pluginRef: decode-filter + - pluginRef: max-score-picker + - pluginRef: kv-cache-scorer + weight: 1 +inferencePool: + targetPortNumber: 8000 + modelServerType: vllm + apiVersion: "inference.networking.x-k8s.io/v1alpha2" + modelServers: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: qwen-qwen3-0-6b +provider: + name: none diff --git a/tekton-poc/examples/inference-scheduling/gateway-values.yaml b/tekton-poc/examples/inference-scheduling/gateway-values.yaml new file mode 100644 index 00000000..b22f8140 --- /dev/null +++ b/tekton-poc/examples/inference-scheduling/gateway-values.yaml @@ -0,0 +1,8 @@ +gateway: + gatewayClassName: kgateway + service: + type: NodePort + destinationRule: + host: gaie-inference-scheduling-epp.kalantar-is.svc.cluster.local + gatewayParameters: + enabled: true diff --git a/tekton-poc/examples/inference-scheduling/ms-values.yaml b/tekton-poc/examples/inference-scheduling/ms-values.yaml new file mode 100644 index 00000000..a02e894b --- /dev/null +++ b/tekton-poc/examples/inference-scheduling/ms-values.yaml @@ -0,0 +1,237 @@ +fullnameOverride: qwen-qwen3-0-6b +multinode: false + +modelArtifacts: + uri: pvc://model-pvc/models/Qwen/Qwen3-0.6B + size: 300Gi + authSecretName: "hf-secret" + name: Qwen/Qwen3-0.6B + +routing: + servicePort: 8000 + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: experiment-gateway-inference-gateway + proxy: + image: "ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0" + secure: false + connector: nixlv2 + debugLevel: 3 + inferenceModel: + create: true + inferencePool: + create: false + name: experiment-gaie + httpRoute: + create: true + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: experiment-gaie + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + matches: + - path: + type: PathPrefix + value: /qwen-qwen3-0-6b/ + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: experiment-gaie + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + + epp: + create: false + +decode: + create: true + replicas: 2 + acceleratorTypes: + labelKey: nvidia.com/gpu.product + labelValues: + - NVIDIA-H100-80GB-HBM3 + parallelism: + data: 1 + tensor: 1 + annotations: + deployed-by: jchen + modelservice: llm-d-benchmark + podAnnotations: + deployed-by: jchen + modelservice: llm-d-benchmark + #no____config + containers: + - name: "vllm" + mountModelVolume: true + image: "ghcr.io/llm-d/llm-d:v0.2.0" + modelCommand: vllmServe + + args: + - "--enforce-eager" + - "--block-size" + - "64" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--tensor-parallel-size" + - "1" + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16000" + env: + - 
name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "1" + resources: + limits: + memory: 64Gi + cpu: "16" + + nvidia.com/gpu: "1" + + requests: + memory: 64Gi + cpu: "16" + + nvidia.com/gpu: "1" + + extraConfig: + startupProbe: + httpGet: + path: /health + port: 8200 + failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: 8200 + failureThreshold: 3 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: 8200 + failureThreshold: 3 + periodSeconds: 5 + + ports: + - containerPort: 5557 + protocol: TCP + - containerPort: 8200 + name: metrics + protocol: TCP + volumeMounts: [] + volumes: [] + +prefill: + create: false + replicas: 0 + acceleratorTypes: + labelKey: nvidia.com/gpu.product + labelValues: + - NVIDIA-H100-80GB-HBM3 + parallelism: + data: 1 + tensor: 1 + annotations: + deployed-by: jchen + modelservice: llm-d-benchmark + podAnnotations: + deployed-by: jchen + modelservice: llm-d-benchmark + #no____config + containers: + - name: "vllm" + mountModelVolume: true + image: "ghcr.io/llm-d/llm-d:v0.2.0" + modelCommand: vllmServe + + args: + - "--disable-log-requests" + - "--max-model-len" + - "16000" + - "--tensor-parallel-size" + - "1" + env: + - name: VLLM_IS_PREFILL + value: "1" + - name: UCX_TLS + value: "cuda_ipc,cuda_copy,tcp" + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "1" + resources: + limits: + memory: 40Gi + cpu: "4" + + nvidia.com/gpu: "0" + + requests: + memory: 40Gi + cpu: "4" + + nvidia.com/gpu: "0" + + extraConfig: + startupProbe: + httpGet: + path: /health + port: 8000 + failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: 8000 + failureThreshold: 3 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: 8000 + failureThreshold: 3 + periodSeconds: 5 + + ports: + - containerPort: 5557 + protocol: TCP + - containerPort: 8200 + name: metrics + protocol: TCP + volumeMounts: [] + volumes: [] \ No newline at end of file diff --git a/tekton-poc/examples/pd-disaggregation/gaie-values.yaml b/tekton-poc/examples/pd-disaggregation/gaie-values.yaml new file mode 100644 index 00000000..b860acef --- /dev/null +++ b/tekton-poc/examples/pd-disaggregation/gaie-values.yaml @@ -0,0 +1,36 @@ +inferenceExtension: + replicas: 1 + image: + name: llm-d-inference-scheduler + hub: ghcr.io/llm-d + tag: v0.2.1 + pullPolicy: Always + extProcPort: 9002 + extraContainerPorts: + - name: zmq + containerPort: 5557 + protocol: TCP + extraServicePorts: + - name: zmq + port: 5557 + targetPort: 5557 + protocol: TCP + env: + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + pluginsConfigFile: "plugins-v2.yaml" + +inferencePool: + targetPortNumber: 8000 + modelServerType: vllm + apiVersion: "inference.networking.x-k8s.io/v1alpha2" + modelServers: + matchLabels: + llm-d.ai/inferenceServing: "true" + llm-d.ai/model: meta-llama-llama-3-1-8b-instruct +provider: + name: none + diff --git a/tekton-poc/examples/pd-disaggregation/gateway-values.yaml 
b/tekton-poc/examples/pd-disaggregation/gateway-values.yaml new file mode 100644 index 00000000..3f836050 --- /dev/null +++ b/tekton-poc/examples/pd-disaggregation/gateway-values.yaml @@ -0,0 +1,8 @@ +gateway: + gatewayClassName: kgateway + service: + type: NodePort + destinationRule: + host: experiment-gaie-685a862b-epp.kalantar-is.svc.cluster.local + gatewayParameters: + enabled: true diff --git a/tekton-poc/examples/pd-disaggregation/ms-values.yaml b/tekton-poc/examples/pd-disaggregation/ms-values.yaml new file mode 100644 index 00000000..7a5be879 --- /dev/null +++ b/tekton-poc/examples/pd-disaggregation/ms-values.yaml @@ -0,0 +1,260 @@ +fullnameOverride: meta-llama-llama-3-1-8b-instruct +multinode: false + +modelArtifacts: + uri: pvc://model-pvc/models/meta-llama/Llama-3.1-8B-Instruct + size: 300Gi + authSecretName: "hf-secret" + name: meta-llama/Llama-3.1-8B-Instruct + +routing: + servicePort: 8000 + parentRefs: + - group: gateway.networking.k8s.io + kind: Gateway + name: experiment-gateway-inference-gateway + proxy: + image: "ghcr.io/llm-d/llm-d-routing-sidecar:v0.3.0" + secure: false + connector: nixlv2 + debugLevel: 3 + inferenceModel: + create: true + inferencePool: + create: false + name: meta-lla-1b4505f6-instruct-gaie + httpRoute: + create: true + rules: + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: meta-lla-1b4505f6-instruct-gaie + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + matches: + - path: + type: PathPrefix + value: /meta-llama-llama-3-1-8b-instruct/ + filters: + - type: URLRewrite + urlRewrite: + path: + type: ReplacePrefixMatch + replacePrefixMatch: / + - backendRefs: + - group: inference.networking.x-k8s.io + kind: InferencePool + name: meta-lla-1b4505f6-instruct-gaie + port: 8000 + weight: 1 + timeouts: + backendRequest: 0s + request: 0s + + epp: + create: false + +decode: + create: true + replicas: 3 + acceleratorTypes: + labelKey: nvidia.com/gpu.product + labelValues: + - NVIDIA-H100-80GB-HBM3 + parallelism: + data: 1 + tensor: 4 + annotations: + deployed-by: nick + modelservice: llm-d-benchmark + podAnnotations: + deployed-by: nick + modelservice: llm-d-benchmark + k8s.v1.cni.cncf.io/networks: multi-nic-compute + #no____config + containers: + - name: "vllm" + mountModelVolume: true + image: "ghcr.io/llm-d/llm-d:v0.2.0" + modelCommand: vllmServe + + args: + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16000" + env: + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: "rc,sm,cuda_ipc,cuda_copy,tcp" + - name: UCX_SOCKADDR_TLS_PRIORITY + value: "tcp" + - name: UCX_NET_DEVICES + value: mlx5_1:1 + - name: NCCL_IB_HCA + value: mlx5_1 + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "1" + resources: + limits: + memory: 128Gi + cpu: "32" + + nvidia.com/gpu: "4" + rdma/roce_gdr: "1" + requests: + memory: 128Gi + cpu: "32" + + nvidia.com/gpu: "4" + rdma/roce_gdr: "1" + extraConfig: + startupProbe: + httpGet: + path: /health + port: 8200 + failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: 8200 + failureThreshold: 3 + periodSeconds: 5 
+ readinessProbe: + httpGet: + path: /health + port: 8200 + failureThreshold: 3 + periodSeconds: 5 + #no____config + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 16Gi + +prefill: + create: true + replicas: 1 + acceleratorTypes: + labelKey: nvidia.com/gpu.product + labelValues: + - NVIDIA-H100-80GB-HBM3 + parallelism: + data: 1 + tensor: 4 + annotations: + deployed-by: nick + modelservice: llm-d-benchmark + podAnnotations: + deployed-by: nick + modelservice: llm-d-benchmark + k8s.v1.cni.cncf.io/networks: multi-nic-compute + #no____config + containers: + - name: "vllm" + mountModelVolume: true + image: "ghcr.io/llm-d/llm-d:v0.2.0" + modelCommand: vllmServe + + args: + - "--block-size" + - "128" + - "--kv-transfer-config" + - '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' + - "--disable-log-requests" + - "--disable-uvicorn-access-log" + - "--max-model-len" + - "16000" + env: + - name: VLLM_IS_PREFILL + value: "1" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: UCX_TLS + value: "rc,sm,cuda_ipc,cuda_copy,tcp" + - name: UCX_SOCKADDR_TLS_PRIORITY + value: "tcp" + - name: UCX_NET_DEVICES + value: mlx5_1:1 + - name: NCCL_IB_HCA + value: mlx5_1 + - name: VLLM_NIXL_SIDE_CHANNEL_PORT + value: "5557" + - name: VLLM_NIXL_SIDE_CHANNEL_HOST + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: VLLM_LOGGING_LEVEL + value: DEBUG + - name: VLLM_ALLOW_LONG_MAX_MODEL_LEN + value: "1" + resources: + limits: + memory: 128Gi + cpu: "32" + + nvidia.com/gpu: "4" + rdma/roce_gdr: "1" + requests: + memory: 128Gi + cpu: "32" + + nvidia.com/gpu: "4" + rdma/roce_gdr: "1" + extraConfig: + startupProbe: + httpGet: + path: /health + port: 8000 + failureThreshold: 60 + initialDelaySeconds: 30 + periodSeconds: 30 + timeoutSeconds: 5 + livenessProbe: + tcpSocket: + port: 8000 + failureThreshold: 3 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: 8000 + failureThreshold: 3 + periodSeconds: 5 + #no____config + volumeMounts: + - name: dshm + mountPath: /dev/shm + volumes: + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 16Gi + diff --git a/tekton-poc/pipeline/pd-disaggregation-pr.yaml b/tekton-poc/pipeline/pd-disaggregation-pr.yaml new file mode 100644 index 00000000..8be4b769 --- /dev/null +++ b/tekton-poc/pipeline/pd-disaggregation-pr.yaml @@ -0,0 +1,239 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: pd +spec: + taskRunTemplate: + serviceAccountName: helm-installer + taskRunSpecs: + - pipelineTaskName: treatment + computeResources: + requests: + memory: "16Gi" + cpu: "8" + # memory: "32Gi" + # cpu: "16" + limits: + memory: "16Gi" + cpu: "8" + # memory: "32Gi" + # cpu: "16" + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + params: + - name: targetNamespacePrefix + # This can be anything. + value: $(context.pipelineRun.namespace) + - name: model-id + value: "meta-llama/Llama-3.1-8B-Instruct" + # value: "meta-llama/Llama-3.1-70B-Instruct" + + # Properties needed to evaluate stack capacity (will it be able to host the model)? 
+ - name: validateCapacity + value: true + - name: behaviorOnValidationFailure + value: terminate + - name: maxModelLength + value: 16000 + # will be set via treatment below + # - name: decodeReplicas + # - name: decodeTensorParallelism + - name: decodeDataParallelism + value: 1 + # If not set, will be set to decodeTensorParallelism * decodeDataParallelism + # - name: decodeNumGpus + + # will be set via treatment below + # - name: prefillReplicas + # - name: prefillTensorParallelism + - name: prefillDataParallelism + value: 1 + # If not set, will be set to prefillTensorParallelism * prefillDataParallelism + # - name: prefillNumGpus + + # Rely on default value + # Assume the same for prefill and decode + # - name: targetGpuMemoryUtilization + + # Required + # Assume the same for prefill and decode + # TBD - attempt to read from the cluster + - name: gpuType + value: "NVIDIA-H100-80GB-HBM3" + - name: gpuMemory + value: 80 #GB + + # Harness / Workload + - name: harnessName + value: vllm-benchmark + - name: harnessProfile + value: random_concurrent.yaml + + # Output Location + - name: s3-keys + value: ibm-cos-secret + - name: s3-bucket + value: "cloud-object-storage-cos-standard-ere" + - name: s3-endpoint + value: "https://s3.us-east.cloud-object-storage.appdomain.cloud" + + # Control + - name: debug + value: true + + pipelineSpec: + workspaces: + - name: data + params: + - name: maxModelLength + default: "" + - name: decodeReplicas + default: "" + - name: decodeTensorParallelism + default: "" + - name: decodeDataParallelism + default: "" + - name: decodeNumGpus + default: "" + - name: prefillReplicas + default: "" + - name: prefillTensorParallelism + default: "" + - name: prefillDataParallelism + default: "" + - name: prefillNumGpus + default: "" + + tasks: + - name: treatment + taskRef: + name: treatment + workspaces: + - name: data + workspace: data + params: + - name: model-id + value: $(params.model-id) + + # Properties needed to evaluate stack capacity (will it be able to host the model)? 
+ - name: validateCapacity + value: $(params.validateCapacity) + - name: behaviorOnValidationFailure + value: $(params.behaviorOnValidationFailure) + + - name: maxModelLength + value: $(params.maxModelLength) + + - name: decodeReplicas + value: $(params.decodeReplicas) + - name: decodeTensorParallelism + value: $(params.decodeTensorParallelism) + - name: decodeDataParallelism + value: $(params.decodeDataParallelism) + - name: decodeNumGpus + value: $(params.decodeNumGpus) + + - name: prefillReplicas + value: $(params.prefillReplicas) + - name: prefillTensorParallelism + value: $(params.prefillTensorParallelism) + - name: prefillDataParallelism + value: $(params.prefillDataParallelism) + - name: prefillNumGpus + value: $(params.prefillNumGpus) + + - name: gpuType + value: $(params.gpuType) + - name: gpuMemory + value: $(params.gpuMemory) + + - name: targetNamespacePrefix + value: $(params.targetNamespacePrefix) + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/pd-disaggregation/ + + - name: s3-keys + value: $(params.s3-keys) + - name: s3-bucket + value: $(params.s3-bucket) + - name: s3-endpoint + value: $(params.s3-endpoint) + + - name: harnessName + value: $(params.harnessName) + - name: harnessProfile + value: $(params.harnessProfile) + + - name: factorMapping + value: | + { + "modelservice": { + "prefillReplicas": "prefill.replicas", + "prefillTensorParallelism": "prefill.parallelism.tensor", + "decodeReplicas": "decode.replicas", + "decodeTensorParallelism": "decode.parallelism.tensor" + }, + "gaie": { + "gaiePluginConfig": "inferenceExtension.pluginsConfigFile" + }, + "workload": { + "max-concurrency": "max-concurrency", + "num_prompts": "num-prompts", + "question_len": "data.shared_prefix.question_len", + "output_len": "data.shared_prefix.output_len" + } + } + + - name: debug + value: "$(params.debug)" + - name: step-upload-results + value: false + - name: pipelineUID + value: "$(context.pipelineRun.uid)" + + matrix: + include: + - name: combo-0 + params: + - name: treatment + value: | + { + "prefillReplicas": 1, + "prefillTensorParallelism": 1, + "decodeReplicas": 1, + "decodeTensorParallelism": 2, + "max-concurrency": 1, + "num-prompts": 10 + } + # - name: combo-1 + # params: + # - name: treatment + # value: | + # { + # "prefillReplicas": 1, + # "prefillTensorParallelism": 2, + # "decodeReplicas": 1, + # "decodeTensorParallelism": 1, + # "max-concurrency": 1, + # "num-prompts": 10 + # } + + # params: + # - name: max-concurrency + # value: + # - "1" + # # - "8" + # # - "32" + # # - "64" + # # - "128" + # # - "256" + # - name: num-prompts + # value: + # - "10" + # # - "80" + # # - "320" + # # - "640" + # # - "1280" + # # - "2560" diff --git a/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml new file mode 100644 index 00000000..f805f9ec --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-matrix-subset.yaml @@ -0,0 +1,120 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + params: + - name: targetNamespacePrefix + # This can be anything. 
+ value: $(context.pipelineRun.namespace) + - name: model-id + value: "Qwen/Qwen3-0.6B" + + # Harness / Workload + - name: harnessName + value: inference-perf + - name: harnessProfile + value: shared_prefix_synthetic_short.yaml + + # Output Location + - name: s3-keys + value: ibm-cos-secret + - name: s3-bucket + value: "cloud-object-storage-cos-standard-ere" + - name: s3-endpoint + value: "https://s3.us-east.cloud-object-storage.appdomain.cloud" + + # Control + - name: debug + value: true + - name: step-upload-results + value: false + + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: targetNamespacePrefix + value: $(params.targetNamespacePrefix) + - name: model-id + value: $(params.model-id) + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + + - name: s3-keys + value: $(params.s3-keys) + - name: s3-bucket + value: $(params.s3-bucket) + - name: s3-endpoint + value: $(params.s3-endpoint) + + - name: harnessName + value: $(params.harnessName) + - name: harnessProfile + value: $(params.harnessProfile) + + - name: factorMapping + value: | + { + "modelservice": { + "prefillReplicas": "prefill.replicas", + "prefillTensorParallelism": "prefill.parallelism.tensor", + "decodeReplicas": "decode.replicas", + "decodeTensorParallelism": "decode.parallelism.tensor" + }, + "gaie": { + "gaiePluginConfig": "inferenceExtension.pluginsConfigFile" + }, + "workload": { + "max-concurrency": "max-concurrency", + "num_prompts": "num-prompts", + "question_len": "data.shared_prefix.question_len", + "output_len": "data.shared_prefix.output_len" + } + } + + - name: max-concurrency + value: "1" + - name: num-prompts + value: "10" + + - name: debug + value: "$(params.debug)" + - name: step-upload-results + value: "$(params.step-upload-results)" + - name: pipelineUID + value: "$(context.pipelineRun.uid)" + matrix: + include: + - name: combo-0 + params: + - name: treatment + value: | + { + "gaiePluginConfig": "inf-sche-queue.yaml", + "question_len": 100, + "output_len": 100 + } + - name: combo-1 + params: + - name: treatment + value: | + { + "gaiePluginConfig": "inf-sche-prefix.yaml", + "question_len": 300, + "output_len": 300 + } + diff --git a/tekton-poc/pipeline/pipelinerun-matrix.yaml b/tekton-poc/pipeline/pipelinerun-matrix.yaml new file mode 100644 index 00000000..529ec2b9 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-matrix.yaml @@ -0,0 +1,89 @@ +##### +# This is an example of how the matrix specification works. +# It is currently out of date. +# To test, use pipelinerun-matrix-subset.yaml instead. +##### +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + params: + - name: targetNamespacePrefix + # This can be anything. 
+ value: $(context.pipelineRun.namespace) + - name: model-id + value: "Qwen/Qwen3-0.6B" + + # Harness / Workload + - name: harnessProfile + value: shared_prefix_synthetic_short.yaml + + # Output Location + - name: s3-keys + value: ibm-cos-secret + - name: s3-bucket + value: "cloud-object-storage-cos-standard-ere" + - name: s3-endpoint + value: "https://s3.us-east.cloud-object-storage.appdomain.cloud" + + # Control + - name: debug + value: false + + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: targetNamespacePrefix + value: $(params.targetNamespacePrefix) + - name: model-id + value: $(params.model-id) + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + + - name: s3-keys + value: $(params.s3-keys) + - name: s3-bucket + value: $(params.s3-bucket) + - name: s3-endpoint + value: $(params.s3-endpoint) + + - name: harnessProfile + value: $(params.harnessProfile) + + - name: debug + value: "$(params.debug)" + - name: pipelineUID + value: "$(context.pipelineRun.uid)" + matrix: + params: + - name: gaiePluginConfig + value: + - "inf-sche-none.yaml" + - "inf-sche-prefix.yaml" + - "inf-sche-kv.yaml" + - "inf-sche-queue.yaml" + - name: question_len + value: + - "100" + - "300" + - "1000" + - name: output_len + value: + - "100" + - "300" + - "1000" diff --git a/tekton-poc/pipeline/pipelinerun-sequential-1.yaml b/tekton-poc/pipeline/pipelinerun-sequential-1.yaml new file mode 100644 index 00000000..4a2f0089 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-sequential-1.yaml @@ -0,0 +1,841 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment-0 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + - name: run-experiment-1 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-0 + - name: run-experiment-2 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml 
+ - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-1 + - name: run-experiment-3 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-2 + - name: run-experiment-4 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-3 + - name: run-experiment-5 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-4 + - name: run-experiment-6 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-5 + - name: run-experiment-7 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-6 + - name: run-experiment-8 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' 
+ - name: output_len + value: '1000' + runAfter: + - run-experiment-7 + - name: run-experiment-9 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-8 + - name: run-experiment-10 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-9 + - name: run-experiment-11 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-10 + - name: run-experiment-12 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-11 + - name: run-experiment-13 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-12 + - name: run-experiment-14 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-13 + - 
name: run-experiment-15 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-14 + - name: run-experiment-16 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-15 + - name: run-experiment-17 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-16 + - name: run-experiment-18 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-17 + - name: run-experiment-19 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-18 + - name: run-experiment-20 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-19 + - name: run-experiment-21 + taskRef: + name: experiment + workspaces: + - name: 
data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-20 + - name: run-experiment-22 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-21 + - name: run-experiment-23 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-22 + - name: run-experiment-24 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-23 + - name: run-experiment-25 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-24 + - name: run-experiment-26 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-25 + - name: run-experiment-27 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + 
value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-26 + - name: run-experiment-28 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-27 + - name: run-experiment-29 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-28 + - name: run-experiment-30 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-29 + - name: run-experiment-31 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-30 + - name: run-experiment-32 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-31 + - name: run-experiment-33 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-32 + - name: run-experiment-34 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-33 + - name: run-experiment-35 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-34 diff --git a/tekton-poc/pipeline/pipelinerun-sequential-4-barrier.yaml b/tekton-poc/pipeline/pipelinerun-sequential-4-barrier.yaml new file mode 100644 index 00000000..1dc4f388 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-sequential-4-barrier.yaml @@ -0,0 +1,931 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment-0 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + - name: run-experiment-1 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + - name: run-experiment-2 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: 
shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + - name: run-experiment-3 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + - name: run-experiment-4 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-0 + - run-experiment-1 + - run-experiment-2 + - run-experiment-3 + - name: run-experiment-5 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-0 + - run-experiment-1 + - run-experiment-2 + - run-experiment-3 + - name: run-experiment-6 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-0 + - run-experiment-1 + - run-experiment-2 + - run-experiment-3 + - name: run-experiment-7 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-0 + - run-experiment-1 + - run-experiment-2 + - run-experiment-3 + - name: run-experiment-8 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-4 + - run-experiment-5 + - run-experiment-6 + - run-experiment-7 + - name: run-experiment-9 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-4 + - run-experiment-5 + - run-experiment-6 + - run-experiment-7 + - name: run-experiment-10 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-4 + - run-experiment-5 + - run-experiment-6 + - run-experiment-7 + - name: run-experiment-11 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-4 + - run-experiment-5 + - run-experiment-6 + - run-experiment-7 + - name: run-experiment-12 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-8 + - run-experiment-9 + - run-experiment-10 + - run-experiment-11 + - name: run-experiment-13 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - 
run-experiment-8 + - run-experiment-9 + - run-experiment-10 + - run-experiment-11 + - name: run-experiment-14 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-8 + - run-experiment-9 + - run-experiment-10 + - run-experiment-11 + - name: run-experiment-15 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-8 + - run-experiment-9 + - run-experiment-10 + - run-experiment-11 + - name: run-experiment-16 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-12 + - run-experiment-13 + - run-experiment-14 + - run-experiment-15 + - name: run-experiment-17 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-12 + - run-experiment-13 + - run-experiment-14 + - run-experiment-15 + - name: run-experiment-18 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-12 + - run-experiment-13 + - run-experiment-14 + - run-experiment-15 + - name: run-experiment-19 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-12 + - run-experiment-13 + - run-experiment-14 + - run-experiment-15 + - name: run-experiment-20 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-16 + - run-experiment-17 + - run-experiment-18 + - run-experiment-19 + - name: run-experiment-21 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-16 + - run-experiment-17 + - run-experiment-18 + - run-experiment-19 + - name: run-experiment-22 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-16 + - run-experiment-17 + - run-experiment-18 + - run-experiment-19 + - name: run-experiment-23 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-16 + - run-experiment-17 + - run-experiment-18 + - run-experiment-19 + - name: run-experiment-24 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - 
run-experiment-20 + - run-experiment-21 + - run-experiment-22 + - run-experiment-23 + - name: run-experiment-25 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-20 + - run-experiment-21 + - run-experiment-22 + - run-experiment-23 + - name: run-experiment-26 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-20 + - run-experiment-21 + - run-experiment-22 + - run-experiment-23 + - name: run-experiment-27 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-20 + - run-experiment-21 + - run-experiment-22 + - run-experiment-23 + - name: run-experiment-28 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-24 + - run-experiment-25 + - run-experiment-26 + - run-experiment-27 + - name: run-experiment-29 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-24 + - run-experiment-25 + - run-experiment-26 + - run-experiment-27 + - name: run-experiment-30 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-24 + - run-experiment-25 + - run-experiment-26 + - run-experiment-27 + - name: run-experiment-31 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-24 + - run-experiment-25 + - run-experiment-26 + - run-experiment-27 + - name: run-experiment-32 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-28 + - run-experiment-29 + - run-experiment-30 + - run-experiment-31 + - name: run-experiment-33 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-28 + - run-experiment-29 + - run-experiment-30 + - run-experiment-31 + - name: run-experiment-34 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-28 + - run-experiment-29 + - run-experiment-30 + - run-experiment-31 + - name: run-experiment-35 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + 
runAfter: + - run-experiment-28 + - run-experiment-29 + - run-experiment-30 + - run-experiment-31 diff --git a/tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml b/tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml new file mode 100644 index 00000000..9a750925 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-sequential-4-sliding.yaml @@ -0,0 +1,835 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment-0 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + - name: run-experiment-1 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + - name: run-experiment-2 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + - name: run-experiment-3 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + - name: run-experiment-4 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-0 + - name: run-experiment-5 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: 
kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-1 + - name: run-experiment-6 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-2 + - name: run-experiment-7 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-3 + - name: run-experiment-8 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-4 + - name: run-experiment-9 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-5 + - name: run-experiment-10 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-6 + - name: run-experiment-11 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-7 + - name: run-experiment-12 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-8 + - name: run-experiment-13 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-9 + - name: run-experiment-14 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-10 + - name: run-experiment-15 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-11 + - name: run-experiment-16 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-12 + - name: run-experiment-17 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-13 + - name: run-experiment-18 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-14 + - name: run-experiment-19 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-15 + - name: run-experiment-20 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-16 + - name: run-experiment-21 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-17 + - name: run-experiment-22 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-18 + - name: run-experiment-23 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-19 + - name: run-experiment-24 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-20 + - name: run-experiment-25 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-21 + - name: run-experiment-26 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-22 + - name: run-experiment-27 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '100' + runAfter: + - run-experiment-23 + - name: run-experiment-28 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '300' + runAfter: + - run-experiment-24 + - name: run-experiment-29 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '100' + - name: output_len + value: '1000' + runAfter: + - run-experiment-25 + - name: run-experiment-30 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '100' + runAfter: + - run-experiment-26 + - name: run-experiment-31 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '300' + runAfter: + - run-experiment-27 + - name: run-experiment-32 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '300' + - name: output_len + value: '1000' + runAfter: + - run-experiment-28 + - name: run-experiment-33 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '100' + runAfter: + - run-experiment-29 + - name: run-experiment-34 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '300' + runAfter: + - run-experiment-30 + - name: run-experiment-35 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: 
https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + - name: question_len + value: '1000' + - name: output_len + value: '1000' + runAfter: + - run-experiment-31 diff --git a/tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml b/tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml new file mode 100644 index 00000000..eae7a3f5 --- /dev/null +++ b/tekton-poc/pipeline/pipelinerun-sequential-unroll-gaiePluginConfig.yaml @@ -0,0 +1,119 @@ +apiVersion: tekton.dev/v1 +kind: PipelineRun +metadata: + name: experiment-matrix-run +spec: + taskRunTemplate: + serviceAccountName: helm-installer + workspaces: + - name: data + persistentVolumeClaim: + claimName: workspace-pvc + pipelineSpec: + workspaces: + - name: data + tasks: + - name: run-experiment-0 + taskRef: + name: experiment + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-none.yaml + matrix: + params: + - name: question_len + value: &id001 + - '100' + - '300' + - '1000' + - name: output_len + value: &id002 + - '100' + - '300' + - '1000' + - name: run-experiment-1 + taskRef: + name: experiment + runAfter: + - run-experiment-0 + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-prefix.yaml + matrix: + params: + - name: question_len + value: *id001 + - name: output_len + value: *id002 + - name: run-experiment-2 + taskRef: + name: experiment + runAfter: + - run-experiment-1 + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-kv.yaml + matrix: + params: + - name: question_len + value: *id001 + - name: output_len + value: *id002 + - name: run-experiment-3 + taskRef: + name: experiment + runAfter: + - run-experiment-2 + workspaces: + - name: data + workspace: data + params: + - name: namespace + value: kalantar + - name: model-id + value: Qwen/Qwen3-0.6B + - name: stackBaseUrl + value: https://raw.githubusercontent.com/kalantar/llm-d-benchmark/refs/heads/tekton-poc/tekton-poc/examples/inference-scheduling/ + - name: harnessProfile + value: shared_prefix_synthetic.yaml + - name: gaiePluginConfig + value: inf-sche-queue.yaml + matrix: + params: + - name: question_len + value: *id001 + - name: output_len + value: *id002 diff --git a/tekton-poc/pipeline/roles.yaml b/tekton-poc/pipeline/roles.yaml new file mode 100644 index 00000000..b1ae2e54 --- /dev/null +++ b/tekton-poc/pipeline/roles.yaml @@ 
-0,0 +1,123 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: helm-installer + namespace: ${NAMESPACE} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: helm-installer-clusterrole +rules: +- apiGroups: [""] + resources: ["pods", "services", "namespaces", "persistentvolumeclaims", "secrets", "configmaps", "serviceaccounts"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] +- apiGroups: ["apps"] + resources: ["deployments", "replicasets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["rbac.authorization.k8s.io"] + resources: ["roles", "rolebindings", "clusterroles", "clusterrolebindings"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["gateway.networking.k8s.io"] + resources: ["gateways", "httproutes"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["gateway.kgateway.dev"] + resources: ["gatewayparameters"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools", "inferencemodels", "inferenceobjectives"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["authentication.k8s.io"] + resources: ["tokenreviews"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["authorization.k8s.io"] + resources: ["subjectaccessreviews"] + verbs: ["create"] +- apiGroups: ["route.openshift.io"] + resources: ["routes"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["security.openshift.io"] + resources: ["securitycontextconstraints"] + resourceNames: ["anyuid", "restricted", "privileged"] + verbs: ["use"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: helm-installer-crb-${NAMESPACE} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: helm-installer-clusterrole +subjects: +- kind: ServiceAccount + name: helm-installer + namespace: ${NAMESPACE} +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: helm-installer-restricted-scc + namespace: ${NAMESPACE} +subjects: + - kind: ServiceAccount + name: helm-installer +roleRef: + kind: ClusterRole + name: system:openshift:scc:restricted + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: helm-access + namespace: ${NAMESPACE} +rules: +- apiGroups: [""] + resources: ["secrets", "configmaps", "services", "pods", "namespaces", "serviceaccounts", "persistentvolumeclaims"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: [""] + resources: ["pods/log"] + verbs: ["get"] +- apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["apps"] + resources: ["deployments", "replicasets"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["gateway.networking.k8s.io"] + resources: ["gateways", "httproutes"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["gateway.kgateway.dev"] + resources: ["gatewayparameters"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["inference.networking.x-k8s.io"] + resources: ["inferencepools", "inferencemodels", 
"inferenceobjectives"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["authentication.k8s.io"] + resources: ["tokenreviews", "subjectaccessreviews"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["route.openshift.io"] + resources: ["routes"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] +- apiGroups: ["security.openshift.io"] + resources: ["securitycontextconstraints"] + resourceNames: ["restricted"] + verbs: ["use"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: helm-access-binding + namespace: ${NAMESPACE} +subjects: +- kind: ServiceAccount + name: helm-installer + namespace: ${NAMESPACE} +roleRef: + kind: Role + name: helm-access + apiGroup: rbac.authorization.k8s.io \ No newline at end of file diff --git a/tekton-poc/pipeline/steps/capacity-planner.yaml b/tekton-poc/pipeline/steps/capacity-planner.yaml new file mode 100644 index 00000000..879bec20 --- /dev/null +++ b/tekton-poc/pipeline/steps/capacity-planner.yaml @@ -0,0 +1,439 @@ +# apiVersion: tekton.dev/v1beta1 +# kind: StepAction +# metadata: +# name: compute-value +# spec: +# results: +# - name: value +# params: +# - name: name +# - name: value +# - name: defaultValue +# env: +# - name: PARAMETER_NAME +# value: "$(params.name)" +# - name: PARAMETER_VALUE +# value: $(params.value) +# - name: DEFAULT_VALUE +# value: $(params.defaultValue) +# - name: TREATMENT_ANALYSIS +# value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" +# image: alpine:3.20 +# script: | +# #!/usr/bin/env sh + +# apk add --no-cache jq yq >/dev/null + +# echo "PARAMETER_NAME = ${PARAMETER_NAME}" +# echo "PARAMETER_VALUE = ${PARAMETER_VALUE}" +# echo "DEFAULT_VALUE = ${DEFAULT_VALUE}" + +# if [ -n "${PARAMETER_VALUE}" ]; then +# value="${PARAMETER_VALUE}" +# echo ">>> Using value from parameter: ${value}" +# else +# value=$( +# echo ${TREATMENT_ANALYSIS} \ +# | jq -r ".updates[] | select(.name == \"${PARAMETER_NAME}\") | .value" +# ) +# echo ">>> value from treatment: ${value}" +# if [ -z $value ]; then +# value=${DEFAULT_VALUE} +# echo ">>> Using default value: ${value}" +# fi +# fi + +# echo -n "${value}" > "$(step.results.value.path)" +# --- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: compute-values +spec: + results: + - name: decodeDataParallelism + - name: decodeTensorParallelism + - name: decodeReplicas + - name: prefillDataParallelism + - name: prefillTensorParallelism + - name: prefillReplicas + params: + - name: decodeDataParallelism + - name: decodeTensorParallelism + - name: decodeReplicas + - name: prefillDataParallelism + - name: prefillTensorParallelism + - name: prefillReplicas + env: + - name: DECODE_DP + value: "$(params.decodeDataParallelism)" + - name: DECODE_TP + value: "$(params.decodeTensorParallelism)" + - name: DECODE_REPLICAS + value: "$(params.decodeReplicas)" + - name: PREFILL_DP + value: "$(params.prefillDataParallelism)" + - name: PREFILL_TP + value: "$(params.prefillTensorParallelism)" + - name: PREFILL_REPLICAS + value: "$(params.prefillReplicas)" + - name: TREATMENT_ANALYSIS + value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" + image: alpine:3.20 + script: | + #!/usr/bin/env sh + + apk add --no-cache jq yq >/dev/null + + compute_value() { + _name="$1" + _value="$2" + _default="$3" + + if [ -n "${_value}" ]; then + _result="${_value}" + else + _result=$( + # echo "from treatment" + echo "${TREATMENT_ANALYSIS}" \ + | jq -r 
".updates[] | select(.name == \"${_name}\") | .value" + ) + if [ -z $_result ]; then + _result="${_default}" + fi + fi + echo "${_result}" + } + + echo "input DECODE_DP = ${DECODE_DP}" + value=$(compute_value "decodeDataParallelism" "${DECODE_DP}" 1) + echo "output decodeDataParallelism = $value" + echo -n "${value}" > "$(step.results.decodeDataParallelism.path)" + + echo "input DECODE_TP = ${DECODE_TP}" + value=$(compute_value "decodeTensorParallelism" "${DECODE_TP}" 1) + echo "output decodeTensorParallelism = $value" + echo -n "${value}" > "$(step.results.decodeTensorParallelism.path)" + + echo "input DECODE_REPLICAS = ${DECODE_REPLICAS}" + value=$(compute_value "decodeReplicas" "${DECODE_REPLICAS}" 1) + echo "output decodeReplicas = $value" + echo -n "${value}" > "$(step.results.decodeReplicas.path)" + + echo "input PREFILL_DP = ${PREFILL_DP}" + value=$(compute_value "prefillDataParallelism" "${PREFILL_DP}" 1) + echo "output prefillDataParallelism = $value" + echo -n "${value}" > "$(step.results.prefillDataParallelism.path)" + + echo "input PREFILL_TP = ${PREFILL_TP}" + value=$(compute_value "prefillTensorParallelism" "${PREFILL_TP}" 1) + echo "output prefillTensorParallelism = $value" + echo -n "${value}" > "$(step.results.prefillTensorParallelism.path)" + + echo "input PREFILL_REPLICAS = ${PREFILL_REPLICAS}" + value=$(compute_value "prefillReplicas" "${PREFILL_REPLICAS}" 1) + echo "output prefillReplicas = $value" + echo -n "${value}" > "$(step.results.prefillReplicas.path)" +--- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: compute-num-gpus +spec: + results: + - name: value + params: + - name: name + - name: value + - name: dp + - name: tp + env: + - name: PARAMETER_NAME + value: "$(params.name)" + - name: PARAMETER_VALUE + value: $(params.value) + - name: DP + value: $(params.dp) + - name: TP + value: $(params.tp) + - name: TREATMENT_ANALYSIS + value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" + image: alpine:3.20 + script: | + #!/usr/bin/env sh + + apk add --no-cache jq yq >/dev/null + + echo "PARAMETER_NAME = ${PARAMETER_NAME}" + echo "PARAMETER_VALUE = ${PARAMETER_VALUE}" + echo "DP = ${DP}" + echo "TP = ${TP}" + + if [ -n "${PARAMETER_VALUE}" ]; then + value=${PARAMETER_VALUE} + echo ">>> Using value from parameter: ${value}" + else + value=$( + echo ${TREATMENT_ANALYSIS} \ + | jq -r ".updates[] | select(.name == \"${PARAMETER_NAME}\") | .value" + ) + echo ">>> value from treatment: ${value}" + if [ -z $value ]; then + value=$(( $TP * $DP )) + echo ">>> Using value from computation: $TP * $DP = ${value}" + fi + fi + + echo -n "${value}" > "$(step.results.value.path)" +--- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: check-capacity +spec: + params: + - name: validateCapacity + default: "true" + - name: behaviorOnValidationFailure + default: terminate # ignore + + - name: model + - name: max_model_len + - name: replicas + - name: tp + - name: dp + - name: gpu_memory + - name: user_requested_gpu_count + - name: gpu_memory_util + default: "0.95" + + - name: py + default: | + import os + import sys + from typing import Tuple + from config_explorer.capacity_planner import * + + def log_failed(msg: str, ignore_if_failed = True): + print(f'❌ {msg}') + if not ignore_if_failed: + sys.exit(1) + + def log_warning(msg): + print(f'⚠️ {msg}') + + def log_info(msg): + print(f'ℹ️ {msg}') + + def get_model_info(model_name: str, hf_token: str, ignore_if_failed: bool) -> ModelInfo | None: + """ + Obtains model info from HF + """ 
+ + try: + return get_model_info_from_hf(model_name, hf_token) + + except GatedRepoError: + log_failed("Model is gated and provided token does not work. Please double check.", ignore_if_failed) + except HfHubHTTPError as hf_exp: + log_failed(f"Error reaching Hugging Face API: {hf_exp}", ignore_if_failed) + except Exception as e: + log_failed(f"Cannot retrieve ModelInfo: {e}", ignore_if_failed) + + return None + + def get_model_config_and_text_config(model_name: str, hf_token: str, ignore_if_failed: bool) -> Tuple[AutoConfig | None, AutoConfig | None]: + """ + Obtains model config and text config from HF + """ + + try: + config = get_model_config_from_hf(model_name, hf_token) + return config, get_text_config(config) + + except GatedRepoError: + log_failed("Model is gated and provided token does not work. Please double check.", ignore_if_failed) + except HfHubHTTPError as hf_exp: + log_failed(f"Error reaching Hugging Face API: {hf_exp}", ignore_if_failed) + except Exception as e: + log_failed(f"Cannot retrieve model config: {e}", ignore_if_failed) + + return None, None + + def validate_vllm_params(): + print ("validate_vllm_params() called") + + replicas = int(os.getenv("REPLICAS")) + user_requested_gpu_count = int(os.getenv("USER_REQUESTED_GPU_COUNT")) + tp = int(os.getenv("TP")) + dp = int(os.getenv("DP")) + model = os.getenv("MODEL") + gpu_memory = int(os.getenv("GPU_MEMORY")) + max_model_len = int(os.getenv("MAX_MODEL_LEN")) + gpu_memory_util = float(os.getenv("GPU_MEMORY_UTIL")) + hf_token = os.getenv("HF_TOKEN") + ignore_if_failed = os.getenv("BEHAVIOR_ON_FAILURE") != 'terminate' + + print(f"model = {model}") + print(f"replicas = {replicas}") + print(f"user_requested_gpu_count = {user_requested_gpu_count}") + print(f"tp = {tp}") + print(f"dp = {dp}") + print(f"gpu_memory = {gpu_memory}") + print(f"max_model_len = {max_model_len}") + print(f"gpu_memory_util = {gpu_memory_util}") + print(f"ignore_if_failed = {ignore_if_failed}") + + # Sanity check on user inputs. If GPU memory cannot be determined, skip the GPU memory checks below (the sanity check is then incomplete) + skip_gpu_tests = False + if gpu_memory is None or gpu_memory == 0: + log_failed("Cannot determine accelerator memory. Please set LLMDBENCH_VLLM_COMMON_ACCELERATOR_MEMORY to enable Capacity Planner. Skipping GPU memory required checks, especially KV cache estimation.", ignore_if_failed) + skip_gpu_tests = True + + per_replica_requirement = gpus_required(tp=tp, dp=dp) + if replicas == 0: + per_replica_requirement = 0 + total_gpu_requirement = per_replica_requirement + + if total_gpu_requirement > user_requested_gpu_count: + log_failed(f"Requested {user_requested_gpu_count} GPUs, but that is too low. It must be at least TP x DP ({tp} x {dp} = {total_gpu_requirement})") + + if total_gpu_requirement < user_requested_gpu_count: + log_warning(f"For each replica, the model requires {total_gpu_requirement} GPUs, but you requested {user_requested_gpu_count} for the deployment. Some GPUs will be idle.") + + model_info = get_model_info(model, hf_token, ignore_if_failed) + model_config, text_config = get_model_config_and_text_config(model, hf_token, ignore_if_failed) + if model_config is not None: + # Check if parallelism selections are valid + try: + valid_tp_values = find_possible_tp(text_config) + log_info(f"valid tp values are: {valid_tp_values}") + if tp not in valid_tp_values: + log_failed(f"TP={tp} is invalid. 
Please select from these options ({valid_tp_values}) for {model}.", ignore_if_failed) + else: + log_info(f"TP={tp} is valid.") + except AttributeError as e: + # Error: config['num_attention_heads'] not in config + log_failed(f"Cannot obtain data on the number of attention heads, cannot find valid tp values: {e}", ignore_if_failed) + + # Check if model context length is valid + valid_max_context_len = 0 + try: + # Error: config['max_positional_embeddings'] not in config + valid_max_context_len = max_context_len(model_config) + log_info(f"The max context length is {valid_max_context_len}") + except AttributeError as e: + log_failed(f"Cannot obtain data on the max context length for model: {e}", ignore_if_failed) + + if max_model_len > valid_max_context_len: + log_failed(f"Max model length = {max_model_len} exceeds the maximum acceptable for {model}. Set LLMDBENCH_VLLM_COMMON_MAX_MODEL_LEN to a value less than or equal to {valid_max_context_len}", ignore_if_failed) + else: + log_failed("Model config on parameter shape is not available.", ignore_if_failed) + + # Display memory info + if not skip_gpu_tests: + log_info("👉 Collecting GPU information....") + avail_gpu_memory = available_gpu_memory(gpu_memory, gpu_memory_util) + log_info(f"{gpu_memory} GB of memory per GPU, with {gpu_memory} GB x {gpu_memory_util} (gpu_memory_utilization) = {avail_gpu_memory} GB available to use.") + log_info(f"Each model replica requires {per_replica_requirement} GPUs, total available GPU memory = {avail_gpu_memory * per_replica_requirement} GB.") + + # Calculate model memory requirement + log_info("👉 Collecting model information....") + if model_info is not None: + try: + model_params = model_total_params(model_info) + log_info(f"{model} has a total of {model_params} parameters") + + model_mem_req = model_memory_req(model_info, model_config) + log_info(f"{model} requires {model_mem_req} GB of memory") + + # Estimate KV cache memory and max number of requests that can be served in worst case scenario + if not skip_gpu_tests: + log_info("👉 Estimating available KV cache....") + available_kv_cache = allocatable_kv_cache_memory( + model_info, model_config, + gpu_memory, gpu_memory_util, + tp=tp, dp=dp, + ) + log_info(f"Allocatable memory for KV cache: {available_kv_cache} GB") + + if available_kv_cache < 0: + log_failed(f"There is not enough GPU memory to stand up the model. 
Exceeds by {abs(available_kv_cache)} GB.", ignore_if_failed) + else: + kv_details = KVCacheDetail(model_info, model_config, max_model_len, batch_size=1) + log_info(f"KV cache memory for a request taking --max-model-len={max_model_len} requires {kv_details.per_request_kv_cache_gb} GB of memory") + + total_concurrent_reqs = max_concurrent_requests( + model_info, model_config, max_model_len, + gpu_memory, gpu_memory_util, + tp=tp, dp=dp, + ) + log_info(f"The vLLM server can process up to {total_concurrent_reqs} number of requests at the same time, assuming the worst case scenario that each request takes --max-model-len") + + except AttributeError as e: + # Model might not have safetensors data on parameters + log_failed(f"Does not have enough information about model to estimate model memory or KV cache: {e}", ignore_if_failed) + else: + log_failed(f"Model info on model's architecture is not available.", ignore_if_failed) + + def main(): + """Main function""" + print("main() called") + validate_vllm_params() + print("main() exiting") + + if __name__ == "__main__": + sys.exit(main()) + env: + - name: VALIDATE_CAPACITY + value: $(params.validateCapacity) + - name: BEHAVIOR_ON_FAILURE + value: $(params.behaviorOnValidationFailure) + + - name: MODEL + value: $(params.model) + + - name: REPLICAS + value: $(params.replicas) + - name: TP + value: $(params.tp) + - name: DP + value: $(params.dp) + - name: GPU_MEMORY + value: $(params.gpu_memory) + - name: USER_REQUESTED_GPU_COUNT + value: $(params.user_requested_gpu_count) + - name: MAX_MODEL_LEN + value: $(params.max_model_len) + - name: GPU_MEMORY_UTIL + value: $(params.gpu_memory_util) + + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + + - name: PY_BIN + value: "$(params.py)" + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/usr/bin/env bash + + if [ "${VALIDATE_CAPACITY}" != "true" ]; then + echo "ℹ️ Skipping capacity validation" + exit 0 + fi + + # Install git so can install capacity explorer + apt-get update \ + && apt-get install -y git \ + && rm -rf /var/lib/apt/lists/* + python -m pip install --no-cache "config_explorer @ git+https://github.com/llm-d/llm-d-benchmark.git/#subdirectory=config_explorer" + + # run capacity explorer + printf "%s\n" "${PY_BIN}" | python - + + diff --git a/tekton-poc/pipeline/steps/inference-perf.yaml b/tekton-poc/pipeline/steps/inference-perf.yaml new file mode 100644 index 00000000..b0d82c64 --- /dev/null +++ b/tekton-poc/pipeline/steps/inference-perf.yaml @@ -0,0 +1,229 @@ +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: inference-perf-run +spec: + params: + - name: harnessName + - name: harnessProfile + - name: pipelineUID + env: + - name: REQUESTED_HARNESS_NAME + value: "$(params.harnessName)" + - name: MY_HARNESS_NAME + value: "inference-perf" + - name: HARNESS_PROFILE + value: "$(params.harnessProfile)" + + - name: GIT_REPO_URL + value: "https://github.com/kubernetes-sigs/inference-perf.git" + - name: GIT_REVISION + value: "main" + - name: GIT_COMMIT + value: "1ccc48b6bb9c9abb61558b719041fb000b265e59" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/usr/bin/env bash + + # 
https://github.com/llm-d/llm-d-benchmark/blob/main/workload/harnesses/inference-perf-llm-d-benchmark.sh + + if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then + echo "Requested harness not ${MY_HARNESS_NAME}, skipping" + exit 0 + fi + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L56-L62 + # https://github.com/llm-d/llm-d-benchmark/blob/main/setup/run.sh + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + CONTROL_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + RUN_DIR=$(pwd) + + # TODO figure out which are actually needed for each step + echo "🔄 Installing required tools" + apt-get update + apt-get install -y \ + git \ + pip \ + yq \ + && apt-get clean && rm -rf /var/cache/apt + + echo "🔄 Cloning and installing harness: ${MY_HARNESS_NAME}" + git clone --branch ${GIT_REVISION} ${GIT_REPO_URL} + cd inference-perf + git checkout ${GIT_COMMIT} + pip install . + + # profile name and location + workload=$(echo ${HARNESS_PROFILE} | sed 's^\.yaml^^g' ) + workload_profile=${workload}.yaml + workload_profile_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/${workload_profile} + + # update .storage.local_storage.path in profile + pushd "$RESULTS_DIR" + yq '.storage["local_storage"]["path"] = '\"${RESULTS_DIR}\" <"${workload_profile_path}" -y >${workload_profile} + + # run inference-perf + inference-perf --config_file "$(realpath ./${workload_profile})" > >(tee -a ${RESULTS_DIR}/stdout.log) 2> >(tee -a ${RESULTS_DIR}/stderr.log >&2) + export LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC=$? + + # If benchmark harness returned with an error, exit here + if [[ $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC -ne 0 ]]; then + echo "❌ Harness returned with error $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC" + exit $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC + fi + echo "✅ Harness completed successfully." +--- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: inference-perf-analyze-results +spec: + params: + - name: harnessName + - name: pipelineUID + env: + - name: REQUESTED_HARNESS_NAME + value: "$(params.harnessName)" + - name: MY_HARNESS_NAME + value: "inference-perf" + + - name: GIT_REPO_URL + value: "https://github.com/kubernetes-sigs/inference-perf.git" + - name: GIT_REVISION + value: "main" + - name: GIT_COMMIT + value: "1ccc48b6bb9c9abb61558b719041fb000b265e59" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + +# https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/usr/bin/env bash + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + + if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then + echo "Requested harness not ${MY_HARNESS_NAME}, skipping" + exit 0 + fi + + echo "🔄 Installing requirements" + apt-get update + apt-get install -y \ + git \ + pip \ + && apt-get clean && rm -rf /var/cache/apt + + git clone --branch ${GIT_REVISION} ${GIT_REPO_URL} + cd inference-perf + git checkout ${GIT_COMMIT} + pip install . 
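+ # Note: inference-perf itself is installed here as well, because the analysis below is driven by "inference-perf --analyze".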
+ + cat <<EOF > requirements-analysis.txt + matplotlib>=3.7.0 + numpy>=2.3.1 + seaborn>=0.12.0 + pandas>=2.2.3 + pydantic>=2.11.7 + PyYAML>=6.0.2 + scipy>=1.16.0 + requests>=2.32.5 + EOF + + cat requirements-analysis.txt + pip --version + pip install --no-cache-dir \ + --disable-pip-version-check \ + --upgrade \ + -r ./requirements-analysis.txt \ + --root-user-action=ignore + pip list + + # Download convert python from llm-d-benchmark + # TBD: should the python be embedded in the step? A separate step perhaps. + export ROOT_DIR=workload/report + export BRANCH=main + + cat < >(tee -a $RESULTS_DIR/stderr.log >&2) + # Report errors but don't quit + export RUN_EXPERIMENT_CONVERT_RC=$? + if [[ $RUN_EXPERIMENT_CONVERT_RC -ne 0 ]]; then + echo "./convert.py returned with error $RUN_EXPERIMENT_CONVERT_RC converting: $result" + fi + done + + # Define a function to run the analysis so it can be called multiple times + # https://github.com/llm-d/llm-d-benchmark/blob/main/analysis/inference-perf-analyze_results.sh + analyze_results () { + mkdir -p $RESULTS_DIR/analysis + sleep 60 + tm=$(date) + inference-perf --analyze "$RESULTS_DIR" + ec=$? + find $RESULTS_DIR -type f -newermt "${tm}" -exec mv -t "$RESULTS_DIR"/analysis {} + + return $ec + } + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/llm-d-benchmark.sh#L63-L74 + echo "🔄 Running analysis" + # Try to run analysis twice then give up + analyze_results + ec=$? + if [[ $ec -ne 0 ]]; then + echo "execution of analyzer failed, waiting 120 seconds and trying again" + sleep 120 + set -x + analyze_results + fi + # Return with error code of first iteration of experiment analyzer + # TBD modify this message depending on success + echo "✅ Results analyzed and reports generated" + exit $ec diff --git a/tekton-poc/pipeline/steps/stepactions.yaml b/tekton-poc/pipeline/steps/stepactions.yaml new file mode 100644 index 00000000..fd1d07ee --- /dev/null +++ b/tekton-poc/pipeline/steps/stepactions.yaml @@ -0,0 +1,354 @@ +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: create-rwx-pvc +spec: + params: + - name: name + type: string + - name: namespace + type: string + - name: size + type: string + default: "1Gi" + - name: storage-class + type: string + default: "default" + + - name: dry-run + type: string + default: "false" + env: + - name: NAME + value: $(params.name) + # - name: TARGET_NAMESPACE_RESULT + # value: $(results.targetNamespace.path) + - name: NAMESPACE + value: $(params.namespace) + - name: SIZE + value: $(params.size) + - name: STORAGE_CLASS + value: $(params.storage-class) + - name: DRY_RUN + value: $(params.dry-run) + image: alpine/kubectl:1.34.1 + script: | + #!/bin/sh + if [ "${DRY_RUN}" = "true" ]; then + echo ">> skipping" + exit 0 + fi + + # NAMESPACE=$(cat $TARGET_NAMESPACE_RESULT) + + cat <- + Chart ref or name. 
Examples: + - "nginx" (used with repoName to form repoName/nginx) + - "bitnami/nginx" + - "oci://registry.example.com/myorg/mychart" + - name: version + type: string + default: "" + description: Optional chart version + + # Repo management (add/update) + - name: repoName + type: string + default: "" + description: If set with repoUrl, the action will 'helm repo add' and 'helm repo update' + - name: repoUrl + type: string + default: "" + description: Chart repository URL + - name: updateRepo + type: string + default: "true" + description: '"true" to run helm repo update' + + # Repo auth/TLS (optional) + - name: repoUsername + type: string + default: "" + - name: repoPassword + type: string + default: "" + - name: repoPassCredentials + type: string + default: "false" + description: '"true" to pass credentials to all domains' + - name: repoInsecureSkipTLSVerify + type: string + default: "false" + - name: repoCAFile + type: string + default: "" + - name: repoCertFile + type: string + default: "" + - name: repoKeyFile + type: string + default: "" + + # Install/upgrade behavior + - name: namespace + type: string + default: "default" + - name: createNamespace + type: string + default: "true" + - name: wait + type: string + default: "true" + - name: timeout + type: string + default: "10m0s" + + # Values and extra args + - name: valuesYaml + type: string + default: "" + - name: valuesYamlUrl + type: string + default: "" + - name: extraArgs + type: string + default: "" + - name: extraValues + type: string + default: "" + - name: treatmentAnalysis + type: string + default: "" + + - name: dry-run + type: string + default: "false" + # ---------- Params -> env (StepActions don't interpolate $(params.*) directly in script) ---------- + env: + - name: GIT_URL + value: $(params.git_url) + - name: GIT_REVISION + value: $(params.git_revision) + - name: GIT_DEPTH + value: $(params.depth) + - name: CHECKOUT_DIR + value: $(params.checkout_dir) + + - name: HELM_RELEASE + value: "$(params.releaseName)" + - name: HELM_CHART + value: "$(params.chart)" + - name: HELM_VERSION + value: "$(params.version)" + + - name: HELM_REPO_NAME + value: "$(params.repoName)" + - name: HELM_REPO_URL + value: "$(params.repoUrl)" + - name: HELM_REPO_UPDATE + value: "$(params.updateRepo)" + + - name: HELM_REPO_USERNAME + value: "$(params.repoUsername)" + - name: HELM_REPO_PASSWORD + value: "$(params.repoPassword)" + - name: HELM_REPO_PASS_CREDS + value: "$(params.repoPassCredentials)" + - name: HELM_REPO_INSECURE + value: "$(params.repoInsecureSkipTLSVerify)" + - name: HELM_REPO_CA_FILE + value: "$(params.repoCAFile)" + - name: HELM_REPO_CERT_FILE + value: "$(params.repoCertFile)" + - name: HELM_REPO_KEY_FILE + value: "$(params.repoKeyFile)" + + - name: HELM_NAMESPACE + value: "$(params.namespace)" + - name: HELM_CREATE_NAMESPACE + value: "$(params.createNamespace)" + - name: HELM_WAIT + value: "$(params.wait)" + - name: HELM_TIMEOUT + value: "$(params.timeout)" + - name: HELM_VALUES_YAML + value: "$(params.valuesYaml)" + - name: HELM_VALUES_YAML_URL + value: "$(params.valuesYamlUrl)" + - name: HELM_EXTRA_ARGS + value: "$(params.extraArgs)" + - name: HELM_EXTRA_VALUES + value: "$(params.extraValues)" + + - name: TREATMENT_ANALYSIS + value: "$(params.treatmentAnalysis)" + + - name: DRY_RUN + value: $(params.dry-run) + + script: | + #!/usr/bin/env sh + set -eu + + if [ "${DRY_RUN}" = "true" ]; then + echo ">> skipping" + exit 0 + fi + + apk add --no-cache jq >/dev/null + + echo ">>> helm step: treatment" + printf "%s" 
"${TREATMENT_ANALYSIS}" + + SHA256CMD=$(type -p gsha256sum || type -p sha256sum) + NAMESPACE_HASH=$(echo -n "$HELM_NAMESPACE" | $SHA256CMD | awk '{print $1}' | cut -c1-8) + HELM_RELEASE=$(echo "$HELM_RELEASE" | sed "s/NAMESPACE_HASH/$NAMESPACE_HASH/g") + + # if a GIT_URL is defined, clone the project; we will use helm chart from this + if [ -n "${GIT_URL:-}" ]; then + mkdir -p "$CHECKOUT_DIR" + rm -rf "$CHECKOUT_DIR/.git" || true + echo "Cloning $GIT_URL @ $GIT_REVISION into $CHECKOUT_DIR" + git init "$CHECKOUT_DIR" + git -C "$CHECKOUT_DIR" remote add origin "$GIT_URL" + git -C "$CHECKOUT_DIR" fetch --depth "$GIT_DEPTH" origin "$GIT_REVISION" + git -C "$CHECKOUT_DIR" checkout FETCH_HEAD + COMMIT=$(git -C "$CHECKOUT_DIR" rev-parse HEAD) + echo "Checked out commit: $COMMIT" + fi + + # Construct optional values file; values overrides url + VALUES_FLAG="" + if [ -n "${HELM_VALUES_YAML_URL:-}" ]; then + VALUES_FLAG="-f ${HELM_VALUES_YAML_URL}" + fi + + if [ -n "${HELM_VALUES_YAML:-}" ]; then + printf "%s" "${HELM_VALUES_YAML}" > /tmp/${HELM_RELEASE}-values.yaml + VALUES_FLAG="-f /tmp/${HELM_RELEASE}-values.yaml" + fi + + if [ -n "${HELM_EXTRA_VALUES:-}" ]; then + echo ">>> HELM_EXTRA_VALUES" + printf "%s" "${HELM_EXTRA_VALUES}" + printf "%s" "${HELM_EXTRA_VALUES}" > /tmp/${HELM_RELEASE}-extra-values.yaml + VALUES_FLAG="${VALUES_FLAG} -f /tmp/${HELM_RELEASE}-extra-values.yaml" + fi + + + # Optional repo add (idempotent via --force-update) + if [ -n "${HELM_REPO_NAME:-}" ] && [ -n "${HELM_REPO_URL:-}" ]; then + REPO_ADD_FLAGS="--force-update" + [ -n "${HELM_REPO_USERNAME:-}" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --username ${HELM_REPO_USERNAME}" + [ -n "${HELM_REPO_PASSWORD:-}" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --password ${HELM_REPO_PASSWORD}" + [ "${HELM_REPO_PASS_CREDS:-false}" = "true" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --pass-credentials" + [ "${HELM_REPO_INSECURE:-false}" = "true" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --insecure-skip-tls-verify" + [ -n "${HELM_REPO_CA_FILE:-}" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --ca-file ${HELM_REPO_CA_FILE}" + [ -n "${HELM_REPO_CERT_FILE:-}" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --cert-file ${HELM_REPO_CERT_FILE}" + [ -n "${HELM_REPO_KEY_FILE:-}" ] && REPO_ADD_FLAGS="$REPO_ADD_FLAGS --key-file ${HELM_REPO_KEY_FILE}" + + echo "==> Adding/refreshing repo ${HELM_REPO_NAME} -> ${HELM_REPO_URL}" + # shellcheck disable=SC2086 + helm repo add bitnami https://charts.bitnami.com/bitnami + helm repo add "${HELM_REPO_NAME}" "${HELM_REPO_URL}" ${REPO_ADD_FLAGS} + # (helm repo add flags documented by Helm) # docs: https://helm.sh/docs/helm/helm_repo_add/ + + if [ "${HELM_REPO_UPDATE:-true}" = "true" ]; then + echo "==> Updating Helm repo cache" + # Update all repos for portability across Helm versions + helm repo update + fi + fi + + # Build common flags + CREATE_NS_FLAG=""; [ "${HELM_CREATE_NAMESPACE:-true}" = "true" ] && CREATE_NS_FLAG="--create-namespace" + WAIT_FLAG=""; [ "${HELM_WAIT:-true}" = "true" ] && WAIT_FLAG="--wait" + VERSION_FLAG=""; [ -n "${HELM_VERSION:-}" ] && VERSION_FLAG="--version ${HELM_VERSION}" + TIMEOUT_FLAG=""; [ -n "${HELM_TIMEOUT:-}" ] && TIMEOUT_FLAG="--timeout ${HELM_TIMEOUT}" + + # Decide final chart reference: + # - If user passed repoName and a bare chart, use repoName/chart. + # - If user passed repo/chart or oci://..., use as-is. 
+ CHART_REF="${HELM_CHART}" + case "${HELM_CHART}" in + */*|oci://*) : ;; + *) if [ -n "${HELM_REPO_NAME:-}" ]; then CHART_REF="${HELM_REPO_NAME}/${HELM_CHART}"; fi ;; + esac + + if [ -n "${TREATMENT_ANALYSIS:-}" ]; then + HELM_EXTRA_ARGS="${HELM_EXTRA_ARGS} $(echo ${TREATMENT_ANALYSIS} | jq -r '.setArgs')" + fi + + if [ -n "${HELM_EXTRA_ARGS:-}" ]; then + HELM_EXTRA_ARGS=$(echo "$HELM_EXTRA_ARGS" | sed "s/NAMESPACE_HASH/$NAMESPACE_HASH/g") + fi + + echo "==> helm upgrade --install ${HELM_RELEASE} ${CHART_REF} --namespace ${HELM_NAMESPACE} ${VERSION_FLAG} ${CREATE_NS_FLAG} ${WAIT_FLAG} ${TIMEOUT_FLAG} ${VALUES_FLAG} ${HELM_EXTRA_ARGS}" + # shellcheck disable=SC2086 + helm template \ + "${HELM_RELEASE}" "${CHART_REF}" \ + --namespace "${HELM_NAMESPACE}" \ + ${VERSION_FLAG} ${CREATE_NS_FLAG} ${WAIT_FLAG} ${TIMEOUT_FLAG} ${VALUES_FLAG} ${HELM_EXTRA_ARGS} + # shellcheck disable=SC2086 + helm template \ + "${HELM_RELEASE}" "${CHART_REF}" \ + --namespace "${HELM_NAMESPACE}" \ + ${VERSION_FLAG} ${CREATE_NS_FLAG} ${WAIT_FLAG} ${TIMEOUT_FLAG} ${VALUES_FLAG} ${HELM_EXTRA_ARGS} \ + | kubectl --namespace "${HELM_NAMESPACE}" apply -f - + # helm upgrade --install \ + # "${HELM_RELEASE}" "${CHART_REF}" \ + # --namespace "${HELM_NAMESPACE}" \ + # ${VERSION_FLAG} ${CREATE_NS_FLAG} ${WAIT_FLAG} ${TIMEOUT_FLAG} ${VALUES_FLAG} ${HELM_EXTRA_ARGS} + diff --git a/tekton-poc/pipeline/steps/treatment.yaml b/tekton-poc/pipeline/steps/treatment.yaml new file mode 100644 index 00000000..33888574 --- /dev/null +++ b/tekton-poc/pipeline/steps/treatment.yaml @@ -0,0 +1,65 @@ +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: analyze-treatment +spec: + description: | + Produce '--set/--set-string path=value' flags for factorType and + apply values into a JSON/YAML file. Works with flat or nested treatment. 
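+ Example (illustrative values): with factorType="modelservice", factorMapping='{"modelservice":{"decodeReplicas":"decode.replicas"}}' + and treatment='{"decodeReplicas":2}', the step emits updates=[{"name":"decodeReplicas","path":["decode","replicas"],"value":2}] + and setArgs="--set decode.replicas=2".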
+ image: alpine:3.20 + params: + - name: factorType + type: string + - name: factorMapping + type: string + description: JSON mapping + - name: treatment + type: string + description: JSON values (flat or nested by key) + results: + - name: treatmentAnalysis + description: Space-separated '--set/--set-string path=value' tokens + env: + - name: SELECTOR + value: $(params.factorType) + - name: MAP_JSON + value: $(params.factorMapping) + - name: VAL_JSON + value: $(params.treatment) + script: | + #!/bin/sh + set -eu + apk add --no-cache jq yq >/dev/null + + # Build updates + flags (uses $val for type checks — fixed version) + jq -r -n \ + --arg root "$SELECTOR" \ + --argjson map "$MAP_JSON" \ + --argjson vals "$VAL_JSON" ' + ($map[$root] // {}) as $m + | if ($m | type) != "object" then + error("Key not found in mapping: " + $root) + else + (if ($vals[$root] | type) == "object" then $vals[$root] else $vals end) as $v + | { + updates: [ + $m | to_entries[] + | select($v[.key] != null) + | { name: .key, path: (.value | split(".")), value: $v[.key] } + ], + setArgs: ( + [ $m | to_entries[] + | select($v[.key] != null) + | ( $v[.key] ) as $val + | if ( ($val | type) == "string" ) then + "--set-string \(.value)=\($val)" + else + "--set \(.value)=\( if ( ($val|type)=="object" or ($val|type)=="array") then ($val|tojson) else ($val|tostring) end )" + end + ] | join(" ") + ) + } + end + ' > /tmp/out.json + + printf "%s" "$(cat /tmp/out.json)" > "$(step.results.treatmentAnalysis.path)" diff --git a/tekton-poc/pipeline/steps/vllm-benchmark.yaml b/tekton-poc/pipeline/steps/vllm-benchmark.yaml new file mode 100644 index 00000000..aead6b96 --- /dev/null +++ b/tekton-poc/pipeline/steps/vllm-benchmark.yaml @@ -0,0 +1,255 @@ +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: vllm-benchmark-run +spec: + params: + - name: harnessName + - name: harnessProfile + - name: pipelineUID + env: + - name: REQUESTED_HARNESS_NAME + value: "$(params.harnessName)" + - name: MY_HARNESS_NAME + value: "vllm-benchmark" + - name: HARNESS_PROFILE + value: "$(params.harnessProfile)" + + - name: GIT_REPO_URL + value: "https://github.com/vllm-project/vllm.git" + - name: GIT_REVISION + value: "main" + - name: GIT_COMMIT + value: "b6381ced9c52271f799a8348fcc98c5f40528cdf" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-secret + key: HF_TOKEN + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/bin/bash + + # https://github.com/llm-d/llm-d-benchmark/blob/main/workload/harnesses/vllm-benchmark-llm-d-benchmark.sh + + if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then + echo "Requested harness not ${MY_HARNESS_NAME}, skipping" + exit 0 + fi + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L56-L62 + # https://github.com/llm-d/llm-d-benchmark/blob/main/setup/run.sh + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + CONTROL_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + RUN_DIR=$(pwd) + + # TODO figure out which are actually needed for each step + echo "🔄 Installing required tools" + apt-get update + apt-get install -y \ + git \ + gpg \ + pip \ + yq \ + && apt-get clean && rm -rf 
/var/cache/apt + + echo "🔄 Cloning and installing harness: ${MY_HARNESS_NAME}" + git clone --branch ${GIT_REVISION} ${GIT_REPO_URL} + cd vllm + git checkout ${GIT_COMMIT} + cd .. + mv -f vllm vllm-benchmark + + # TBD pin versions + cat <<EOF > requirements-vllm-benchmark.txt + aiohttp + datasets + numpy + pandas + pillow + tqdm + transformers + EOF + + cat requirements-vllm-benchmark.txt + pip --version + pip install --no-cache-dir \ + --disable-pip-version-check \ + --upgrade \ + -r ./requirements-vllm-benchmark.txt \ + --root-user-action=ignore + pip list + + # profile name and location + workload=$(echo ${HARNESS_PROFILE} | sed 's^\.yaml^^g' ) + workload_profile=${workload}.yaml + workload_profile_path=${CONTROL_DIR}/workload/profiles/${MY_HARNESS_NAME}/${workload_profile} + + # run vllm-benchmark + cp ${workload_profile_path} ${workload_profile} + en=$(cat ${workload_profile} | yq -r .executable) + + echo "pwd = $(pwd)" + echo "RUN_DIR=$RUN_DIR" + echo "running - ${RUN_DIR}/vllm-benchmark/benchmarks/${en}" + ls -l ${RUN_DIR}/vllm-benchmark/benchmarks + python ${RUN_DIR}/vllm-benchmark/benchmarks/${en} --$(cat ${workload_profile} | grep -v "^executable" | yq -r 'to_entries | map("\(.key)=\(.value)") | join(" --")' | sed -e 's^=none ^^g' -e 's^=none$^^g') --seed $(date +%s) --save-result > >(tee -a $RESULTS_DIR/stdout.log) 2> >(tee -a $RESULTS_DIR/stderr.log >&2) + export LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC=$? + find ${RUN_DIR}/vllm-benchmark -maxdepth 1 -mindepth 1 -name '*.json' -exec mv -t "$RESULTS_DIR"/ {} + + + # If benchmark harness returned with an error, exit here + if [[ $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC -ne 0 ]]; then + echo "❌ Harness returned with error $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC" + exit $LLMDBENCH_RUN_EXPERIMENT_HARNESS_RC + fi + echo "✅ Harness completed successfully." 
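+ # The companion vllm-benchmark-analyze-results StepAction below summarizes stdout.log from ${RESULTS_DIR} into analysis/summary.txt.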
+--- +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: vllm-benchmark-analyze-results +spec: + params: + - name: harnessName + - name: pipelineUID + env: + - name: REQUESTED_HARNESS_NAME + value: "$(params.harnessName)" + - name: MY_HARNESS_NAME + value: "vllm-benchmark" + + - name: GIT_REPO_URL + value: "https://github.com/kubernetes-sigs/inference-perf.git" + - name: GIT_REVISION + value: "main" + - name: GIT_COMMIT + value: "1ccc48b6bb9c9abb61558b719041fb000b265e59" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + +# https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/usr/bin/env bash + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${MY_HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + + if [ "${REQUESTED_HARNESS_NAME}" != "${MY_HARNESS_NAME}" ]; then + echo "Requested harness not ${MY_HARNESS_NAME}, skipping" + exit 0 + fi + + echo "🔄 Installing requirements" + # apt-get update + # apt-get install -y \ + # git \ + # pip \ + # && apt-get clean && rm -rf /var/cache/apt + + cat <<EOF > requirements-analysis.txt + matplotlib>=3.7.0 + numpy>=2.3.1 + seaborn>=0.12.0 + pandas>=2.2.3 + pydantic>=2.11.7 + PyYAML>=6.0.2 + scipy>=1.16.0 + requests>=2.32.5 + EOF + + cat requirements-analysis.txt + pip --version + pip install --no-cache-dir \ + --disable-pip-version-check \ + --upgrade \ + -r ./requirements-analysis.txt \ + --root-user-action=ignore + pip list + + # Download convert python from llm-d-benchmark + # TBD: should the python be embedded in the step? A separate step perhaps. + export ROOT_DIR=workload/report + export BRANCH=main + + cat < >(tee -a $RESULTS_DIR/stderr.log >&2) + # Report errors but don't quit + export RUN_EXPERIMENT_CONVERT_RC=$? + if [[ $RUN_EXPERIMENT_CONVERT_RC -ne 0 ]]; then + echo "./convert.py returned with error $RUN_EXPERIMENT_CONVERT_RC converting: $result" + fi + done + + # Define a function to run the analysis so it can be called multiple times + # https://github.com/llm-d/llm-d-benchmark/blob/main/analysis/vllm-benchmark-analyze_results.sh + analyze_results () { + mkdir -p $RESULTS_DIR/analysis + result_start=$(grep -nr "Result ==" $RESULTS_DIR/stdout.log | cut -d ':' -f 1) + total_file_length=$(cat $RESULTS_DIR/stdout.log | wc -l) + cat $RESULTS_DIR/stdout.log | sed "$result_start,$total_file_length!d" > $RESULTS_DIR/analysis/summary.txt + return $? + } + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/llm-d-benchmark.sh#L63-L74 + echo "🔄 Running analysis" + # Try to run analysis twice then give up + analyze_results + ec=$? 
+ if [[ $ec -ne 0 ]]; then + echo "execution of analyzer failed, wating 120 seconds and trying again" + sleep 120 + set -x + analyze_results + fi + # Return with error code of first iteration of experiment analyzer + # TBD modify this message depending on success + echo "✅ Results analyzed and reports generated" + exit $ec + diff --git a/tekton-poc/pipeline/steps/workload-profile.yaml b/tekton-poc/pipeline/steps/workload-profile.yaml new file mode 100644 index 00000000..bc4810fe --- /dev/null +++ b/tekton-poc/pipeline/steps/workload-profile.yaml @@ -0,0 +1,131 @@ +apiVersion: tekton.dev/v1beta1 +kind: StepAction +metadata: + name: prepare-workload-profile +spec: + params: + - name: harnessName + - name: harnessProfile + - name: model-id + - name: namespace + - name: treatmentAnalysis + - name: pipelineUID + env: + - name: HARNESS_NAME + value: "$(params.harnessName)" + - name: HARNESS_PROFILE + value: "$(params.harnessProfile)" + + - name: TREATMENT_ANALYSIS + value: "$(params.treatmentAnalysis)" + + - name: LLMDBENCH_DEPLOY_CURRENT_MODEL + value: "$(params.model-id)" + - name: LLMDBENCH_HARNESS_STACK_ENDPOINT_URL + value: "http://experiment-gateway-inference-gateway.$(params.namespace).svc.cluster.local:80" + + - name: DATA_ROOT_DIR + value: $(workspaces.data.path) + - name: MY_TASK_NAME + value: $(context.taskRun.name) + - name: MY_PIPELINE_UID + value: $(params.pipelineUID) + + # https://github.com/llm-d/llm-d-benchmark/blob/main/build/Dockerfile#L1C6-L1C33 + image: python:3.12.9-slim-bookworm + script: | + #!/bin/bash + + echo "🔄 Preparing workload profile ${HARNESS_PROFILE} for ${HARNESS_NAME}" + + # TBD is this necessary or is it already there? + apt-get update + apt-get install -y --no-install-recommends curl ca-certificates jq + curl -fsSL https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 \ + -o /usr/local/bin/yq + chmod +x /usr/local/bin/yq + jq --version + yq --version + + # https://github.com/llm-d/llm-d-benchmark/blob/main/setup/run.sh + + EXPERIMENT_ID="experiment-$(echo -n ${MY_PIPELINE_UID} | cut -c1-8)" + RESULTS_DIR="${DATA_ROOT_DIR}/${HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + CONTROL_DIR="${DATA_ROOT_DIR}/${HARNESS_NAME}_${EXPERIMENT_ID}_${MY_TASK_NAME}" + RUN_DIR=$(pwd) + + echo "🔄 Installing required tools" + apt-get update + apt-get install -y \ + wget \ + && apt-get clean && rm -rf /var/cache/apt + + # Ensure all folders created + mkdir -p $RESULTS_DIR + mkdir -p $CONTROL_DIR/setup + rm -rf $CONTROL_DIR/setup/sed-commands + touch $CONTROL_DIR/setup/sed-commands + mkdir -p ${CONTROL_DIR}/workload/profiles/${HARNESS_NAME}/templates + + cd ${RUN_DIR}/vllm-benchmark/ + + # Define constants: input profile template name and location; final profile name and location + workload=$(echo ${HARNESS_PROFILE} | sed 's^\.yaml^^g' ) + workload_template=${workload}.yaml.in + workload_template_path=${CONTROL_DIR}/workload/profiles/${HARNESS_NAME}/templates/${workload_template} + workload_profile=${workload}.yaml + workload_profile_path=${CONTROL_DIR}/workload/profiles/${HARNESS_NAME}/${workload_profile} + + echo "🔄 Prepare workload profile" + # Fetch profile template from llmd-benchmark + wget -O ${workload_template_path} \ + --quiet \ + https://raw.githubusercontent.com/llm-d/llm-d-benchmark/refs/heads/main/workload/profiles/${HARNESS_NAME}/${workload_template} + + # Apply treatment to profile template to produce final profile + echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_MODEL^${LLMDBENCH_DEPLOY_CURRENT_MODEL}^g" >> ${CONTROL_DIR}/setup/sed-commands + 
echo "s^REPLACE_ENV_LLMDBENCH_HARNESS_STACK_ENDPOINT_URL^${LLMDBENCH_HARNESS_STACK_ENDPOINT_URL}^g" >> ${CONTROL_DIR}/setup/sed-commands + echo "s^REPLACE_ENV_LLMDBENCH_DEPLOY_CURRENT_TOKENIZER^${LLMDBENCH_DEPLOY_CURRENT_TOKENIZER}^g" >> ${CONTROL_DIR}/setup/sed-commands + echo "---------- sed-commands" + cat ${CONTROL_DIR}/setup/sed-commands + echo "----------" + sed -f ${CONTROL_DIR}/setup/sed-commands ${workload_template_path} > ${workload_profile_path} + + # TBD eliminate the TARGET_FILE env variable + TARGET_FILE=${workload_profile_path} + echo "${TREATMENT_ANALYSIS}" | jq '.updates' > /tmp/updates.json + echo ">>> /tmp/updates.json" + cat /tmp/updates.json + + if [ ! -f "$TARGET_FILE" ]; then + echo "ERROR: File not found: $TARGET_FILE" >&2 + exit 1 + fi + + # Apply updates to JSON or YAML + if [ "$(jq 'length' /tmp/updates.json)" -gt 0 ]; then + ext="${TARGET_FILE##*.}" + tmp="${TARGET_FILE}.tmp" + + # TBD eliminate the json path (copilot generated this); profiles are yaml files + if [ "$ext" = "json" ]; then + jq --slurpfile upds /tmp/updates.json ' + reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) + ' "$TARGET_FILE" > "$tmp" + mv "$tmp" "$TARGET_FILE" + else + # YAML path: YAML → JSON → apply → YAML + yq -o=json '.' "$TARGET_FILE" \ + | jq --slurpfile upds /tmp/updates.json ' + reduce $upds[0][] as $u (. ; setpath($u.path; $u.value)) + ' \ + | yq -P > "$tmp" + mv "$tmp" "$TARGET_FILE" + fi + fi + + echo "---------- workload profile" + cat ${workload_profile_path} + echo "----------" + + echo "✅ workload profile ready" diff --git a/tekton-poc/pipeline/tasks/treatment.yaml b/tekton-poc/pipeline/tasks/treatment.yaml new file mode 100644 index 00000000..17844190 --- /dev/null +++ b/tekton-poc/pipeline/tasks/treatment.yaml @@ -0,0 +1,755 @@ +apiVersion: tekton.dev/v1 +kind: Task +metadata: + name: treatment +spec: + description: > + Runs an llm-d-benchmark experiment. + + workspaces: + - name: data + + params: + - name: treatment + type: string + description: | + JSON string of factors and values for one treatment. + Includes both infrastructure and workload factors. + + - name: factorMapping + type: string + description: | + JSON string mapping factor to path in source yaml file sorted by purpose. + + - name: targetNamespacePrefix + type: string + default: llmdbench + + - name: model-id + type: string + default: "meta-llama/Llama-3.2-1B-Instruct" + - name: inferencePort + default: 8000 + + # Properties needed to evaluate stack capacity (will it be able to host the model)? 
+ - name: validateCapacity + default: "true" + - name: behaviorOnValidationFailure + default: "terminate" + + - name: maxModelLength + + - name: decodeReplicas + - name: decodeTensorParallelism + - name: decodeDataParallelism + - name: decodeNumGpus + + - name: prefillReplicas + - name: prefillTensorParallelism + - name: prefillDataParallelism + - name: prefillNumGpus + + - name: gpuType + - name: gpuMemory + + - name: stackBaseUrl + type: string + - name: experimentName + type: string + default: "experiment" + + - name: model-pvc-name + type: string + default: model-pvc + - name: model-pvc-size + type: string + default: 300Gi + - name: model-storage-class + type: string + default: ocs-storagecluster-cephfs + + - name: download-job-name + type: string + default: download-job + + - default: llm-d-infra + description: Name of the Helm repository for the Gateway + name: gatewayRepoName + type: string + - default: https://llm-d-incubation.github.io/llm-d-infra/ + description: URL of the Helm repository for the Gateway + name: gatewayRepoUrl + type: string + - name: gatewayChartVersion + type: string + default: "" + description: Optional gateway chart version (used with --version) + + - name: gatewayExtraArgs + type: string + default: "" + description: Optional extra args for the gateway (to append to 'helm upgrade --install') + + - name: gaieChartVersion + type: string + default: "v0.5.1" + description: Optional GAIE chart version (used with --version) + + - name: gaieExtraArgs + type: string + default: "" + description: Optional extra args for GAIE (to append to 'helm upgrade --install') + + - default: llm-d-modelservice + description: Name of the Helm repository for the model engine + name: msRepoName + type: string + - default: https://llm-d-incubation.github.io/llm-d-modelservice/ + description: URL of the Helm repository for the model engine + name: msRepoUrl + type: string + - name: msChartVersion + type: string + default: "" + description: Optional modelservice chart version (used with --version) + + - name: msExtraArgs + type: string + default: "" + description: Optional extra args for the model engine (to append to 'helm upgrade --install') + + - name: modelWaitTimeout + type: string + default: 900 + + - name: llmdbenchImageRegistry + default: "quay.io" + - name: llmdbenchImageRepo + default: "namasluk" + - name: llmdbenchImageName + default: "llm-d-benchmark" + - name: llmdbenchImageTag + default: "251002.1" + + - name: harnessName + type: string + default: inference-perf + - name: harnessProfile + type: string + default: sanity_random.yaml + - name: stackType + type: string + default: llm-d + - name: pipelineUID + type: string + default: experiment + + - name: s3-keys + type: string + default: "s3-keys" + - name: s3-bucket + type: string + - name: s3-endpoint + type: string + + - name: debug + type: string + default: "false" + - name: step-upload-results + type: string + default: "true" + - name: dry-run + type: string + default: "false" + + results: + - name: treatmentAnalysisModelservice + value: $(steps.analyze-modelservice-factors.results.treatmentAnalysis) + - name: treatmentAnalysisGaie + value: $(steps.analyze-gaie-factors.results.treatmentAnalysis) + - name: treatmentAnalysisWorkload + value: $(steps.analyze-workload-factors.results.treatmentAnalysis) + + steps: + - name: log-start + image: alpine:3.20 + script: | + #!/bin/sh + echo "🔄 Starting sweep step for treatment:"
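+ # The treatment is a JSON object of factor name/value pairs supplied by the
+ # PipelineRun; a purely hypothetical example of its shape:
+ #   {"decodeTensorParallelism": "2", "gaiePluginConfig": "plugins-v2.yaml"}
+ # It is printed below so each TaskRun log records the exact treatment it executed.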
+ printf "%s" "$(params.treatment)" + + - name: analyze-modelservice-factors + ref: + name: analyze-treatment + params: + - name: factorType + value: modelservice + - name: factorMapping + value: $(params.factorMapping) + - name: treatment + value: $(params.treatment) + + - name: analyze-gaie-factors + ref: + name: analyze-treatment + params: + - name: factorType + value: gaie + - name: factorMapping + value: $(params.factorMapping) + - name: treatment + value: $(params.treatment) + + - name: analyze-workload-factors + ref: + name: analyze-treatment + params: + - name: factorType + value: workload + - name: factorMapping + value: $(params.factorMapping) + - name: treatment + value: $(params.treatment) + + # - name: display-treatment-analysis + # image: alpine:3.20 + # env: + # - name: MODELSERVICE_SET_ARGS + # value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" + # - name: GAIE_SET_ARGS + # value: "$(steps.analyze-gaie-factors.results.treatmentAnalysis)" + # - name: WORKLOAD_SET_ARGS + # value: "$(steps.analyze-workload-factors.results.treatmentAnalysis)" + + # script: | + # #!/bin/sh + # apk add --no-cache jq yq-go >/dev/null + # jq --version + + # echo "helm upgrade --install ... $(echo ${MODELSERVICE_SET_ARGS} | jq '.setArgs')" + # echo "helm upgrade --install ... $(echo ${GAIE_SET_ARGS} | jq '.setArgs')" + # echo "$(echo ${WORKLOAD_SET_ARGS} | jq '.updates')" + + # printf "%s" "$MODELSERVICE_SET_ARGS" + + # TBD split into individual steps to compute each value? + - name: compute-capacity-validation-values + ref: + name: + compute-values + params: + - name: decodeDataParallelism + value: $(params.decodeDataParallelism) + - name: decodeTensorParallelism + value: $(params.decodeTensorParallelism) + - name: decodeReplicas + value: $(params.decodeReplicas) + - name: prefillDataParallelism + value: $(params.prefillDataParallelism) + - name: prefillTensorParallelism + value: $(params.prefillTensorParallelism) + - name: prefillReplicas + value: $(params.prefillReplicas) + + # TBD fold into compute-capacity-validation-values ? + - name: compute-decode-num-gpus + ref: + name: + compute-num-gpus + params: + - name: name + value: "decodeNumGpus" + - name: value + value: $(params.decodeNumGpus) + - name: dp + # value: $(steps.compute-decode-dp.results.value) + value: $(steps.compute-capacity-validation-values.results.decodeDataParallelism) + - name: tp + # value: $(steps.compute-decode-tp.results.value) + value: $(steps.compute-capacity-validation-values.results.decodeTensorParallelism) + + # TBD fold into compute-capacity-validation-values ? 
+ - name: compute-prefill-num-gpus + ref: + name: + compute-num-gpus + params: + - name: name + value: "prefillNumGpus" + - name: value + value: $(params.prefillNumGpus) + - name: dp + # value: $(steps.compute-prefill-dp.results.value) + value: $(steps.compute-capacity-validation-values.results.prefillDataParallelism) + - name: tp + # value: $(steps.compute-prefill-tp.results.value) + value: $(steps.compute-capacity-validation-values.results.prefillTensorParallelism) + + - name: check-decode-capacity + ref: + name: check-capacity + params: + - name: validateCapacity + value: $(params.validateCapacity) + - name: behaviorOnValidationFailure + value: $(params.behaviorOnValidationFailure) + - name: model + value: $(params.model-id) + - name: max_model_len + value: $(params.maxModelLength) + - name: replicas + value: $(steps.compute-capacity-validation-values.results.decodeReplicas) + - name: tp + value: $(steps.compute-capacity-validation-values.results.decodeTensorParallelism) + - name: dp + value: $(steps.compute-capacity-validation-values.results.decodeDataParallelism) + - name: gpu_memory + value: $(params.gpuMemory) + - name: user_requested_gpu_count + value: $(steps.compute-decode-num-gpus.results.value) + when: + - input: $(params.validateCapacity) + operator: in + values: [ "true" ] + + - name: check-prefill-capacity + ref: + name: check-capacity + params: + - name: validateCapacity + value: $(params.validateCapacity) + - name: behaviorOnValidationFailure + value: $(params.behaviorOnValidationFailure) + - name: model + value: $(params.model-id) + - name: max_model_len + value: $(params.maxModelLength) + - name: replicas + value: $(steps.compute-capacity-validation-values.results.prefillReplicas) + - name: tp + value: $(steps.compute-capacity-validation-values.results.prefillTensorParallelism) + - name: dp + value: $(steps.compute-capacity-validation-values.results.prefillDataParallelism) + - name: gpu_memory + value: $(params.gpuMemory) + - name: user_requested_gpu_count + value: $(steps.compute-prefill-num-gpus.results.value) + when: + - input: $(params.validateCapacity) + operator: in + values: [ "true" ] + + - name: prepare-namespace + image: quay.io/openshift/origin-cli:4.21 + script: | + #!/bin/sh + + NAMESPACE="$(params.targetNamespacePrefix)-$(context.taskRun.name)" + DRY_RUN="$(params.dry-run)" + + if [ "${DRY_RUN}" = "true" ]; then + echo ">> skipping" + exit 0 + fi + + kubectl create namespace ${NAMESPACE} \ + --dry-run=client -o yaml | kubectl apply -f - + + HF_TOKEN=$( + kubectl get secret hf-secret \ + --namespace "$(context.taskRun.namespace)" \ + -o jsonpath='{.data.HF_TOKEN}' \ + | tr -d '\n' \ + | base64 -d + ) + + kubectl create secret generic hf-secret \ + --namespace ${NAMESPACE} \ + --from-literal="HF_TOKEN=${HF_TOKEN}" \ + --dry-run=client -o yaml | kubectl apply -f - + + # TBD only if OpenShift + oc adm policy add-scc-to-user anyuid -z helm-installer -n ${NAMESPACE} + # oc adm policy add-scc-to-user privileged -z helm-installer -n ${NAMESPACE} + + # TBD when we move from multiple NSs to a single NS, this can become a + # step implementation instead of a kubernetes job (replacing the next 2 steps). + # Can't do yet because the step executes in a different NS from the target.
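+ # The helm-upgrade-install StepAction referenced by the next several steps is defined
+ # elsewhere in this PoC; as a rough, assumed sketch it wraps something like:
+ #   helm upgrade --install <releaseName> <chart> --namespace <namespace> \
+ #     --timeout <timeout> [--version <version>] [-f <valuesYamlUrl>] \
+ #     <extraArgs> [--set flags derived from treatmentAnalysis]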
+ - name: model-download + ref: + name: helm-upgrade-install + params: + # Location of helm chart + - name: git_url + value: "https://github.com/kalantar/llm-d-benchmark" + - name: git_revision + value: "tekton-poc" + - name: checkout_dir + value: "/tmp/llm-d-benchmark" + + # Helm arguments + - name: releaseName + value: $(params.experimentName)-download + - name: chart + value: /tmp/llm-d-benchmark/charts/model-download + - name: namespace + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) + - name: timeout + value: 15m + # - name: valuesYamlUrl + # value: "/tmp/llm-d-benchmark/charts/model-download/values.yaml" + - name: extraArgs + value: > + --set hf_model=$(params.model-id) + --set pvc.create=true + --set pvc.name=$(params.model-pvc-name) + --set pvc.size=$(params.model-pvc-size) + --set pvc.storageClass=$(params.model-storage-class) + + - name: dry-run + value: $(params.dry-run) + + - name: wait-for-download + image: alpine/kubectl:1.34.1 + env: + - name: JOB_NAME + value: "llm-d-benchark-job" + - name: NAMESPACE + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) + - name: TIMEOUT + value: "300" # seconds + - name: SLEEP_INTERVAL + value: "5" # seconds + script : | + #!/usr/bin/env sh + + echo "⏳ Wait for model to download" + + elapsed=0 + + while [ "$elapsed" -lt "${TIMEOUT}" ]; do + status=$(kubectl get job "${JOB_NAME}" -n "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}') + if [ "$status" = "True" ]; then + echo "✅ Job succeeded" + kubectl delete job "${JOB_NAME}" -n "${NAMESPACE}" --ignore-not-found + exit 0 + fi + + status=$(kubectl get job "${JOB_NAME}" -n "${NAMESPACE}" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}') + if [ "$status" = "True" ]; then + echo "❌ Job failed" + kubectl delete job "${JOB_NAME}" -n "${NAMESPACE}" --ignore-not-found + exit 1 + fi + + sleep "${SLEEP_INTERVAL}" + elapsed=$((elapsed + SLEEP_INTERVAL)) + done + + echo "❌ Timed out waiting for job to complete or fail" + kubectl delete job "${JOB_NAME}" -n "${NAMESPACE}" --ignore-not-found + exit 2 + + - name: gateway + ref: + name: helm-upgrade-install + params: + - name: releaseName + value: $(params.experimentName)-gateway + - name: chart + value: llm-d-infra/llm-d-infra + - name: repoName + value: llm-d-infra + - name: repoUrl + value: https://llm-d-incubation.github.io/llm-d-infra/ + + - name: namespace + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) + - name: timeout + value: 15m + - name: valuesYamlUrl + value: "$(params.stackBaseUrl)/gateway-values.yaml" + + - name: dry-run + value: $(params.dry-run) + + - name: gaie + ref: + name: helm-upgrade-install + params: + - name: releaseName + value: $(params.experimentName)-gaie-NAMESPACE_HASH + - name: chart + value: oci://registry.k8s.io/gateway-api-inference-extension/charts/inferencepool + - name: version + value: $(params.gaieChartVersion) + + - name: namespace + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) + - name: timeout + value: 15m + - name: valuesYamlUrl + value: "$(params.stackBaseUrl)/gaie-values.yaml" + - name: treatmentAnalysis + value: "$(steps.analyze-gaie-factors.results.treatmentAnalysis)" + + - name: dry-run + value: $(params.dry-run) + + - name: model-engine + ref: + name: helm-upgrade-install + params: + - name: releaseName + value: $(params.experimentName)-ms + - name: chart + value: llm-d-modelservice/llm-d-modelservice + - name: repoName + value: llm-d-modelservice + - name: repoUrl + value: 
https://llm-d-incubation.github.io/llm-d-modelservice/ + + - name: namespace + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) + - name: timeout + value: 15m + - name: valuesYamlUrl + value: "$(params.stackBaseUrl)/ms-values.yaml" + - name: extraArgs + value: > + --set routing.inferencePool.name=$(params.experimentName)-gaie-NAMESPACE_HASH + --set routing.httpRoute.rules[0].backendRefs[0].name=$(params.experimentName)-gaie-NAMESPACE_HASH + --set routing.httpRoute.rules[1].backendRefs[0].name=$(params.experimentName)-gaie-NAMESPACE_HASH + - name: treatmentAnalysis + value: "$(steps.analyze-modelservice-factors.results.treatmentAnalysis)" + + - name: dry-run + value: $(params.dry-run) + + - name: wait-for-model + env: + - name: DECODE_REPLICAS + value: $(steps.compute-capacity-validation-values.results.decodeReplicas) + - name: PREFILL_REPLICAS + value: $(steps.compute-capacity-validation-values.results.prefillReplicas) + image: alpine/kubectl:1.34.1 + script: | + #!/bin/sh + + if [ "$(params.dry-run)" = "true" ]; then + echo ">> skipping" + exit 0 + fi + NAMESPACE="$(params.targetNamespacePrefix)-$(context.taskRun.name)" + MODEL_ID="$(params.model-id)" + MODEL_LABEL=$(echo "$MODEL_ID" | tr '[:upper:]' '[:lower:]' | sed 's/[./]/-/g') + MODEL_START_TIMEOUT="$(params.modelWaitTimeout)" + + echo "⏳ Waiting for pods serving model ${MODEL_ID} to be 'Running'" + echo "Model label = ${MODEL_LABEL}" + + if [ ${DECODE_REPLICAS} -gt 0 ]; then + kubectl --namespace ${NAMESPACE} \ + wait pod \ + -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=decode \ + --for=create \ + --timeout=${MODEL_START_TIMEOUT}s + echo "✅ (decode) pods serving model ${MODEL_ID} created" + fi + + if [ ${PREFILL_REPLICAS} -gt 0 ]; then + kubectl --namespace ${NAMESPACE} \ + wait pod \ + -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=prefill \ + --for=create \ + --timeout=${MODEL_START_TIMEOUT}s + echo "✅ prefill pods serving model ${MODEL_ID} created" + fi + + if [ ${DECODE_REPLICAS} -gt 0 ]; then + kubectl --namespace ${NAMESPACE} \ + wait pod \ + -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=decode \ + --for=condition=Ready=True \ + --timeout=${MODEL_START_TIMEOUT}s + echo "✅ (decode) pods serving model ${MODEL_ID} ready" + fi + + if [ ${PREFILL_REPLICAS} -gt 0 ]; then + kubectl --namespace ${NAMESPACE} \ + wait pod \ + -l llm-d.ai/model=${MODEL_LABEL},llm-d.ai/role=prefill \ + --for=condition=Ready=True \ + --timeout=${MODEL_START_TIMEOUT}s + echo "✅ prefill pods serving model ${MODEL_ID} ready" + fi + + - name: prepare-workload-profile + ref: + name: prepare-workload-profile + params: + - name: harnessName + value: $(params.harnessName) + - name: harnessProfile + value: $(params.harnessProfile) + - name: treatmentAnalysis + value: $(steps.analyze-workload-factors.results.treatmentAnalysis) + - name: model-id + value: $(params.model-id) + - name: namespace + value: $(params.targetNamespacePrefix)-$(context.taskRun.name) + - name: pipelineUID + value: $(params.pipelineUID) + + - name: inference-perf-run + ref: + name: inference-perf-run + params: + - name: harnessName + value: $(params.harnessName) + - name: harnessProfile + value: $(params.harnessProfile) + - name: pipelineUID + value: $(params.pipelineUID) + when: + - input: $(params.harnessName) + operator: in + values: [ "inference-perf" ] + # computeResources: + # requests: + # memory: "32Gi" + # cpu: "16" + # limits: + # memory: "32Gi" + # cpu: "16" + + - name: inference-perf-analyze-results + ref: + name: inference-perf-analyze-results + params: + - name: 
harnessName + value: $(params.harnessName) + - name: pipelineUID + value: $(params.pipelineUID) + when: + - input: $(params.harnessName) + operator: in + values: [ "inference-perf" ] + + - name: vllm-benchmark-run + ref: + name: vllm-benchmark-run + params: + - name: harnessName + value: $(params.harnessName) + - name: harnessProfile + value: $(params.harnessProfile) + - name: pipelineUID + value: $(params.pipelineUID) + when: + - input: $(params.harnessName) + operator: in + values: [ "vllm-benchmark" ] + # computeResources: + # requests: + # memory: "32Gi" + # cpu: "16" + # limits: + # memory: "32Gi" + # cpu: "16" + + - name: vllm-benchmark-analyze-results + ref: + name: vllm-benchmark-analyze-results + params: + - name: harnessName + value: $(params.harnessName) + - name: pipelineUID + value: $(params.pipelineUID) + when: + - input: $(params.harnessName) + operator: in + values: [ "vllm-benchmark" ] + + - name: upload-results + image: ubuntu:24.04 + # Tried amazon/aws-cli:2.31.9 but latest tar available via `dnf install tar -y` is 1.34. + # There were sporadic errors "file changed as we read it". It may be caused by the way + # tar identifies file changes in v 1.34 (via ctime). A recommended solution is to move to 1.35. + # (See https://stackoverflow.com/a/77765876 and tar release notes https://lists.gnu.org/archive/html/info-gnu/2023-07/msg00005.html) + # A smaller image is probably desirable. A restriction is that AWS CLI v2 requires glibc. + workingDir: $(workspaces.data.path) + env: + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: $(params.s3-keys) + key: AWS_ACCESS_KEY_ID + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: $(params.s3-keys) + key: AWS_SECRET_ACCESS_KEY + - name: AWS_EC2_METADATA_DISABLED + value: "true" + script: | + #!/usr/bin/env sh + + if [ "$(params.step-upload-results)" = "false" ]; then + echo "Upload disabled ... skipping."
+ exit 0 + fi + + apt-get update && \ + apt-get install -y --no-install-recommends ca-certificates curl unzip tar gzip && \ + rm -rf /var/lib/apt/lists/* + + curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o /tmp/awscliv2.zip && \ + unzip /tmp/awscliv2.zip -d /tmp && \ + /tmp/aws/install && \ + rm -rf /tmp/aws /tmp/awscliv2.zip + + tar --version && gzip --version && aws --version + + EXPERIMENT_ID="experiment-$(echo -n $(params.pipelineUID) | cut -c1-8)" + EXPERIMENT_RESULTS_FOLDER="$(params.harnessName)_${EXPERIMENT_ID}_$(context.taskRun.name)" + ARCHIVE_NAME="${EXPERIMENT_RESULTS_FOLDER}.tar.gz" + + tar -czf ${ARCHIVE_NAME} \ + -C "$(workspaces.data.path)" ${EXPERIMENT_RESULTS_FOLDER} + + aws s3 cp ${ARCHIVE_NAME} "s3://$(params.s3-bucket)/${ARCHIVE_NAME}" \ + --endpoint-url "$(params.s3-endpoint)" \ + --content-type "application/x-tar" \ + --content-encoding "gzip" \ + --no-progress + # --recursive \ + + rm -rf ${ARCHIVE_NAME} + + echo "✅ Uploaded ${ARCHIVE_NAME} to s3://$(params.s3-bucket)" + + - name: delete-namespace + image: alpine/kubectl:1.34.1 + script : | + #!/bin/sh + + NAMESPACE="$(params.targetNamespacePrefix)-$(context.taskRun.name)" + DEBUG="$(params.debug)" + + if [ "${DEBUG}" = "true" ]; then + echo "⚠️ DEBUG=true; leaving namespace ${NAMESPACE} for inspection" + echo "⚠️ Manually clean up resources with \"kubectl delete namespace ${NAMESPACE}\"" + exit 0 + fi + + kubectl delete namespace ${NAMESPACE} + echo "✅ workload namespace ${NAMESPACE} deleted" + + - name: log-completion + image: alpine:3.20 + script: | + #!/bin/sh + echo "✅ Sweep step complete." diff --git a/tekton-poc/utility/transform-pr-parallel.py b/tekton-poc/utility/transform-pr-parallel.py new file mode 100644 index 00000000..426e0a58 --- /dev/null +++ b/tekton-poc/utility/transform-pr-parallel.py @@ -0,0 +1,256 @@ +#!/usr/bin/env python3 +import sys +import yaml +import itertools +import argparse +from copy import deepcopy + +def load_yaml_from_path_or_stdin(path: str): + """Load YAML from a file path or stdin if path == '-'.""" + if path == "-": + try: + return yaml.safe_load(sys.stdin) + except Exception as e: + raise ValueError(f"Failed to read YAML from stdin: {e}") + else: + try: + with open(path, "r") as f: + return yaml.safe_load(f) + except FileNotFoundError: + raise ValueError(f"Input file not found: {path}") + except Exception as e: + raise ValueError(f"Failed to read YAML from '{path}': {e}") + +def dump_yaml_to_path_or_stdout(data, path: str | None, announce_to_stderr: str | None = None): + """ + Write YAML to the given path. If path is None or '-', write to stdout with no extra noise. + If path is a real file, write there and optionally announce to stderr. + """ + if path is None or path == "-": + yaml.safe_dump(data, sys.stdout, sort_keys=False) + else: + with open(path, "w") as f: + yaml.safe_dump(data, f, sort_keys=False) + if announce_to_stderr: + print(announce_to_stderr, file=sys.stderr) + +# -------------------- EXPANSION (existing behavior) -------------------- # +def transform_matrix_to_batched_dict(original_yaml: dict, max_parallel: int, sliding_window: bool): + """ + Expand the matrix task into concrete tasks with runAfter enforcing either: + - barrier batching (default), or + - sliding-window (--sliding-window).
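+
+    Illustrative example (not taken from any input): with 6 matrix combinations,
+    max_parallel=2 and a base task named "task", the generated constraints are:
+      barrier batching: task-2, task-3 runAfter [task-0, task-1];
+                        task-4, task-5 runAfter [task-2, task-3]
+      sliding window:   task-2 runAfter [task-0]; task-3 runAfter [task-1];
+                        task-4 runAfter [task-2]; task-5 runAfter [task-3]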
+ """ + if max_parallel < 1: + raise ValueError("max_parallel must be >= 1") + + try: + pipeline_spec = original_yaml["spec"]["pipelineSpec"] + tasks = pipeline_spec["tasks"] + except Exception: + raise ValueError("Input YAML must contain spec.pipelineSpec.tasks") + + if not isinstance(tasks, list) or len(tasks) == 0: + raise ValueError("spec.pipelineSpec.tasks must be a non-empty list") + + base_task = deepcopy(tasks[0]) + base_name = base_task.get("name", "task") + + # matrix params + matrix_params = {} + for p in base_task.get("matrix", {}).get("params", []): + vals = p.get("value", []) + if not isinstance(vals, list): + vals = [vals] + matrix_params[p["name"]] = vals + + combos = list(itertools.product(*matrix_params.values())) if matrix_params else [tuple()] + total = len(combos) + + new_tasks = [] + for i, combo in enumerate(combos): + t = deepcopy(base_task) + t.pop("matrix", None) + t["name"] = f"{base_name}-{i}" + + t["params"] = deepcopy(base_task.get("params", [])) + [ + {"name": name, "value": value} + for name, value in zip(matrix_params.keys(), combo) + ] + + if sliding_window: + if i >= max_parallel: + t["runAfter"] = [f"{base_name}-{i - max_parallel}"] + else: + t.pop("runAfter", None) + else: + batch_index = i // max_parallel + if batch_index > 0: + prev_start = (batch_index - 1) * max_parallel + prev_end = min(batch_index * max_parallel, total) + t["runAfter"] = [f"{base_name}-{j}" for j in range(prev_start, prev_end)] + else: + t.pop("runAfter", None) + + new_tasks.append(t) + + new_pr = deepcopy(original_yaml) + new_pipeline_spec = deepcopy(pipeline_spec) + new_pipeline_spec["tasks"] = new_tasks + new_pr["spec"]["pipelineSpec"] = new_pipeline_spec + return new_pr + +# -------------------- UNROLLING (new behavior) -------------------- # +def transform_unroll_params_dict(original_yaml: dict, unroll_params: list[str]): + """ + Unroll (hoist) one or more matrix parameters into separate tasks. + + For given unroll_params (subset of the matrix param names): + - Create one task for each Cartesian product of the chosen params' values. + - In each task: + * Set the chosen params as fixed task 'params' (not matrix). + * Keep a 'matrix' of the remaining matrix params (if any). + - Do not add runAfter constraints (preserve original no-dependency behavior). + """ + if not unroll_params: + raise ValueError("unroll_params must be a non-empty list of parameter names") + + try: + pipeline_spec = original_yaml["spec"]["pipelineSpec"] + tasks = pipeline_spec["tasks"] + except Exception: + raise ValueError("Input YAML must contain spec.pipelineSpec.tasks") + + if not isinstance(tasks, list) or len(tasks) == 0: + raise ValueError("spec.pipelineSpec.tasks must be a non-empty list") + + base_task = deepcopy(tasks[0]) + base_name = base_task.get("name", "task") + + # Load matrix params preserving order as a list of (name, values) + matrix_params_list = [] + for p in base_task.get("matrix", {}).get("params", []): + vals = p.get("value", []) + if not isinstance(vals, list): + vals = [vals] + matrix_params_list.append((p["name"], vals)) + + if not matrix_params_list: + raise ValueError("Base task has no matrix to unroll") + + # Validate unroll params are present in matrix + matrix_names = [name for name, _ in matrix_params_list] + unknown = [n for n in unroll_params if n not in matrix_names] + if unknown: + raise ValueError(f"Unroll params not found in matrix: {unknown}. 
Available: {matrix_names}") + + # Split into "chosen" vs "remaining" + chosen = [(name, vals) for name, vals in matrix_params_list if name in unroll_params] + remaining = [(name, vals) for name, vals in matrix_params_list if name not in unroll_params] + + # Cartesian product over chosen + chosen_names = [name for name, _ in chosen] + chosen_values_lists = [vals for _, vals in chosen] + chosen_combos = list(itertools.product(*chosen_values_lists)) if chosen else [tuple()] + + new_tasks = [] + for i, combo in enumerate(chosen_combos): + t = deepcopy(base_task) + t["name"] = f"{base_name}-{i}" + + # Remove matrix entirely; we will rebuild it only with remaining params + t.pop("matrix", None) + + # Merge original params plus fixed chosen params for this task + t["params"] = deepcopy(base_task.get("params", [])) + [ + {"name": name, "value": value} + for name, value in zip(chosen_names, combo) + ] + + # Rebuild matrix from the remaining params (if any) + if remaining: + t["matrix"] = { + "params": [{"name": name, "value": vals} for name, vals in remaining] + } + else: + # Nothing remains; ensure no stray runAfter or matrix fields + t.pop("matrix", None) + + # Preserve lack of dependencies (no runAfter) unless the base had them explicitly + if "runAfter" in t: + # Typically matrix tasks don't carry runAfter; remove to keep parallelism by default + t.pop("runAfter", None) + + new_tasks.append(t) + + # Replace tasks with our new set + new_pr = deepcopy(original_yaml) + new_pipeline_spec = deepcopy(pipeline_spec) + new_pipeline_spec["tasks"] = new_tasks + new_pr["spec"]["pipelineSpec"] = new_pipeline_spec + return new_pr + +def main(): + parser = argparse.ArgumentParser( + description=( + "Tekton PipelineRun matrix transformer.\n" + "Default: expand the matrix to concrete tasks with barrier batching or sliding-window.\n" + "Use --unroll to split specified matrix params into separate tasks while keeping a reduced matrix." + ) + ) + parser.add_argument("input", help="Input PipelineRun YAML file or '-' for stdin") + + # Mutually exclusive: either unroll OR expand + mode_group = parser.add_mutually_exclusive_group() + mode_group.add_argument( + "--unroll", metavar="PARAMS", + help="Comma-separated matrix parameter names to hoist into tasks (e.g., 'gaiePluginConfig' or 'p1,p2')." + ) + mode_group.add_argument( + "--sliding-window", action="store_true", + help="(Expand mode) Use sliding-window scheduling (each task i depends on i-n). Default is barrier batching." + ) + + # Expansion options (used only if NOT --unroll) + parser.add_argument( + "-n", "--max-parallel", type=int, default=1, + help="(Expand mode) Maximum number of tasks to run in parallel. Default: 1" + ) + + parser.add_argument( + "-o", "--output", default=None, + help="Output file path. Use '-' or omit to write to stdout." 
+ ) + + args = parser.parse_args() + + try: + original = load_yaml_from_path_or_stdin(args.input) + + if args.unroll: + unroll_params = [s.strip() for s in args.unroll.split(",") if s.strip()] + transformed = transform_unroll_params_dict(original_yaml=original, unroll_params=unroll_params) + mode_desc = f"unroll={unroll_params}" + else: + transformed = transform_matrix_to_batched_dict( + original_yaml=original, + max_parallel=args.max_parallel, + sliding_window=args.sliding_window, + ) + mode_desc = "sliding-window" if args.sliding_window else "barrier" + + except Exception as e: + print(f"Error: {e}", file=sys.stderr) + sys.exit(1) + + announce = None + if args.output not in (None, "-"): + if args.unroll: + announce = f"✅ Transformed PipelineRun saved to '{args.output}' ({mode_desc})" + else: + announce = f"✅ Transformed PipelineRun saved to '{args.output}' (mode={mode_desc}, max_parallel={args.max_parallel})" + + dump_yaml_to_path_or_stdout(transformed, args.output, announce_to_stderr=announce) + +if __name__ == "__main__": + main() \ No newline at end of file