Skip to content

Commit 5867f3c

Browse files
authored
feat: preempt support for GPU workers (#366)
* fix: GPU info update
* feat: preempt scheduling, fix metrics scheduling bugs, add eviction protection
* fix: unit test issue
* fix: preempt unit testing
* fix: lint issue, add QoS to priorityClassName conversion
1 parent a45ba60 commit 5867f3c

32 files changed

+833
-125
lines changed

.vscode/settings.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
"envtest",
4848
"essd",
4949
"Eventf",
50+
"evictable",
5051
"featuregate",
5152
"finalizer",
5253
"Finalizers",
@@ -133,6 +134,7 @@
133134
"schedulingconfigtemplate",
134135
"schedulingconfigtemplates",
135136
"schedulingcorev",
137+
"schedv",
136138
"serviceaccount",
137139
"shirou",
138140
"shortuuid",

api/v1/gpupool_types.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,12 @@ type QosConfig struct {
238238
Definitions []QosDefinition `json:"definitions,omitempty"`
239239
DefaultQoS QoSLevel `json:"defaultQoS,omitempty"`
240240
Pricing []QosPricing `json:"pricing,omitempty"`
241+
242+
// EvictionProtectionPriceRatio is the price ratio applied to the cost calculation during the eviction protection period.
243+
// This multiplier raises the price of protected workloads to discourage preemption.
244+
// +optional
245+
// +kubebuilder:default="1.2"
246+
EvictionProtectionPriceRatio string `json:"evictionProtectionPriceRatio,omitempty"`
241247
}
242248

243249
type QosDefinition struct {

api/v1/gpuresourcequota_types.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,8 @@ type AllocRequest struct {
186186

187187
// record the pod meta for quota check
188188
PodMeta metav1.ObjectMeta
189+
190+
QoS QoSLevel
189191
}
190192

191193
func (p *AllocRequest) Clone() fwk.StateData {

charts/tensor-fusion/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ type: application
1515
# This is the chart version. This version number should be incremented each time you make changes
1616
# to the chart and its templates, including the app version.
1717
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18-
version: 1.5.8
18+
version: 1.5.9
1919

2020
# This is the version number of the application being deployed. This version number should be
2121
# incremented each time you make changes to the application. Versions are not expected to

charts/tensor-fusion/crds/tensor-fusion.ai_gpupools.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -562,6 +562,12 @@ spec:
562562
type: integer
563563
type: object
564564
type: array
565+
evictionProtectionPriceRatio:
566+
default: "1.2"
567+
description: |-
568+
Eviction protection price ratio applied to cost calculation during protection period
569+
This multiplier increases pricing for protected workloads to discourage preemption
570+
type: string
565571
pricing:
566572
items:
567573
properties:

charts/tensor-fusion/crds/tensor-fusion.ai_tensorfusionclusters.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,12 @@ spec:
629629
type: integer
630630
type: object
631631
type: array
632+
evictionProtectionPriceRatio:
633+
default: "1.2"
634+
description: |-
635+
Eviction protection price ratio applied to cost calculation during protection period
636+
This multiplier increases pricing for protected workloads to discourage preemption
637+
type: string
632638
pricing:
633639
items:
634640
properties:

charts/tensor-fusion/templates/controller-deployment.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ spec:
3232
{{- end }}
3333
serviceAccountName: {{ include "tensor-fusion.serviceAccountName" . }}
3434
enableServiceLinks: false
35+
priorityClassName: "system-cluster-critical"
3536
containers:
3637
- name: controller
3738
image: "{{ .Values.controller.image.repository }}:{{ .Values.controller.image.tag | default .Chart.AppVersion }}"

charts/tensor-fusion/templates/gpu-public-gpu-info.yaml

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,18 @@ data:
4545
costPerHour: 1.64
4646
fp16TFlops: 312
4747
48+
- model: A100_PCIe_40GB
49+
fullModelName: "NVIDIA A100-PCIE-40GB"
50+
vendor: NVIDIA
51+
costPerHour: 1.64
52+
fp16TFlops: 312
53+
54+
- model: A100_PCIe_80GB
55+
fullModelName: "NVIDIA A100-PCIE-80GB"
56+
vendor: NVIDIA
57+
costPerHour: 1.64
58+
fp16TFlops: 312
59+
4860
- model: A100_SXM_40G
4961
fullModelName: "NVIDIA A100-SXM4-40GB"
5062
vendor: NVIDIA
@@ -70,13 +82,13 @@ data:
7082
fp16TFlops: 312
7183
7284
- model: A800_PCIe_80G
73-
fullModelName: "NVIDIA A800 80GB PCIe"
85+
fullModelName: "NVIDIA A800-PCIE-80GB"
7486
vendor: NVIDIA
7587
costPerHour: 1.64
7688
fp16TFlops: 312
7789
7890
- model: A800_PCIe_40G
79-
fullModelName: "NVIDIA A800 40GB PCIe"
91+
fullModelName: "NVIDIA A800-PCIE-40GB"
8092
vendor: NVIDIA
8193
costPerHour: 1.64
8294
fp16TFlops: 312
@@ -95,7 +107,7 @@ data:
95107
fp16TFlops: 125
96108
97109
- model: A40
98-
fullModelName: "NVIDIA A40 48GB PCIe"
110+
fullModelName: "NVIDIA A40-PCIE-48GB"
99111
vendor: NVIDIA
100112
costPerHour: 0.4
101113
fp16TFlops: 149.7
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
apiVersion: scheduling.k8s.io/v1
2+
kind: PriorityClass
3+
metadata:
4+
name: tensor-fusion-critical
5+
value: 100000
6+
globalDefault: false
7+
description: "TensorFusion critical priority"
8+
---
9+
apiVersion: scheduling.k8s.io/v1
10+
kind: PriorityClass
11+
metadata:
12+
name: tensor-fusion-high
13+
value: 10000
14+
globalDefault: false
15+
description: "TensorFusion high priority"
16+
---
17+
apiVersion: scheduling.k8s.io/v1
18+
kind: PriorityClass
19+
metadata:
20+
name: tensor-fusion-medium
21+
value: 0
22+
globalDefault: false
23+
description: "TensorFusion medium priority"

charts/tensor-fusion/values.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -169,8 +169,8 @@ schedulerConfig:
169169
kind: KubeSchedulerConfiguration
170170
clientConnection:
171171
kubeconfig: ""
172-
qps: 50
173-
burst: 100
172+
qps: 1000
173+
burst: 2000
174174
profiles:
175175
# Refer: https://kubernetes.io/docs/reference/scheduling/config/
176176
- schedulerName: tensor-fusion-scheduler

0 commit comments

Comments
 (0)