Skip to content

Commit 5c770c4

Browse files
authored
fix: allocation detail optimization, show detailed GPU pods in CR (#433)
* fix: index allocator, pod webhook priorityClass issue
* fix: lint issue
* fix: add allocation detail in gpunode/gpu custom resource
1 parent af04aa5 commit 5c770c4

File tree

12 files changed

+431
-76
lines changed

12 files changed

+431
-76
lines changed

api/v1/gpu_types.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,19 @@ type RunningAppDetail struct {
7979

8080
// Worker count
8181
Count int `json:"count"`
82+
83+
// Pods running this workload, with each pod's name, namespace, UID, GPU requests/limits and QoS
84+
// +optional
85+
Pods []*PodGPUInfo `json:"pods,omitempty"`
86+
}
87+
88+
type PodGPUInfo struct {
89+
Name string `json:"name,omitempty"`
90+
Namespace string `json:"namespace,omitempty"`
91+
UID string `json:"uid,omitempty"`
92+
Requests Resource `json:"requests,omitempty"`
93+
Limits Resource `json:"limits,omitempty"`
94+
QoS QoSLevel `json:"qos,omitempty"`
8295
}
8396

8497
// +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating

api/v1/gpunode_funcs.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ func (node *GPUNode) InitializeStatus(initTFlops, initVRAM resource.Quantity, in
1010
TotalTFlops: initTFlops,
1111
TotalVRAM: initVRAM,
1212
TotalGPUs: initGPUs,
13-
AllocationInfo: []*RunningAppDetail{},
13+
AllocatedPods: make(map[string][]*PodGPUInfo),
1414
LoadedModels: &[]string{},
1515
ManagedGPUDeviceIDs: []string{},
1616
ObservedGeneration: node.Generation,

api/v1/gpunode_types.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,10 @@ type GPUNodeStatus struct {
9696
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
9797

9898
// +optional
99-
AllocationInfo []*RunningAppDetail `json:"allocationInfo,omitempty"`
99+
TotalGPUPods int32 `json:"totalGPUPods,omitempty"`
100+
101+
// +optional
102+
AllocatedPods map[string][]*PodGPUInfo `json:"allocatedPods,omitempty"`
100103
}
101104

102105
// +kubebuilder:validation:Enum=Pending;Provisioning;Migrating;Running;Succeeded;Failed;Unknown;Destroying

api/v1/zz_generated.deepcopy.go

Lines changed: 48 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/crds/tensor-fusion.ai_gpunodes.yaml

Lines changed: 75 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -86,21 +86,78 @@ spec:
8686
status:
8787
description: GPUNodeStatus defines the observed state of GPUNode.
8888
properties:
89-
allocationInfo:
90-
items:
91-
properties:
92-
count:
93-
description: Worker count
94-
type: integer
95-
name:
96-
description: Workload name namespace
97-
type: string
98-
namespace:
99-
type: string
100-
required:
101-
- count
102-
type: object
103-
type: array
89+
allocatedPods:
90+
additionalProperties:
91+
items:
92+
properties:
93+
limits:
94+
properties:
95+
compute:
96+
anyOf:
97+
- type: integer
98+
- type: string
99+
description: 0-100 percentage, mutually exclusive with
100+
TFLOPs
101+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
102+
x-kubernetes-int-or-string: true
103+
tflops:
104+
anyOf:
105+
- type: integer
106+
- type: string
107+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
108+
x-kubernetes-int-or-string: true
109+
vram:
110+
anyOf:
111+
- type: integer
112+
- type: string
113+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
114+
x-kubernetes-int-or-string: true
115+
required:
116+
- tflops
117+
- vram
118+
type: object
119+
name:
120+
type: string
121+
namespace:
122+
type: string
123+
qos:
124+
enum:
125+
- low
126+
- medium
127+
- high
128+
- critical
129+
type: string
130+
requests:
131+
properties:
132+
compute:
133+
anyOf:
134+
- type: integer
135+
- type: string
136+
description: 0-100 percentage, mutually exclusive with
137+
TFLOPs
138+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
139+
x-kubernetes-int-or-string: true
140+
tflops:
141+
anyOf:
142+
- type: integer
143+
- type: string
144+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
145+
x-kubernetes-int-or-string: true
146+
vram:
147+
anyOf:
148+
- type: integer
149+
- type: string
150+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
151+
x-kubernetes-int-or-string: true
152+
required:
153+
- tflops
154+
- vram
155+
type: object
156+
uid:
157+
type: string
158+
type: object
159+
type: array
160+
type: object
104161
availableTFlops:
105162
anyOf:
106163
- type: integer
@@ -221,6 +278,9 @@ spec:
221278
- Unknown
222279
- Destroying
223280
type: string
281+
totalGPUPods:
282+
format: int32
283+
type: integer
224284
totalGPUs:
225285
format: int32
226286
type: integer

charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,77 @@ spec:
159159
type: string
160160
namespace:
161161
type: string
162+
pods:
163+
description: Pods running this workload, with each pod's name, namespace, UID, GPU requests/limits and QoS
164+
items:
165+
properties:
166+
limits:
167+
properties:
168+
compute:
169+
anyOf:
170+
- type: integer
171+
- type: string
172+
description: 0-100 percentage, mutually exclusive
173+
with TFLOPs
174+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
175+
x-kubernetes-int-or-string: true
176+
tflops:
177+
anyOf:
178+
- type: integer
179+
- type: string
180+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
181+
x-kubernetes-int-or-string: true
182+
vram:
183+
anyOf:
184+
- type: integer
185+
- type: string
186+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
187+
x-kubernetes-int-or-string: true
188+
required:
189+
- tflops
190+
- vram
191+
type: object
192+
name:
193+
type: string
194+
namespace:
195+
type: string
196+
qos:
197+
enum:
198+
- low
199+
- medium
200+
- high
201+
- critical
202+
type: string
203+
requests:
204+
properties:
205+
compute:
206+
anyOf:
207+
- type: integer
208+
- type: string
209+
description: 0-100 percentage, mutually exclusive
210+
with TFLOPs
211+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
212+
x-kubernetes-int-or-string: true
213+
tflops:
214+
anyOf:
215+
- type: integer
216+
- type: string
217+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
218+
x-kubernetes-int-or-string: true
219+
vram:
220+
anyOf:
221+
- type: integer
222+
- type: string
223+
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
224+
x-kubernetes-int-or-string: true
225+
required:
226+
- tflops
227+
- vram
228+
type: object
229+
uid:
230+
type: string
231+
type: object
232+
type: array
162233
required:
163234
- count
164235
type: object

0 commit comments

Comments
 (0)