Skip to content

Commit dee5311

Browse files
authored
auto-create dra claims (#609)
Signed-off-by: Varun Ramachandra Sekar <[email protected]>
1 parent bae9dd8 commit dee5311

35 files changed

+3781
-266
lines changed

api/apps/v1alpha1/common_types.go

Lines changed: 0 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -328,83 +328,3 @@ type PersistentVolumeClaim struct {
328328
// Annotations for the PVC
329329
Annotations map[string]string `json:"annotations,omitempty"`
330330
}
331-
332-
// DRAResource references exactly one ResourceClaim, either directly
333-
// or by naming a ResourceClaimTemplate which is then turned into a ResourceClaim.
334-
//
335-
// When creating the NIMService pods, it adds a name (`DNS_LABEL` format) to it
336-
// that uniquely identifies the DRA resource.
337-
// +kubebuilder:validation:XValidation:rule="has(self.resourceClaimName) != has(self.resourceClaimTemplateName)",message="exactly one of spec.resourceClaimName and spec.resourceClaimTemplateName must be set."
338-
type DRAResource struct {
339-
// ResourceClaimName is the name of a ResourceClaim object in the same
340-
// namespace as the NIMService.
341-
//
342-
// Exactly one of ResourceClaimName and ResourceClaimTemplateName must
343-
// be set.
344-
//
345-
// +kubebuilder:validation:MinLength=1
346-
// +kubebuilder:validation:MaxLength=253
347-
// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*`
348-
ResourceClaimName *string `json:"resourceClaimName,omitempty"`
349-
350-
// ResourceClaimTemplateName is the name of a ResourceClaimTemplate
351-
// object in the same namespace as the pods for this NIMService.
352-
//
353-
// The template will be used to create a new ResourceClaim, which will
354-
// be bound to the pods created for this NIMService.
355-
//
356-
// Exactly one of ResourceClaimName and ResourceClaimTemplateName must
357-
// be set.
358-
//
359-
// +kubebuilder:validation:MinLength=1
360-
// +kubebuilder:validation:MaxLength=253
361-
// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*`
362-
ResourceClaimTemplateName *string `json:"resourceClaimTemplateName,omitempty"`
363-
364-
// Requests is the list of requests in the referenced ResourceClaim/ResourceClaimTemplate
365-
// to be made available to the model container of the NIMService pods.
366-
//
367-
// If empty, everything from the claim is made available, otherwise
368-
// only the result of this subset of requests.
369-
//
370-
// +kubebuilder:validation:items:MinLength=1
371-
Requests []string `json:"requests,omitempty"`
372-
}
373-
374-
// DRAResourceStatus defines the status of the DRAResource.
375-
// +kubebuilder:validation:XValidation:rule="has(self.resourceClaimStatus) != has(self.resourceClaimTemplateStatus)",message="exactly one of resourceClaimStatus and resourceClaimTemplateStatus must be set."
376-
type DRAResourceStatus struct {
377-
// Name is the pod claim name referenced in the pod spec as `spec.resourceClaims[].name` for this DRA resource.
378-
Name string `json:"name"`
379-
// ResourceClaimStatus is the status of the resource claim in this DRA resource.
380-
//
381-
// Exactly one of resourceClaimStatus and resourceClaimTemplateStatus will be set.
382-
ResourceClaimStatus *DRAResourceClaimStatusInfo `json:"resourceClaimStatus,omitempty"`
383-
// ResourceClaimTemplateStatus is the status of the resource claim template in this DRA resource.
384-
//
385-
// Exactly one of resourceClaimStatus and resourceClaimTemplateStatus will be set.
386-
ResourceClaimTemplateStatus *DRAResourceClaimTemplateStatusInfo `json:"resourceClaimTemplateStatus,omitempty"`
387-
}
388-
389-
// DRAResourceClaimStatusInfo defines the status of a ResourceClaim referenced in the DRAResource.
390-
type DRAResourceClaimStatusInfo struct {
391-
// Name is the name of the ResourceClaim.
392-
Name string `json:"name"`
393-
// State is the state of the ResourceClaim.
394-
// * pending: the resource claim is pending allocation.
395-
// * deleted: the resource claim has a deletion timestamp set but is not yet finalized.
396-
// * allocated: the resource claim is allocated to a pod.
397-
// * reserved: the resource claim is consumed by a pod.
398-
// This field will have one or more of the above values depending on the status of the resource claim.
399-
//
400-
// +kubebuilder:validation:default=pending
401-
State string `json:"state"`
402-
}
403-
404-
// DRAResourceClaimTemplateStatusInfo defines the status of a ResourceClaimTemplate referenced in the DRAResource.
405-
type DRAResourceClaimTemplateStatusInfo struct {
406-
// Name is the name of the resource claim template.
407-
Name string `json:"name"`
408-
// ResourceClaimStatuses is the statuses of the generated resource claims from this resource claim template.
409-
ResourceClaimStatuses []DRAResourceClaimStatusInfo `json:"resourceClaimStatuses,omitempty"`
410-
}

api/apps/v1alpha1/dra_types.go

Lines changed: 310 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,310 @@
1+
/*
2+
Copyright 2025.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package v1alpha1
18+
19+
import (
20+
"fmt"
21+
22+
apiresource "k8s.io/apimachinery/pkg/api/resource"
23+
24+
"github.com/NVIDIA/k8s-nim-operator/internal/k8sutil"
25+
k8sutilcel "github.com/NVIDIA/k8s-nim-operator/internal/k8sutil/cel"
26+
)
27+
28+
// DRAResource references exactly one ResourceClaim, either directly
29+
// or by naming a ResourceClaimTemplate which is then turned into a ResourceClaim.
30+
//
31+
// When creating the NIMService pods, it adds a name (`DNS_LABEL` format) to it
32+
// that uniquely identifies the DRA resource.
33+
// +kubebuilder:validation:XValidation:rule="(has(self.resourceClaimName) ? 1 : 0) + (has(self.resourceClaimTemplateName) ? 1 : 0) + (has(self.claimSpec) ? 1 : 0) == 1",message="exactly one of spec.resourceClaimName, spec.resourceClaimTemplateName, or spec.claimSpec must be set."
34+
type DRAResource struct {
35+
// ResourceClaimName is the name of a DRA resource claim object in the same
36+
// namespace as the NIMService.
37+
//
38+
// Exactly one of ResourceClaimName and ResourceClaimTemplateName must
39+
// be set.
40+
//
41+
// +kubebuilder:validation:MinLength=1
42+
// +kubebuilder:validation:MaxLength=253
43+
// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*`
44+
ResourceClaimName *string `json:"resourceClaimName,omitempty"`
45+
46+
// ResourceClaimTemplateName is the name of a DRA resource claim template
47+
// object in the same namespace as the pods for this NIMService.
48+
//
49+
// The template will be used to create a new DRA resource claim, which will
50+
// be bound to the pods created for this NIMService.
51+
//
52+
// Exactly one of ResourceClaimName and ResourceClaimTemplateName must
53+
// be set.
54+
//
55+
// +kubebuilder:validation:MinLength=1
56+
// +kubebuilder:validation:MaxLength=253
57+
// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*`
58+
ResourceClaimTemplateName *string `json:"resourceClaimTemplateName,omitempty"`
59+
60+
// ClaimSpec is the spec to auto-generate a DRA resource claim/resource claim template. Only one of ClaimSpec, ResourceClaimName or ResourceClaimTemplateName must be specified.
61+
ClaimSpec *DRAClaimSpec `json:"claimSpec,omitempty"`
62+
63+
// Requests is the list of requests in the referenced DRA resource claim/resource claim template
64+
// to be made available to the model container of the NIMService pods.
65+
//
66+
// If empty, everything from the claim is made available, otherwise
67+
// only the result of this subset of requests.
68+
//
69+
// +kubebuilder:validation:items:MinLength=1
70+
Requests []string `json:"requests,omitempty"`
71+
}
72+
73+
// DRADeviceAttributeMatcherValue defines the value of a device attribute to match.
// Exactly one of the fields must be set.
type DRADeviceAttributeMatcherValue struct {
	// BoolValue is a true/false value.
	BoolValue *bool `json:"boolValue,omitempty"`
	// IntValue is a number.
	IntValue *int32 `json:"intValue,omitempty"`
	// StringValue is a string value.
	// +kubebuilder:validation:MaxLength=64
	StringValue *string `json:"stringValue,omitempty"`
	// VersionValue is a semantic version according to semver.org spec 2.0.0.
	// +kubebuilder:validation:MaxLength=64
	VersionValue *string `json:"versionValue,omitempty"`
}

// GetValue returns the Go value held by the first field that is set, checked
// in the order BoolValue, IntValue, StringValue, VersionValue. IntValue is
// returned as an int; the other fields are returned as their pointed-to type.
//
// It returns nil when no field is set. It also returns nil for a nil receiver,
// because the matcher's Value field is an optional pointer and callers invoke
// this method on it directly.
func (d *DRADeviceAttributeMatcherValue) GetValue() any {
	if d == nil {
		return nil
	}
	switch {
	case d.BoolValue != nil:
		return *d.BoolValue
	case d.IntValue != nil:
		return int(*d.IntValue)
	case d.StringValue != nil:
		return *d.StringValue
	case d.VersionValue != nil:
		return *d.VersionValue
	}
	return nil
}
101+
102+
func (d *DRADeviceAttributeMatcherValue) GetValueType() k8sutilcel.ValueType {
103+
switch {
104+
case d.BoolValue != nil:
105+
return k8sutilcel.TypeBool
106+
case d.IntValue != nil:
107+
return k8sutilcel.TypeInt
108+
case d.StringValue != nil:
109+
return k8sutilcel.TypeString
110+
case d.VersionValue != nil:
111+
return k8sutilcel.TypeSemver
112+
default:
113+
return k8sutilcel.TypeUnknown
114+
}
115+
}
116+
117+
// DRADeviceAttributeMatcherOp defines the operator to use for matching a device attribute.
118+
type DRADeviceAttributeMatcherOp string
119+
120+
const (
121+
DRADeviceAttributeMatcherOpEqual DRADeviceAttributeMatcherOp = "Equal"
122+
DRADeviceAttributeMatcherOpNotEqual DRADeviceAttributeMatcherOp = "NotEqual"
123+
DRADeviceAttributeMatcherOpGreaterThan DRADeviceAttributeMatcherOp = "GreaterThan"
124+
DRADeviceAttributeMatcherOpGreaterThanOrEqual DRADeviceAttributeMatcherOp = "GreaterThanOrEqual"
125+
DRADeviceAttributeMatcherOpLessThan DRADeviceAttributeMatcherOp = "LessThan"
126+
DRADeviceAttributeMatcherOpLessThanOrEqual DRADeviceAttributeMatcherOp = "LessThanOrEqual"
127+
)
128+
129+
func (d DRADeviceAttributeMatcherOp) GetCELOperator() k8sutilcel.ComparisonOperator {
130+
switch d {
131+
case DRADeviceAttributeMatcherOpEqual:
132+
return k8sutilcel.OpEqual
133+
case DRADeviceAttributeMatcherOpNotEqual:
134+
return k8sutilcel.OpNotEqual
135+
case DRADeviceAttributeMatcherOpGreaterThan:
136+
return k8sutilcel.OpGreater
137+
case DRADeviceAttributeMatcherOpGreaterThanOrEqual:
138+
return k8sutilcel.OpGreaterOrEqual
139+
case DRADeviceAttributeMatcherOpLessThan:
140+
return k8sutilcel.OpLess
141+
case DRADeviceAttributeMatcherOpLessThanOrEqual:
142+
return k8sutilcel.OpLessOrEqual
143+
default:
144+
return k8sutilcel.OpEqual
145+
}
146+
}
147+
148+
// DRADeviceAttributeMatcher defines the matcher expression for a DRA device attribute.
149+
type DRADeviceAttributeMatcher struct {
150+
// Key is the name of the device attribute to match.
151+
// This is either a qualified name or a simple name.
152+
// If it is a simple name, then it is assumed to be prefixed with the DRA driver name.
153+
// Eg: "gpu.nvidia.com/productName" is equivalent to "productName" if the driver name is "gpu.nvidia.com". Otherwise they're treated as 2 different attributes.
154+
// +kubebuilder:validation:MaxLength=64
155+
Key string `json:"key"`
156+
// Op is the operator to use for matching the device attribute. Supported operators are:
157+
// * Equal: The device attribute value must be equal to the value specified in the matcher.
158+
// * NotEqual: The device attribute value must not be equal to the value specified in the matcher.
159+
// * GreaterThan: The device attribute value must be greater than the value specified in the matcher.
160+
// * GreaterThanOrEqual: The device attribute value must be greater than or equal to the value specified in the matcher.
161+
// * LessThan: The device attribute value must be less than the value specified in the matcher.
162+
// * LessThanOrEqual: The device attribute value must be less than or equal to the value specified in the matcher.
163+
//
164+
// +kubebuilder:validation:Enum=Equal;NotEqual;GreaterThan;GreaterThanOrEqual;LessThan;LessThanOrEqual
165+
// +kubebuilder:default=Equal
166+
Op DRADeviceAttributeMatcherOp `json:"op"`
167+
// Value is the value to match the device attribute against.
168+
Value *DRADeviceAttributeMatcherValue `json:"value,omitempty"`
169+
}
170+
171+
func (d *DRADeviceAttributeMatcher) GetCELExpression(driverName string) (string, error) {
172+
domain, name := k8sutil.SplitQualifiedName(d.Key, driverName)
173+
attrKey := fmt.Sprintf("device.attributes[%q].%s", domain, name)
174+
return k8sutilcel.BuildExpr(attrKey, d.Op.GetCELOperator(), d.Value.GetValue(), d.Value.GetValueType())
175+
}
176+
177+
// DRAResourceQuantityMatcherOp defines the operator to use for matching a resource quantity.
178+
type DRAResourceQuantityMatcherOp string
179+
180+
const (
181+
DRAResourceQuantityMatcherOpEqual DRAResourceQuantityMatcherOp = "Equal"
182+
)
183+
184+
func (d DRAResourceQuantityMatcherOp) GetCELOperator() k8sutilcel.ComparisonOperator {
185+
switch d {
186+
case DRAResourceQuantityMatcherOpEqual:
187+
return k8sutilcel.OpEqual
188+
default:
189+
return k8sutilcel.OpEqual
190+
}
191+
}
192+
193+
// DRAResourceQuantityMatcher defines the matcher expression for a DRA device capacity.
194+
type DRAResourceQuantityMatcher struct {
195+
// Key is the name of the resource quantity to match.
196+
// This is either a qualified name or a simple name.
197+
// If it is a simple name, then it is assumed to be prefixed with the DRA driver name.
198+
// Eg: "gpu.nvidia.com/memory" is equivalent to "memory" if the driver name is "gpu.nvidia.com". Otherwise they're treated as 2 different attributes.
199+
// +kubebuilder:validation:MaxLength=64
200+
Key string `json:"key"`
201+
// Op is the operator to use for matching the device capacity. Supported operators are:
202+
// * Equal: The resource quantity value must be equal to the value specified in the matcher.
203+
//
204+
// +kubebuilder:validation:Enum=Equal
205+
// +kubebuilder:default=Equal
206+
Op DRAResourceQuantityMatcherOp `json:"op"`
207+
// Value is the quantity to match the device capacity against.
208+
//
209+
// +kubebuilder:validation:Required
210+
Value *apiresource.Quantity `json:"value,omitempty"`
211+
}
212+
213+
func (d *DRAResourceQuantityMatcher) GetCELExpression(driverName string) (string, error) {
214+
domain, name := k8sutil.SplitQualifiedName(d.Key, driverName)
215+
attrKey := fmt.Sprintf("device.capacity[%q].%s", domain, name)
216+
return k8sutilcel.BuildExpr(attrKey, d.Op.GetCELOperator(), d.Value, k8sutilcel.TypeQuantity)
217+
}
218+
219+
type DRADeviceSpec struct {
220+
// Name is the name of the device request to use in the generated claim spec.
221+
// Must be a valid DNS_LABEL.
222+
// +kubebuilder:validation:MinLength=1
223+
// +kubebuilder:validation:MaxLength=253
224+
// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*`
225+
Name string `json:"name"`
226+
// Count is the number of devices to request.
227+
// +kubebuilder:default=1
228+
Count uint32 `json:"count"`
229+
// DeviceClassName references a specific DeviceClass to inherit configuration and selectors from.
230+
// +kubebuilder:default=gpu.nvidia.com
231+
DeviceClassName string `json:"deviceClassName"`
232+
// DriverName is the name of the DRA driver providing the capacity information.
233+
// Must be a DNS subdomain.
234+
//
235+
// +kubebuilder:validation:MaxLength=253
236+
// +kubebuilder:validation:Pattern=`^[a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*`
237+
// +kubebuilder:default=gpu.nvidia.com
238+
DriverName string `json:"driverName,omitempty"`
239+
// MatchAttributes defines the criteria which must be satisfied by the device attributes of a device.
240+
// +kubebuilder:validation:MaxSize=20
241+
MatchAttributes []DRADeviceAttributeMatcher `json:"matchAttributes,omitempty"`
242+
// MatchCapacity defines the criteria which must be satisfied by the device capacity of a device.
243+
// +kubebuilder:validation:MaxSize=12
244+
MatchCapacity []DRAResourceQuantityMatcher `json:"matchCapacity,omitempty"`
245+
}
246+
247+
// DRAClaimSpec defines the spec for generating a DRA resource claim/resource claim template.
248+
type DRAClaimSpec struct {
249+
// GenerateName is an optional name prefix to use for generating the resource claim/resource claim template.
250+
// +kubebuilder:validation:MinLength=1
251+
// +kubebuilder:validation:MaxLength=16
252+
GenerateName string `json:"generateName,omitempty"`
253+
// +kubebuilder:validation:MinSize=1
254+
Devices []DRADeviceSpec `json:"devices"`
255+
// TODO: Warn that if set to false, then this NIMService cannot be scaled up.
256+
IsTemplate *bool `json:"isTemplate,omitempty"`
257+
}
258+
259+
func (d *DRAClaimSpec) IsTemplateSpec() bool {
260+
return d.IsTemplate != nil && *d.IsTemplate
261+
}
262+
263+
func (d *DRAClaimSpec) GetNamePrefix() string {
264+
namePrefix := d.GenerateName
265+
if namePrefix != "" {
266+
return namePrefix
267+
}
268+
if d.IsTemplateSpec() {
269+
return "claimtemplate"
270+
}
271+
return "claim"
272+
}
273+
274+
// DRAResourceStatus defines the status of the DRAResource.
// +kubebuilder:validation:XValidation:rule="has(self.resourceClaimStatus) != has(self.resourceClaimTemplateStatus)",message="exactly one of resourceClaimStatus and resourceClaimTemplateStatus must be set."
type DRAResourceStatus struct {
	// Name is the pod claim name referenced in the pod spec as `spec.resourceClaims[].name` for this DRA resource.
	Name string `json:"name"`
	// ResourceClaimStatus is the status of the resource claim in this DRA resource.
	//
	// Exactly one of resourceClaimStatus and resourceClaimTemplateStatus will be set.
	ResourceClaimStatus *DRAResourceClaimStatusInfo `json:"resourceClaimStatus,omitempty"`
	// ResourceClaimTemplateStatus is the status of the resource claim template in this DRA resource.
	//
	// Exactly one of resourceClaimStatus and resourceClaimTemplateStatus will be set.
	ResourceClaimTemplateStatus *DRAResourceClaimTemplateStatusInfo `json:"resourceClaimTemplateStatus,omitempty"`
}

// DRAResourceClaimStatusInfo defines the status of a ResourceClaim referenced in the DRAResource.
type DRAResourceClaimStatusInfo struct {
	// Name is the name of the ResourceClaim.
	Name string `json:"name"`
	// State is the state of the ResourceClaim.
	// * pending: the resource claim is pending allocation.
	// * deleted: the resource claim has a deletion timestamp set but is not yet finalized.
	// * allocated: the resource claim is allocated to a pod.
	// * reserved: the resource claim is consumed by a pod.
	// This field will have one or more of the above values depending on the status of the resource claim.
	//
	// +kubebuilder:default=pending
	State string `json:"state"`
}

// DRAResourceClaimTemplateStatusInfo defines the status of a ResourceClaimTemplate referenced in the DRAResource.
type DRAResourceClaimTemplateStatusInfo struct {
	// Name is the name of the resource claim template.
	Name string `json:"name"`
	// ResourceClaimStatuses is the statuses of the generated resource claims from this resource claim template.
	ResourceClaimStatuses []DRAResourceClaimStatusInfo `json:"resourceClaimStatuses,omitempty"`
}

0 commit comments

Comments
 (0)