Skip to content

Commit 07c73f9

Browse files
committed
fix: pod index split
1 parent 0a08d57 commit 07c73f9

File tree

14 files changed

+473
-562
lines changed

14 files changed

+473
-562
lines changed
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
{{- /*
Render the NodeOverlay only when Karpenter's NodeOverlay CRD is installed.
CustomResourceDefinitions are cluster-scoped, so lookup takes an empty
namespace, and the object name is the CRD's metadata.name, which has the
form <plural>.<group>. Note that lookup returns an empty result when no
cluster is reachable (e.g. `helm template`), so nothing is rendered then.
*/ -}}
{{- if lookup "apiextensions.k8s.io/v1" "CustomResourceDefinition" "" "nodeoverlays.karpenter.sh" -}}
2+
apiVersion: karpenter.sh/v1alpha1
3+
kind: NodeOverlay
4+
metadata:
5+
  name: tensor-fusion-overlay
6+
spec:
7+
  requirements: []
8+
  capacity:
9+
    tensor-fusion.ai/index_0: 28
10+
    tensor-fusion.ai/index_1: 28
11+
    tensor-fusion.ai/index_2: 28
12+
    tensor-fusion.ai/index_3: 28
13+
    tensor-fusion.ai/index_4: 28
14+
    tensor-fusion.ai/index_5: 28
15+
    tensor-fusion.ai/index_6: 28
16+
    tensor-fusion.ai/index_7: 28
17+
    tensor-fusion.ai/index_8: 28
18+
    tensor-fusion.ai/index_9: 28
19+
    tensor-fusion.ai/index_a: 28
20+
    tensor-fusion.ai/index_b: 28
21+
    tensor-fusion.ai/index_c: 28
22+
    tensor-fusion.ai/index_d: 28
23+
    tensor-fusion.ai/index_e: 28
24+
    tensor-fusion.ai/index_f: 28
25+
{{- end }}

internal/constants/constants.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,8 +98,11 @@ const (
9898
// Additional worker pod template is set by user with /worker-pod-template annotation
9999
WorkerPodTemplateAnnotation = Domain + "/worker-pod-template"
100100

101-
// Pod index annotation for Device Plugin communication (1-512)
102-
PodIndexAnnotation = Domain + "/index"
101+
// Pod index annotation for Device Plugin communication (1-128)
102+
// When used as an annotation, this string is the full key; when used in resource limits, it is the key prefix.
103+
PodIndexAnnotation = Domain + "/index"
104+
PodIndexDelimiter = "_"
105+
PodDeviceAllocatedAnnotation = Domain + "/allocated"
103106

104107
WorkloadModeAnnotation = Domain + "/workload-mode"
105108
WorkloadModeDynamic = "dynamic"
@@ -244,6 +247,8 @@ const KarpenterNodePoolKind = "NodePool"
244247
const AcceleratorLabelVendor = Domain + "/hardware-vendor"
245248

246249
const (
247-
IndexRangeStart = 1
248-
IndexRangeEnd = 512
250+
// At most 16 x 8 = 128 dummy index devices:
251+
// resource keys tensor-fusion.ai/index_0 .. tensor-fusion.ai/index_f (16 keys), each with a quantity from 1 to 8.
252+
IndexKeyLength = 16
253+
IndexModLength = 8
249254
)

internal/hypervisor/api/worker_types.go

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
package api
22

33
import (
4+
"time"
5+
46
tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
57
)
68

@@ -21,11 +23,36 @@ type WorkerInfo struct {
2123
TemplateID string
2224
Annotations map[string]string
2325
PodIndex string
26+
27+
DeletedAt time.Time
2428
}
2529

2630
// WorkerAllocation groups a worker with its device information and the
// container-level settings (environment variables, mounts, device specs)
// recorded for it.
type WorkerAllocation struct {
2731
// WorkerInfo points to the worker this allocation belongs to
// (presumably; confirm against callers).
WorkerInfo *WorkerInfo
2832

2933
// DeviceInfos holds the complete or partitioned device info.
3034
DeviceInfos []*DeviceInfo
35+
36+
// Envs maps environment variable names to values; presumably injected
// into the worker container (TODO confirm).
Envs map[string]string
37+
38+
// Mounts lists the host volumes (see Mount) attached to this allocation.
Mounts []*Mount
39+
40+
// Devices lists the host devices (see DeviceSpec) attached to this allocation.
Devices []*DeviceSpec
41+
}
42+
43+
// DeviceSpec specifies a host device to mount into a container.
// All fields are serialized as camelCase JSON and omitted when empty.
44+
type DeviceSpec struct {
45+
// GuestPath is the device path as seen inside the container
// (assumed from the field name; confirm against the consumer).
GuestPath string `json:"guestPath,omitempty"`
46+
47+
// HostPath is the device path on the host (assumed; confirm).
HostPath string `json:"hostPath,omitempty"`
48+
49+
// Permissions presumably holds the device access mode string, as in the
// kubelet device plugin API (e.g. "rwm"); TODO confirm.
Permissions string `json:"permissions,omitempty"`
50+
}
51+
52+
// Mount specifies a host volume to mount into a container.
53+
// It is meant for locations where device libraries or tools are installed, on the host and in the container.
// Both fields are serialized as camelCase JSON and omitted when empty.
54+
type Mount struct {
55+
// GuestPath is the mount path as seen inside the container
// (assumed from the field name; confirm against the consumer).
GuestPath string `json:"guestPath,omitempty"`
56+
57+
// HostPath is the source path on the host (assumed; confirm).
HostPath string `json:"hostPath,omitempty"`
3158
}

internal/hypervisor/backend/kubernetes/apiserver.go renamed to internal/hypervisor/backend/kubernetes/api_client.go

Lines changed: 20 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -33,30 +33,30 @@ func init() {
3333
utilruntime.Must(tfv1.AddToScheme(scheme))
3434
}
3535

36-
// APIServer provides CRUD operations for GPU resources
37-
type APIServer struct {
36+
// APIClient provides CRUD operations for GPU resources
37+
type APIClient struct {
3838
client client.Client
3939
ctx context.Context
4040
}
4141

42-
// NewAPIServer creates a new API server instance with an existing client
43-
func NewAPIServer(ctx context.Context, k8sClient client.Client) *APIServer {
44-
return &APIServer{
42+
// NewAPIClient creates a new API client instance with an existing client
43+
func NewAPIClient(ctx context.Context, k8sClient client.Client) *APIClient {
44+
return &APIClient{
4545
client: k8sClient,
4646
ctx: ctx,
4747
}
4848
}
4949

50-
// NewAPIServerFromConfig creates a new API server instance from a rest.Config
51-
func NewAPIServerFromConfig(ctx context.Context, restConfig *rest.Config) (*APIServer, error) {
50+
// NewAPIClientFromConfig creates a new API client instance from a rest.Config
51+
func NewAPIClientFromConfig(ctx context.Context, restConfig *rest.Config) (*APIClient, error) {
5252
k8sClient, err := client.New(restConfig, client.Options{
5353
Scheme: scheme,
5454
})
5555
if err != nil {
5656
return nil, fmt.Errorf("failed to create Kubernetes client: %w", err)
5757
}
5858

59-
return &APIServer{
59+
return &APIClient{
6060
client: k8sClient,
6161
ctx: ctx,
6262
}, nil
@@ -76,7 +76,7 @@ type GPUInfo struct {
7676
}
7777

7878
// CreateOrUpdateGPU creates or updates a GPU resource with metadata and status
79-
func (a *APIServer) CreateOrUpdateGPU(gpuNode *tfv1.GPUNode, info GPUInfo) (*tfv1.GPU, error) {
79+
func (a *APIClient) CreateOrUpdateGPU(gpuNode *tfv1.GPUNode, info GPUInfo) (*tfv1.GPU, error) {
8080
if len(gpuNode.OwnerReferences) == 0 {
8181
return nil, fmt.Errorf("GPUNode %s has no owner references", gpuNode.Name)
8282
}
@@ -144,7 +144,7 @@ func (a *APIServer) CreateOrUpdateGPU(gpuNode *tfv1.GPUNode, info GPUInfo) (*tfv
144144
}
145145

146146
// setGPUStatus sets the GPU status fields from GPUInfo
147-
func (a *APIServer) setGPUStatus(gpu *tfv1.GPU, info GPUInfo) {
147+
func (a *APIClient) setGPUStatus(gpu *tfv1.GPU, info GPUInfo) {
148148
gpu.Status.Capacity = &tfv1.Resource{
149149
Vram: resource.MustParse(fmt.Sprintf("%dMi", info.VRAMBytes/bytesPerMiB)),
150150
Tflops: info.TFlops,
@@ -171,7 +171,7 @@ func (a *APIServer) setGPUStatus(gpu *tfv1.GPU, info GPUInfo) {
171171
}
172172

173173
// GetGPU retrieves a GPU resource by UUID
174-
func (a *APIServer) GetGPU(uuid string) (*tfv1.GPU, error) {
174+
func (a *APIClient) GetGPU(uuid string) (*tfv1.GPU, error) {
175175
gpu := &tfv1.GPU{}
176176
if err := a.client.Get(a.ctx, client.ObjectKey{Name: uuid}, gpu); err != nil {
177177
return nil, fmt.Errorf("failed to get GPU %s: %w", uuid, err)
@@ -180,7 +180,7 @@ func (a *APIServer) GetGPU(uuid string) (*tfv1.GPU, error) {
180180
}
181181

182182
// ListGPUs lists all GPU resources
183-
func (a *APIServer) ListGPUs() (*tfv1.GPUList, error) {
183+
func (a *APIClient) ListGPUs() (*tfv1.GPUList, error) {
184184
gpuList := &tfv1.GPUList{}
185185
if err := a.client.List(a.ctx, gpuList); err != nil {
186186
return nil, fmt.Errorf("failed to list GPUs: %w", err)
@@ -189,7 +189,7 @@ func (a *APIServer) ListGPUs() (*tfv1.GPUList, error) {
189189
}
190190

191191
// UpdateGPUStatus updates the status of a GPU resource using merge patch
192-
func (a *APIServer) UpdateGPUStatus(gpu *tfv1.GPU) error {
192+
func (a *APIClient) UpdateGPUStatus(gpu *tfv1.GPU) error {
193193
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
194194
current := &tfv1.GPU{}
195195
if err := a.client.Get(a.ctx, client.ObjectKeyFromObject(gpu), current); err != nil {
@@ -203,7 +203,7 @@ func (a *APIServer) UpdateGPUStatus(gpu *tfv1.GPU) error {
203203
}
204204

205205
// patchGPUStatus patches a specific GPU status field using a function
206-
func (a *APIServer) patchGPUStatus(uuid string, updateFn func(*tfv1.GPU)) error {
206+
func (a *APIClient) patchGPUStatus(uuid string, updateFn func(*tfv1.GPU)) error {
207207
return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
208208
gpu, err := a.GetGPU(uuid)
209209
if err != nil {
@@ -217,21 +217,21 @@ func (a *APIServer) patchGPUStatus(uuid string, updateFn func(*tfv1.GPU)) error
217217
}
218218

219219
// UpdateGPUAvailableResources updates the available resources of a GPU
220-
func (a *APIServer) UpdateGPUAvailableResources(uuid string, available *tfv1.Resource) error {
220+
func (a *APIClient) UpdateGPUAvailableResources(uuid string, available *tfv1.Resource) error {
221221
return a.patchGPUStatus(uuid, func(gpu *tfv1.GPU) {
222222
gpu.Status.Available = available
223223
})
224224
}
225225

226226
// UpdateGPUPhase updates the phase of a GPU
227-
func (a *APIServer) UpdateGPUPhase(uuid string, phase tfv1.TensorFusionGPUPhase) error {
227+
func (a *APIClient) UpdateGPUPhase(uuid string, phase tfv1.TensorFusionGPUPhase) error {
228228
return a.patchGPUStatus(uuid, func(gpu *tfv1.GPU) {
229229
gpu.Status.Phase = phase
230230
})
231231
}
232232

233233
// GetGPUNode retrieves a GPUNode resource by name
234-
func (a *APIServer) GetGPUNode(name string) (*tfv1.GPUNode, error) {
234+
func (a *APIClient) GetGPUNode(name string) (*tfv1.GPUNode, error) {
235235
gpuNode := &tfv1.GPUNode{}
236236
if err := a.client.Get(a.ctx, client.ObjectKey{Name: name}, gpuNode); err != nil {
237237
return nil, fmt.Errorf("failed to get GPUNode %s: %w", name, err)
@@ -240,7 +240,7 @@ func (a *APIServer) GetGPUNode(name string) (*tfv1.GPUNode, error) {
240240
}
241241

242242
// UpdateGPUNodeStatus updates the status of a GPUNode resource
243-
func (a *APIServer) UpdateGPUNodeStatus(
243+
func (a *APIClient) UpdateGPUNodeStatus(
244244
gpuNode *tfv1.GPUNode,
245245
totalTFlops, totalVRAM resource.Quantity,
246246
totalGPUs int32,
@@ -259,7 +259,7 @@ func (a *APIServer) UpdateGPUNodeStatus(
259259
}
260260

261261
// updateGPUNodeStatus updates GPUNode status fields
262-
func (a *APIServer) updateGPUNodeStatus(
262+
func (a *APIClient) updateGPUNodeStatus(
263263
status *tfv1.GPUNodeStatus,
264264
totalTFlops, totalVRAM resource.Quantity,
265265
totalGPUs int32,
@@ -277,7 +277,7 @@ func (a *APIServer) updateGPUNodeStatus(
277277
}
278278

279279
// DeleteGPU deletes a GPU resource
280-
func (a *APIServer) DeleteGPU(uuid string) error {
280+
func (a *APIClient) DeleteGPU(uuid string) error {
281281
gpu := &tfv1.GPU{
282282
ObjectMeta: metav1.ObjectMeta{
283283
Name: uuid,

0 commit comments

Comments
 (0)