NexusGPU
diff --git a/‎charts/tensor-fusion/templates/node-overlay.yaml‎
Lines changed: 25 additions & 0 deletions b/‎charts/tensor-fusion/templates/node-overlay.yaml‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎internal/constants/constants.go‎
Lines changed: 9 additions & 4 deletions b/‎internal/constants/constants.go‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎internal/hypervisor/api/worker_types.go‎
Lines changed: 27 additions & 0 deletions b/‎internal/hypervisor/api/worker_types.go‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎internal/hypervisor/backend/kubernetes/apiserver.go‎ renamed to ‎internal/hypervisor/backend/kubernetes/api_client.go‎
Lines changed: 20 additions & 20 deletions b/‎internal/hypervisor/backend/kubernetes/apiserver.go‎ renamed to ‎internal/hypervisor/backend/kubernetes/api_client.go‎
Lines changed: 20 additions & 20 deletions
@@ -0,0 +1,25 @@
+{{- /*
+Render the NodeOverlay only when the Karpenter NodeOverlay CRD is installed.
+lookup takes (apiVersion, kind, namespace, name). CRDs are cluster-scoped, so the
+namespace must be "" and the name must be the CRD object name "<plural>.<group>".
+lookup returns an empty result during `helm template` and client-side dry runs,
+so the overlay is skipped there.
+*/ -}}
+{{- if lookup "apiextensions.k8s.io/v1" "CustomResourceDefinition" "" "nodeoverlays.karpenter.sh" -}}
+apiVersion: karpenter.sh/v1alpha1
+kind: NodeOverlay
+metadata:
+  name: tensor-fusion-overlay
+spec:
+  # An empty requirements list puts no node selection constraints on this overlay.
+  requirements: []
+  # One dummy extended resource per hex index key (index_0 .. index_f).
+  # NOTE(review): each key advertises 28 here, while the Go constants describe
+  # 16 keys x 8 (IndexModLength = 8) - confirm that 28 is intentional.
+  capacity:
+    tensor-fusion.ai/index_0: 28
+    tensor-fusion.ai/index_1: 28
+    tensor-fusion.ai/index_2: 28
+    tensor-fusion.ai/index_3: 28
+    tensor-fusion.ai/index_4: 28
+    tensor-fusion.ai/index_5: 28
+    tensor-fusion.ai/index_6: 28
+    tensor-fusion.ai/index_7: 28
+    tensor-fusion.ai/index_8: 28
+    tensor-fusion.ai/index_9: 28
+    tensor-fusion.ai/index_a: 28
+    tensor-fusion.ai/index_b: 28
+    tensor-fusion.ai/index_c: 28
+    tensor-fusion.ai/index_d: 28
+    tensor-fusion.ai/index_e: 28
+    tensor-fusion.ai/index_f: 28
+{{- end }}
@@ -98,8 +98,11 @@ const (
 	// Additional worker pod template is set by user with /worker-pod-template annotation
 	WorkerPodTemplateAnnotation = Domain + "/worker-pod-template"
 
-	// Pod index annotation for Device Plugin communication (1-512)
-	PodIndexAnnotation = Domain + "/index"
+	// Pod index annotation for Device Plugin communication (1-128)
+	// When used as an annotation, this string is the key; when used in resource limits, it is the key prefix
+	PodIndexAnnotation           = Domain + "/index"
+	PodIndexDelimiter            = "_"
+	PodDeviceAllocatedAnnotation = Domain + "/allocated"
 
 	WorkloadModeAnnotation = Domain + "/workload-mode"
 	WorkloadModeDynamic    = "dynamic"
@@ -244,6 +247,8 @@ const KarpenterNodePoolKind = "NodePool"
 const AcceleratorLabelVendor = Domain + "/hardware-vendor"
 
 const (
-	IndexRangeStart = 1
-	IndexRangeEnd   = 512
+	// At most 16x8 = 128 dummy index devices: 16 resource keys (IndexKeyLength) x 8 units per key (IndexModLength),
+	// i.e. from tensor-fusion.ai/index_0: 1 up to tensor-fusion.ai/index_f: 8
+	IndexKeyLength = 16
+	IndexModLength = 8
 )
@@ -1,6 +1,8 @@
 package api
 
 import (
+	"time"
+
 	tfv1 "github.com/NexusGPU/tensor-fusion/api/v1"
 )
 
@@ -21,11 +23,36 @@ type WorkerInfo struct {
 	TemplateID        string
 	Annotations       map[string]string
 	PodIndex          string
+
+	DeletedAt time.Time
 }
 
 type WorkerAllocation struct {
 	WorkerInfo *WorkerInfo
 
 	// DeviceInfos is the complete or partitioned device info
 	DeviceInfos []*DeviceInfo
+
+	Envs map[string]string // NOTE(review): presumably environment variables for the worker container, keyed by name — confirm
+
+	Mounts []*Mount // host volumes to mount into the container (see Mount)
+
+	Devices []*DeviceSpec // host devices to mount into the container (see DeviceSpec)
+}
+
+// DeviceSpec specifies a host device to mount into a container; empty fields are omitted from its JSON form.
+type DeviceSpec struct {
+	GuestPath string `json:"guestPath,omitempty"` // NOTE(review): presumably the device path as seen inside the container — confirm
+
+	HostPath string `json:"hostPath,omitempty"` // NOTE(review): presumably the device path on the host — confirm
+
+	Permissions string `json:"permissions,omitempty"` // NOTE(review): format is not visible here (possibly cgroup-style "rwm") — confirm
+}
+
+// Mount specifies a host volume to mount into a container.
+// It is meant for locations where device libraries or tools are installed on the host and in the container.
+type Mount struct {
+	GuestPath string `json:"guestPath,omitempty"` // NOTE(review): presumably the mount path inside the container — confirm
+
+	HostPath string `json:"hostPath,omitempty"` // NOTE(review): presumably the source path on the host — confirm
 }
@@ -33,30 +33,30 @@ func init() {
 	utilruntime.Must(tfv1.AddToScheme(scheme))
 }
 
-// APIServer provides CRUD operations for GPU resources
-type APIServer struct {
+// APIClient provides CRUD operations for GPU resources and get/status-update operations for GPUNode resources
+type APIClient struct {
 	client client.Client   // Kubernetes client the methods use to reach the API
 	ctx    context.Context // context handed to the client calls made by the methods
 }
 
-// NewAPIServer creates a new API server instance with an existing client
-func NewAPIServer(ctx context.Context, k8sClient client.Client) *APIServer {
-	return &APIServer{
+// NewAPIClient returns an APIClient that wraps the given k8sClient and keeps ctx for its later client calls
+func NewAPIClient(ctx context.Context, k8sClient client.Client) *APIClient {
+	return &APIClient{
 		client: k8sClient,
 		ctx:    ctx,
 	}
 }
 
-// NewAPIServerFromConfig creates a new API server instance from a rest.Config
-func NewAPIServerFromConfig(ctx context.Context, restConfig *rest.Config) (*APIServer, error) {
+// NewAPIClientFromConfig creates a new API client instance from a rest.Config
+func NewAPIClientFromConfig(ctx context.Context, restConfig *rest.Config) (*APIClient, error) {
 	k8sClient, err := client.New(restConfig, client.Options{
 		Scheme: scheme,
 	})
 	if err != nil {
 		return nil, fmt.Errorf("failed to create Kubernetes client: %w", err)
 	}
 
-	return &APIServer{
+	return &APIClient{
 		client: k8sClient,
 		ctx:    ctx,
 	}, nil
@@ -76,7 +76,7 @@ type GPUInfo struct {
 }
 
 // CreateOrUpdateGPU creates or updates a GPU resource with metadata and status
-func (a *APIServer) CreateOrUpdateGPU(gpuNode *tfv1.GPUNode, info GPUInfo) (*tfv1.GPU, error) {
+func (a *APIClient) CreateOrUpdateGPU(gpuNode *tfv1.GPUNode, info GPUInfo) (*tfv1.GPU, error) {
 	if len(gpuNode.OwnerReferences) == 0 {
 		return nil, fmt.Errorf("GPUNode %s has no owner references", gpuNode.Name)
 	}
@@ -144,7 +144,7 @@ func (a *APIServer) CreateOrUpdateGPU(gpuNode *tfv1.GPUNode, info GPUInfo) (*tfv
 }
 
 // setGPUStatus sets the GPU status fields from GPUInfo
-func (a *APIServer) setGPUStatus(gpu *tfv1.GPU, info GPUInfo) {
+func (a *APIClient) setGPUStatus(gpu *tfv1.GPU, info GPUInfo) {
 	gpu.Status.Capacity = &tfv1.Resource{
 		Vram:   resource.MustParse(fmt.Sprintf("%dMi", info.VRAMBytes/bytesPerMiB)),
 		Tflops: info.TFlops,
@@ -171,7 +171,7 @@ func (a *APIServer) setGPUStatus(gpu *tfv1.GPU, info GPUInfo) {
 }
 
 // GetGPU retrieves a GPU resource by UUID
-func (a *APIServer) GetGPU(uuid string) (*tfv1.GPU, error) {
+func (a *APIClient) GetGPU(uuid string) (*tfv1.GPU, error) {
 	gpu := &tfv1.GPU{}
 	if err := a.client.Get(a.ctx, client.ObjectKey{Name: uuid}, gpu); err != nil {
 		return nil, fmt.Errorf("failed to get GPU %s: %w", uuid, err)
@@ -180,7 +180,7 @@ func (a *APIServer) GetGPU(uuid string) (*tfv1.GPU, error) {
 }
 
 // ListGPUs lists all GPU resources
-func (a *APIServer) ListGPUs() (*tfv1.GPUList, error) {
+func (a *APIClient) ListGPUs() (*tfv1.GPUList, error) {
 	gpuList := &tfv1.GPUList{}
 	if err := a.client.List(a.ctx, gpuList); err != nil {
 		return nil, fmt.Errorf("failed to list GPUs: %w", err)
@@ -189,7 +189,7 @@ func (a *APIServer) ListGPUs() (*tfv1.GPUList, error) {
 }
 
 // UpdateGPUStatus updates the status of a GPU resource using merge patch
-func (a *APIServer) UpdateGPUStatus(gpu *tfv1.GPU) error {
+func (a *APIClient) UpdateGPUStatus(gpu *tfv1.GPU) error {
 	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
 		current := &tfv1.GPU{}
 		if err := a.client.Get(a.ctx, client.ObjectKeyFromObject(gpu), current); err != nil {
@@ -203,7 +203,7 @@ func (a *APIServer) UpdateGPUStatus(gpu *tfv1.GPU) error {
 }
 
 // patchGPUStatus patches a specific GPU status field using a function
-func (a *APIServer) patchGPUStatus(uuid string, updateFn func(*tfv1.GPU)) error {
+func (a *APIClient) patchGPUStatus(uuid string, updateFn func(*tfv1.GPU)) error {
 	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
 		gpu, err := a.GetGPU(uuid)
 		if err != nil {
@@ -217,21 +217,21 @@ func (a *APIServer) patchGPUStatus(uuid string, updateFn func(*tfv1.GPU)) error
 }
 
 // UpdateGPUAvailableResources sets Status.Available of the GPU named uuid to available via patchGPUStatus
-func (a *APIServer) UpdateGPUAvailableResources(uuid string, available *tfv1.Resource) error {
+func (a *APIClient) UpdateGPUAvailableResources(uuid string, available *tfv1.Resource) error {
 	return a.patchGPUStatus(uuid, func(gpu *tfv1.GPU) {
 		gpu.Status.Available = available
 	})
 }
 
 // UpdateGPUPhase sets Status.Phase of the GPU named uuid to phase via patchGPUStatus
-func (a *APIServer) UpdateGPUPhase(uuid string, phase tfv1.TensorFusionGPUPhase) error {
+func (a *APIClient) UpdateGPUPhase(uuid string, phase tfv1.TensorFusionGPUPhase) error {
 	return a.patchGPUStatus(uuid, func(gpu *tfv1.GPU) {
 		gpu.Status.Phase = phase
 	})
 }
 
 // GetGPUNode retrieves a GPUNode resource by name
-func (a *APIServer) GetGPUNode(name string) (*tfv1.GPUNode, error) {
+func (a *APIClient) GetGPUNode(name string) (*tfv1.GPUNode, error) {
 	gpuNode := &tfv1.GPUNode{}
 	if err := a.client.Get(a.ctx, client.ObjectKey{Name: name}, gpuNode); err != nil {
 		return nil, fmt.Errorf("failed to get GPUNode %s: %w", name, err)
@@ -240,7 +240,7 @@ func (a *APIServer) GetGPUNode(name string) (*tfv1.GPUNode, error) {
 }
 
 // UpdateGPUNodeStatus updates the status of a GPUNode resource
-func (a *APIServer) UpdateGPUNodeStatus(
+func (a *APIClient) UpdateGPUNodeStatus(
 	gpuNode *tfv1.GPUNode,
 	totalTFlops, totalVRAM resource.Quantity,
 	totalGPUs int32,
@@ -259,7 +259,7 @@ func (a *APIServer) UpdateGPUNodeStatus(
 }
 
 // updateGPUNodeStatus updates GPUNode status fields
-func (a *APIServer) updateGPUNodeStatus(
+func (a *APIClient) updateGPUNodeStatus(
 	status *tfv1.GPUNodeStatus,
 	totalTFlops, totalVRAM resource.Quantity,
 	totalGPUs int32,
@@ -277,7 +277,7 @@ func (a *APIServer) updateGPUNodeStatus(
 }
 
 // DeleteGPU deletes a GPU resource
-func (a *APIServer) DeleteGPU(uuid string) error {
+func (a *APIClient) DeleteGPU(uuid string) error {
 	gpu := &tfv1.GPU{
 		ObjectMeta: metav1.ObjectMeta{
 			Name: uuid,