Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
a3086eb
fix: extract limiter and accelerator to c ABI
Code2Life Nov 17, 2025
e07ccce
fix: bump deps, inject-container convert from limits
Code2Life Nov 18, 2025
79e5780
Revert "fix: bump deps, inject-container convert from limits"
Code2Life Nov 18, 2025
aa33429
feat: add device controller
Code2Life Nov 18, 2025
8610098
fix: refactor hypervisor
Code2Life Nov 19, 2025
36d40f4
feat: partitioned scheduling
Code2Life Nov 19, 2025
47c047b
fix: support partition allocation in scheduler
Code2Life Nov 20, 2025
5a48fe6
fix: lint issues
Code2Life Nov 20, 2025
3bdb472
fix: unit test issues
Code2Life Nov 20, 2025
b461229
chore: lint
Code2Life Nov 20, 2025
a101ec8
fix: optimize wording
Code2Life Nov 20, 2025
315ded6
fix: update cr info
0x5457 Nov 20, 2025
fed09d4
fix: unit test issues
0x5457 Nov 20, 2025
dcb87f2
fix: update readme
Code2Life Nov 20, 2025
2be5b06
fix: hypervisor debug and public manifests
Code2Life Nov 20, 2025
51f828c
fix: optimize hypervisor pod watcher
Code2Life Nov 21, 2025
9a6c238
fix: partition mode issues, refactor hypervisor
Code2Life Nov 23, 2025
d02d230
fix: compile issues
Code2Life Nov 23, 2025
a603cc5
fix: tui issue
Code2Life Nov 23, 2025
1c2a9b9
fix: hypervisor refactor
Code2Life Nov 23, 2025
a8f8eb7
fix: lint issue
Code2Life Nov 23, 2025
e515e2a
fix: optimize typing
Code2Life Nov 26, 2025
48507d0
fix: optimize hypervisor
Code2Life Nov 27, 2025
096f150
fix: bump deps
Code2Life Nov 28, 2025
1e242d5
fix: hypervisor name mismatch and test case issue
Code2Life Nov 28, 2025
d9b6428
fix: optimize interface
Code2Life Nov 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ jobs:
build-args: |
GO_LDFLAGS=-X 'github.com/NexusGPU/tensor-fusion/internal/version.BuildVersion=${{ needs.release.outputs.version }}'

publish_node_discovery_image:
publish_hypervisor_image:
needs:
- release
if: needs.release.outputs.published == 'true' || github.event_name == 'workflow_dispatch'
Expand All @@ -95,7 +95,7 @@ jobs:
- id: meta
uses: docker/metadata-action@v5
with:
images: tensorfusion/tensor-fusion-node-discovery
images: tensorfusion/tensor-fusion-hypervisor
tags: ${{ github.event_name == 'workflow_dispatch' && steps.set_tag.outputs.tag || format('type=semver,pattern={{{{version}}}},value={0}', needs.release.outputs.version) }}

- name: Login to DockerHub
Expand All @@ -104,12 +104,12 @@ jobs:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Build and push node discovery
- name: Build and push hypervisor
uses: docker/build-push-action@v6
with:
context: .
push: true
file: dockerfile/node-discovery.Dockerfile
file: dockerfile/hypervisor.Dockerfile
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
no-cache: true
11 changes: 10 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,13 @@ __debug*
vendor
logs

*.prof
*.prof

provider/build

cmd/hypervisor/hypervisor
*.o

_obj

metrics.log
14 changes: 9 additions & 5 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,18 @@
]
},
{
"name": "Debug Discovery",
"name": "Debug Hypervisor",
"type": "go",
"request": "launch",
"mode": "auto",
"console": "integratedTerminal",
"env": {
"HOSTNAME": "mocknode",
"KUBECONFIG": "~/.kube/config",
"KUBECONFIG": "~/.kube/config-local-studio",
"HYPERVISOR_PORT": "8042",
"GPU_NODE_NAME": "ubuntu",
},
"program": "${workspaceFolder}/cmd/nodediscovery/main.go",
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/cmd/hypervisor/main.go",
},
{
"name": "Debug Dev Env Operator",
Expand Down Expand Up @@ -62,7 +65,8 @@
"ENABLE_WEBHOOKS": "false",
"ENABLE_SCHEDULER": "true",
"ENABLE_CR_CONTROLLER": "true",
"NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION": "true"
"NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION": "true",
"IMPERSONATE_SERVICE_ACCOUNT": "system:serviceaccount:tensor-fusion-sys:tensor-fusion-sys"
},
"args": [
"--metrics-path", "${workspaceFolder}/logs/metrics.log",
Expand Down
43 changes: 42 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,24 @@
"apimachinery",
"apimachineryruntime",
"apiruntime",
"apiserver",
"apiutil",
"automount",
"AWSGPU",
"batchv",
"Biren",
"bubbletea",
"BUILDPLATFORM",
"buildx",
"burstable",
"Cambricon",
"CDNA",
"Cerebras",
"certgen",
"certificaterequests",
"certmanager",
"CFLAGS",
"charmbracelet",
"clientcmd",
"clientcmdapi",
"clientgoscheme",
Expand All @@ -45,27 +51,35 @@
"datanode",
"deepcopy",
"defaultbinder",
"deviceplugin",
"dylib",
"eastus",
"envtest",
"essd",
"Eventf",
"eventhandlers",
"evictable",
"featuregate",
"finalizer",
"Finalizers",
"frameworkruntime",
"fsnotify",
"FULLTEXT",
"GOBIN",
"goconst",
"gocyclo",
"goerrors",
"golangci",
"golint",
"Gomega",
"gonic",
"GOPATH",
"gopsutil",
"gorm",
"gosec",
"GPGPU",
"gpuallocator",
"GPUIDs",
"gpunode",
"gpunodeclaim",
"gpunodeclaims",
Expand All @@ -86,8 +100,11 @@
"imageutils",
"indexallocator",
"influxdata",
"Infof",
"internalcache",
"internalqueue",
"intstr",
"IVSHMEM",
"jsonpatch",
"karpenter",
"karpv",
Expand All @@ -99,9 +116,12 @@
"kubescheduler",
"kubeschedulerconfig",
"kustomization",
"libaccelerator",
"libcuda",
"libnvidia",
"lineprotocol",
"lipgloss",
"LOCALBIN",
"mapstructure",
"metav",
"metricsserver",
Expand All @@ -113,26 +133,33 @@
"nindent",
"nodeclaim",
"nodeclassref",
"nodelist",
"noderesources",
"nolint",
"NUMA",
"nvdp",
"Nvlink",
"NVML",
"objs",
"omitempty",
"onsi",
"pids",
"pluginapi",
"podname",
"portallocator",
"Postable",
"printcolumn",
"prometheusagents",
"prometheuses",
"prometheusrules",
"queuesort",
"Radeon",
"RDNA",
"readyz",
"replicaset",
"replicasets",
"rolebinding",
"RTXA",
"runbook",
"runpod",
"samber",
Expand All @@ -145,12 +172,18 @@
"schedv",
"serviceaccount",
"shirou",
"shmem",
"shortuuid",
"statefulset",
"statefulsets",
"stdbool",
"stddef",
"stdint",
"stdlib",
"strategicpatch",
"strategicpatches",
"stretchr",
"strncpy",
"subresource",
"Tabler",
"tensorfusion",
Expand All @@ -165,6 +198,8 @@
"testutil",
"tflops",
"timberio",
"Timeslicing",
"tmpfs",
"Tmpl",
"tokenreviews",
"Tolerations",
Expand All @@ -173,9 +208,15 @@
"utilerrors",
"utilruntime",
"vgpu",
"Warningf",
"webhookcorev",
"workerstate",
"workloadprofiles",
"workqueue",
"Xlarge"
]
],
"files.associations": {
"__locale": "cpp",
"bitset": "cpp"
}
}
20 changes: 20 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,26 @@ build: manifests generate fmt vet ## Build manager binary.
run: manifests generate fmt vet ## Run a controller from your host.
go run ./cmd/main.go

.PHONY: build-provider
build-provider: ## Build accelerator stub library.
	$(MAKE) -C provider stub

.PHONY: build-hypervisor
build-hypervisor: build-provider ## Build hypervisor binary with CGO enabled.
	@PROVIDER_DIR=$$(pwd)/provider; \
	CGO_ENABLED=1 \
	CGO_CFLAGS="-I$$PROVIDER_DIR" \
	go build -o bin/hypervisor ./cmd/hypervisor

.PHONY: build-hypervisor-tui
build-hypervisor-tui: ## Build hypervisor TUI binary.
	go build -o bin/hypervisor-tui ./cmd/hypervisor-tui


.PHONY: clean-cache
clean-cache: ## Clean Go build cache.
	go clean -cache -testcache

# If you wish to build the manager image targeting other platforms you can use the --platform flag.
# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.
# More info: https://docs.docker.com/develop/develop-images/build_enhancements/
Expand Down
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,30 +57,34 @@ Tensor Fusion is a state-of-the-art **GPU virtualization and pooling solution**

- [x] Fractional GPU and flexible oversubscription
- [x] Remote GPU sharing with SOTA GPU-over-IP technology, less than 4% performance loss
- [x] GPU VRAM expansion and hot/warm/cold tiering
- [ ] None NVIDIA GPU/NPU vendor support
- [x] GPU VRAM expansion and hot/cold tiering
- [x] Non-NVIDIA GPU/NPU vendor support

### Pooling & Scheduling & Management

- [x] GPU/NPU pool management in Kubernetes
- [x] GPU-first scheduling and allocation, with single TFlops/MB precision
- [x] GPU node auto provisioning/termination
- [x] GPU-first scheduling and allocation, with 1 TFLOPS, 1% computing power, and 1 MB precision
- [x] GPU node auto provisioning/termination, Karpenter integration
- [x] GPU compaction/bin-packing
- [x] Take full control of GPU allocation with precision targeting by vendor, model, device index, and more
- [x] Seamless onboarding experience for PyTorch, TensorFlow, llama.cpp, vLLM, TensorRT, SGLang, and all popular AI training/serving frameworks
- [x] Seamless migration from existing NVIDIA operator and device-plugin stack
- [x] Centralized Dashboard & Control Plane
- [x] GPU-first autoscaling policies, auto set requests/limits/replicas
- [x] Request multiple vGPUs with group scheduling for large models
- [x] Support different QoS levels
- [x] Hardware partitioned mode isolation like NVIDIA Dynamic MIG
- [x] Support Kubernetes dynamic resource allocation (DRA) API

### Enterprise Features

- [x] GPU live-migration, snapshot and restore GPU context cross cluster
- [ ] AI model registry and preloading, build your own private MaaS(Model-as-a-Service)
- [ ] Advanced auto-scaling policies, scale to zero, rebalance of hot GPUs
- [x] Advanced auto-scaling policies, scale to zero, rebalance of hot GPUs
- [ ] Advanced observability features, detailed metrics & tracing/profiling of CUDA calls
- [ ] Monetize your GPU cluster by multi-tenancy usage measurement & billing report
- [ ] Enterprise level high availability and resilience, support topology aware scheduling, GPU node auto failover etc.
- [ ] Enterprise level security, complete on-premise deployment support
- [x] Monetize your GPU cluster by multi-tenancy usage measurement & billing report
- [x] Enterprise level high availability and resilience, support topology aware scheduling, GPU node auto failover etc.
- [x] Enterprise level security, complete on-premise deployment support
- [ ] Enterprise level compliance, SSO/SAML support, advanced audit, ReBAC control, SOC2 and other compliance reports available

### 🗳️ Platform Support
Expand Down
52 changes: 52 additions & 0 deletions api/v1/gpu_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ type GPUStatus struct {

UUID string `json:"uuid"`

// +optional
// +kubebuilder:default=soft
IsolationMode IsolationModeType `json:"isolationMode,omitempty"`

// +optional
Index *int32 `json:"index,omitempty"`

Expand All @@ -61,6 +65,16 @@ type GPUStatus struct {

// +optional
RunningApps []*RunningAppDetail `json:"runningApps,omitempty"`

// +optional
// PartitionTemplates contains available partition templates for this GPU (e.g., MIG profiles)
// Reported from discovery, each template has fixed resource allocation
PartitionTemplates []PartitionTemplate `json:"partitionTemplates,omitempty"`

// +optional
// AllocatedPartitions tracks allocated partitions on this GPU
// Key is partitionUUID, value contains template info and allocated resources
AllocatedPartitions map[string]AllocatedPartition `json:"allocatedPartitions,omitempty"`
}

// +kubebuilder:validation:Enum=tensor-fusion;nvidia-device-plugin
Expand Down Expand Up @@ -94,6 +108,44 @@ type PodGPUInfo struct {
QoS QoSLevel `json:"qos,omitempty"`
}

// PartitionTemplate represents a hardware partition template (e.g., an NVIDIA MIG profile).
// Only the template ID and name are stored in GPU status; detailed resource
// information for each template is stored in the public GPU info config.
type PartitionTemplate struct {
	// TemplateID is the unique identifier for this partition template (e.g., "1g.24gb", "4g.94gb").
	TemplateID string `json:"templateId"`

	// Name is a human-readable name for this template.
	Name string `json:"name"`
}

// AllocatedPartition represents an allocated partition on a GPU.
// NOTE(review): this comment (and the PodUID field below) describes the
// AllocatedPartitions map key as the pod UID, but GPUStatus.AllocatedPartitions
// documents the key as partitionUUID — confirm which is authoritative and align
// the two comments.
type AllocatedPartition struct {
	// TemplateID is the template used to create this partition.
	TemplateID string `json:"templateId"`

	// PodUID is the UID of the pod using this partition (used as map key).
	PodUID string `json:"podUid"`

	// PodName is the name of the pod using this partition.
	PodName string `json:"podName"`

	// Namespace is the namespace of the pod using this partition.
	Namespace string `json:"namespace"`

	// AllocatedAt is when this partition was allocated.
	AllocatedAt metav1.Time `json:"allocatedAt"`

	// AllocatedSlotStart is the starting slot position where this partition is allocated.
	// This is the actual hardware slot position (0-based index).
	AllocatedSlotStart *uint32 `json:"allocatedSlotStart,omitempty"`

	// AllocatedSlotEnd is the ending slot position (exclusive) where this partition is allocated.
	// The partition occupies slots [AllocatedSlotStart, AllocatedSlotEnd).
	AllocatedSlotEnd *uint32 `json:"allocatedSlotEnd,omitempty"`
}

// +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating
type TensorFusionGPUPhase string

Expand Down
Loading