Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
a3086eb
fix: extract limiter and accelerator to c ABI
Code2Life Nov 17, 2025
e07ccce
fix: bump deps, inject-container convert from limits
Code2Life Nov 18, 2025
79e5780
Revert "fix: bump deps, inject-container convert from limits"
Code2Life Nov 18, 2025
aa33429
feat: add device controller
Code2Life Nov 18, 2025
8610098
fix: refactor hypervisor
Code2Life Nov 19, 2025
36d40f4
feat: partitioned scheduling
Code2Life Nov 19, 2025
47c047b
fix: support partition allocation in scheduler
Code2Life Nov 20, 2025
5a48fe6
fix: lint issues
Code2Life Nov 20, 2025
3bdb472
fix: unit test issues
Code2Life Nov 20, 2025
b461229
chore: lint
Code2Life Nov 20, 2025
a101ec8
fix: optimize wording
Code2Life Nov 20, 2025
315ded6
fix: update cr info
0x5457 Nov 20, 2025
fed09d4
fix: unit test issues
0x5457 Nov 20, 2025
dcb87f2
fix: update readme
Code2Life Nov 20, 2025
2be5b06
fix: hypervisor debug and public manifests
Code2Life Nov 20, 2025
51f828c
fix: optimize hypervisor pod watcher
Code2Life Nov 21, 2025
9a6c238
fix: partition mode issues, refactor hypervisor
Code2Life Nov 23, 2025
d02d230
fix: compile issues
Code2Life Nov 23, 2025
a603cc5
fix: tui issue
Code2Life Nov 23, 2025
1c2a9b9
fix: hypervisor refactor
Code2Life Nov 23, 2025
a8f8eb7
fix: lint issue
Code2Life Nov 23, 2025
e515e2a
fix: optimize typing
Code2Life Nov 26, 2025
48507d0
fix: optimize hypervisor
Code2Life Nov 27, 2025
096f150
fix: bump deps
Code2Life Nov 28, 2025
1e242d5
fix: hypervisor name mismatch and test case issue
Code2Life Nov 28, 2025
d9b6428
fix: optimize interface
Code2Life Nov 28, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ jobs:
build-args: |
GO_LDFLAGS=-X 'github.com/NexusGPU/tensor-fusion/internal/version.BuildVersion=${{ needs.release.outputs.version }}'

publish_node_discovery_image:
publish_hypervisor_image:
needs:
- release
if: needs.release.outputs.published == 'true' || github.event_name == 'workflow_dispatch'
Expand All @@ -95,7 +95,7 @@ jobs:
- id: meta
uses: docker/metadata-action@v5
with:
images: tensorfusion/tensor-fusion-node-discovery
images: tensorfusion/tensor-fusion-hypervisor
tags: ${{ github.event_name == 'workflow_dispatch' && steps.set_tag.outputs.tag || format('type=semver,pattern={{{{version}}}},value={0}', needs.release.outputs.version) }}

- name: Login to DockerHub
Expand All @@ -104,12 +104,12 @@ jobs:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Build and push node discovery
- name: Build and push hypervisor
uses: docker/build-push-action@v6
with:
context: .
push: true
file: dockerfile/node-discovery.Dockerfile
file: dockerfile/hypervisor.Dockerfile
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
no-cache: true
11 changes: 10 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,13 @@ __debug*
vendor
logs

*.prof
*.prof

provider/build

cmd/hypervisor/hypervisor
*.o

_obj

metrics.log
14 changes: 9 additions & 5 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,18 @@
]
},
{
"name": "Debug Discovery",
"name": "Debug Hypervisor",
"type": "go",
"request": "launch",
"mode": "auto",
"console": "integratedTerminal",
"env": {
"HOSTNAME": "mocknode",
"KUBECONFIG": "~/.kube/config",
"KUBECONFIG": "~/.kube/config-local-studio",
"HYPERVISOR_PORT": "8042",
"GPU_NODE_NAME": "ubuntu",
},
"program": "${workspaceFolder}/cmd/nodediscovery/main.go",
"cwd": "${workspaceFolder}",
"program": "${workspaceFolder}/cmd/hypervisor/main.go",
},
{
"name": "Debug Dev Env Operator",
Expand Down Expand Up @@ -62,7 +65,8 @@
"ENABLE_WEBHOOKS": "false",
"ENABLE_SCHEDULER": "true",
"ENABLE_CR_CONTROLLER": "true",
"NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION": "true"
"NVIDIA_OPERATOR_PROGRESSIVE_MIGRATION": "true",
"IMPERSONATE_SERVICE_ACCOUNT": "system:serviceaccount:tensor-fusion-sys:tensor-fusion-sys"
},
"args": [
"--metrics-path", "${workspaceFolder}/logs/metrics.log",
Expand Down
43 changes: 42 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,24 @@
"apimachinery",
"apimachineryruntime",
"apiruntime",
"apiserver",
"apiutil",
"automount",
"AWSGPU",
"batchv",
"Biren",
"bubbletea",
"BUILDPLATFORM",
"buildx",
"burstable",
"Cambricon",
"CDNA",
"Cerebras",
"certgen",
"certificaterequests",
"certmanager",
"CFLAGS",
"charmbracelet",
"clientcmd",
"clientcmdapi",
"clientgoscheme",
Expand All @@ -45,27 +51,35 @@
"datanode",
"deepcopy",
"defaultbinder",
"deviceplugin",
"dylib",
"eastus",
"envtest",
"essd",
"Eventf",
"eventhandlers",
"evictable",
"featuregate",
"finalizer",
"Finalizers",
"frameworkruntime",
"fsnotify",
"FULLTEXT",
"GOBIN",
"goconst",
"gocyclo",
"goerrors",
"golangci",
"golint",
"Gomega",
"gonic",
"GOPATH",
"gopsutil",
"gorm",
"gosec",
"GPGPU",
"gpuallocator",
"GPUIDs",
"gpunode",
"gpunodeclaim",
"gpunodeclaims",
Expand All @@ -86,8 +100,11 @@
"imageutils",
"indexallocator",
"influxdata",
"Infof",
"internalcache",
"internalqueue",
"intstr",
"IVSHMEM",
"jsonpatch",
"karpenter",
"karpv",
Expand All @@ -99,9 +116,12 @@
"kubescheduler",
"kubeschedulerconfig",
"kustomization",
"libaccelerator",
"libcuda",
"libnvidia",
"lineprotocol",
"lipgloss",
"LOCALBIN",
"mapstructure",
"metav",
"metricsserver",
Expand All @@ -113,26 +133,33 @@
"nindent",
"nodeclaim",
"nodeclassref",
"nodelist",
"noderesources",
"nolint",
"NUMA",
"nvdp",
"Nvlink",
"NVML",
"objs",
"omitempty",
"onsi",
"pids",
"pluginapi",
"podname",
"portallocator",
"Postable",
"printcolumn",
"prometheusagents",
"prometheuses",
"prometheusrules",
"queuesort",
"Radeon",
"RDNA",
"readyz",
"replicaset",
"replicasets",
"rolebinding",
"RTXA",
"runbook",
"runpod",
"samber",
Expand All @@ -145,12 +172,18 @@
"schedv",
"serviceaccount",
"shirou",
"shmem",
"shortuuid",
"statefulset",
"statefulsets",
"stdbool",
"stddef",
"stdint",
"stdlib",
"strategicpatch",
"strategicpatches",
"stretchr",
"strncpy",
"subresource",
"Tabler",
"tensorfusion",
Expand All @@ -165,6 +198,8 @@
"testutil",
"tflops",
"timberio",
"Timeslicing",
"tmpfs",
"Tmpl",
"tokenreviews",
"Tolerations",
Expand All @@ -173,9 +208,15 @@
"utilerrors",
"utilruntime",
"vgpu",
"Warningf",
"webhookcorev",
"workerstate",
"workloadprofiles",
"workqueue",
"Xlarge"
]
],
"files.associations": {
"__locale": "cpp",
"bitset": "cpp"
}
}
20 changes: 20 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,26 @@ build: manifests generate fmt vet ## Build manager binary.
run: manifests generate fmt vet ## Run a controller from your host.
go run ./cmd/main.go

.PHONY: build-provider
build-provider: ## Build accelerator stub library.
	$(MAKE) -C provider stub

.PHONY: build-hypervisor
build-hypervisor: build-provider ## Build hypervisor binary with CGO enabled.
	@PROVIDER_DIR=$$(pwd)/provider; \
	CGO_ENABLED=1 \
	CGO_CFLAGS="-I$$PROVIDER_DIR" \
	go build -o bin/hypervisor ./cmd/hypervisor

.PHONY: build-hypervisor-tui
build-hypervisor-tui: ## Build hypervisor TUI binary.
	go build -o bin/hypervisor-tui ./cmd/hypervisor-tui


.PHONY: clean-cache
clean-cache: ## Clean Go build cache.
	go clean -cache -testcache

# If you wish to build the manager image targeting other platforms you can use the --platform flag.
# (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it.
# More info: https://docs.docker.com/develop/develop-images/build_enhancements/
Expand Down
20 changes: 12 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -57,30 +57,34 @@ Tensor Fusion is a state-of-the-art **GPU virtualization and pooling solution**

- [x] Fractional GPU and flexible oversubscription
- [x] Remote GPU sharing with SOTA GPU-over-IP technology, less than 4% performance loss
- [x] GPU VRAM expansion and hot/warm/cold tiering
- [ ] None NVIDIA GPU/NPU vendor support
- [x] GPU VRAM expansion and hot/cold tiering
- [x] Non-NVIDIA GPU/NPU vendor support

### Pooling & Scheduling & Management

- [x] GPU/NPU pool management in Kubernetes
- [x] GPU-first scheduling and allocation, with single TFlops/MB precision
- [x] GPU node auto provisioning/termination
- [x] GPU-first scheduling and allocation, with 1 TFLOPS, 1% computing power, and 1 MB precision
- [x] GPU node auto provisioning/termination, Karpenter integration
- [x] GPU compaction/bin-packing
- [x] Take full control of GPU allocation with precision targeting by vendor, model, device index, and more
- [x] Seamless onboarding experience for PyTorch, TensorFlow, llama.cpp, vLLM, TensorRT, SGLang, and all popular AI training/serving frameworks
- [x] Seamless migration from existing NVIDIA operator and device-plugin stack
- [x] Centralized Dashboard & Control Plane
- [x] GPU-first autoscaling policies, auto set requests/limits/replicas
- [x] Request multiple vGPUs with group scheduling for large models
- [x] Support different QoS levels
- [x] Hardware partitioned mode isolation like NVIDIA Dynamic MIG
- [x] Support Kubernetes dynamic resource allocation (DRA) API

### Enterprise Features

- [x] GPU live-migration, snapshot and restore GPU context cross cluster
- [ ] AI model registry and preloading, build your own private MaaS(Model-as-a-Service)
- [ ] Advanced auto-scaling policies, scale to zero, rebalance of hot GPUs
- [x] Advanced auto-scaling policies, scale to zero, rebalance of hot GPUs
- [ ] Advanced observability features, detailed metrics & tracing/profiling of CUDA calls
- [ ] Monetize your GPU cluster by multi-tenancy usage measurement & billing report
- [ ] Enterprise level high availability and resilience, support topology aware scheduling, GPU node auto failover etc.
- [ ] Enterprise level security, complete on-premise deployment support
- [x] Monetize your GPU cluster by multi-tenancy usage measurement & billing report
- [x] Enterprise level high availability and resilience, support topology aware scheduling, GPU node auto failover etc.
- [x] Enterprise level security, complete on-premise deployment support
- [ ] Enterprise level compliance, SSO/SAML support, advanced audit, ReBAC control, SOC2 and other compliance reports available

### 🗳️ Platform Support
Expand Down
52 changes: 52 additions & 0 deletions api/v1/gpu_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@ type GPUStatus struct {

UUID string `json:"uuid"`

// +optional
// +kubebuilder:default=soft
IsolationMode IsolationModeType `json:"isolationMode,omitempty"`

// +optional
Index *int32 `json:"index,omitempty"`

Expand All @@ -61,6 +65,16 @@ type GPUStatus struct {

// +optional
RunningApps []*RunningAppDetail `json:"runningApps,omitempty"`

// +optional
// PartitionTemplates contains available partition templates for this GPU (e.g., MIG profiles)
// Reported from discovery, each template has fixed resource allocation
PartitionTemplates []PartitionTemplate `json:"partitionTemplates,omitempty"`

// +optional
// AllocatedPartitions tracks allocated partitions on this GPU
// Key is partitionUUID, value contains template info and allocated resources
AllocatedPartitions map[string]AllocatedPartition `json:"allocatedPartitions,omitempty"`
}

// +kubebuilder:validation:Enum=tensor-fusion;nvidia-device-plugin
Expand Down Expand Up @@ -94,6 +108,44 @@ type PodGPUInfo struct {
QoS QoSLevel `json:"qos,omitempty"`
}

// PartitionTemplate represents a hardware partition template (e.g., an NVIDIA MIG profile).
// Only the template ID and name are stored in GPU status; detailed resource
// information for each template is stored in the public GPU info config.
type PartitionTemplate struct {
	// TemplateID is the unique identifier for this partition template (e.g., "1g.24gb", "4g.94gb").
	TemplateID string `json:"templateId"`

	// Name is a human-readable name for this template.
	Name string `json:"name"`
}

// AllocatedPartition represents an allocated partition on a GPU.
// NOTE(review): this comment (and the PodUID field below) describes the
// AllocatedPartitions map key as the pod UID, but GPUStatus.AllocatedPartitions
// documents the key as partitionUUID — confirm which is authoritative and align
// the two comments.
type AllocatedPartition struct {
	// TemplateID is the template used to create this partition.
	TemplateID string `json:"templateId"`

	// PodUID is the UID of the pod using this partition (used as map key).
	PodUID string `json:"podUid"`

	// PodName is the name of the pod using this partition.
	PodName string `json:"podName"`

	// Namespace is the namespace of the pod using this partition.
	Namespace string `json:"namespace"`

	// AllocatedAt is when this partition was allocated.
	AllocatedAt metav1.Time `json:"allocatedAt"`

	// AllocatedSlotStart is the starting slot position where this partition is allocated.
	// This is the actual hardware slot position (0-based index).
	AllocatedSlotStart *uint32 `json:"allocatedSlotStart,omitempty"`

	// AllocatedSlotEnd is the ending slot position (exclusive) where this partition is allocated.
	// The partition occupies slots [AllocatedSlotStart, AllocatedSlotEnd).
	AllocatedSlotEnd *uint32 `json:"allocatedSlotEnd,omitempty"`
}

// +kubebuilder:validation:Enum=Pending;Provisioning;Running;Unknown;Destroying;Migrating
type TensorFusionGPUPhase string

Expand Down
Loading