fix: update readme, unit test issue (#308)

Code2Life · web-flow · commit 710425a0875a · 2025-08-05T18:39:44.000+08:00
* fix: update readme, unit test issue

* fix: add gen one crd script

* fix: optimize connection prefix

* fix: unit test issue

* fix: add gen crd script

* fix: unit test issue
diff --git a/Makefile b/Makefile
@@ -60,6 +60,10 @@ fmt: ## Run go fmt against code.
 vet: ## Run go vet against code.
 	go vet ./...
 
+.PHONY: one-crd
+one-crd: ## Concatenate all chart CRD manifests into a single YAML file via scripts/generate-crd.sh.
+	bash scripts/generate-crd.sh
+
 .PHONY: test
 test: manifests generate fmt vet envtest ## Run tests.
 	KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" GO_TESTING=true go run github.com/onsi/ginkgo/v2/ginkgo -p -timeout 0 -cover -coverprofile cover.out -r --skip-file ./test/e2e
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
-<p align="center"><a href="javascript:void(0);" target="_blank" rel="noreferrer"><img width="200" src="https://cdn.tensor-fusion.ai/logo.svg" alt="Logo"></a></p>
+<p align="center"><a href="javascript:void(0);" target="_blank" rel="noreferrer"><img width="100%" src="https://cdn.tensor-fusion.ai/logo-banner.png" alt="Logo"></a></p>
 
 <p align="center">
-    <strong><a href="https://tensor-fusion.ai" target="_blank">TensorFusion.AI</a></strong><br/>Next-Generation GPU Virtualization and Pooling for Enterprises<br><b>Less GPUs, More AI Apps.</b>
+    <br /><strong><a href="https://tensor-fusion.ai" target="_blank">TensorFusion.AI</a></strong><br/><b>Less GPUs, More AI Apps.</b>
     <br />
     <a href="https://tensor-fusion.ai/guide/overview"><strong>Explore the docs »</strong></a>
     <br />
@@ -13,12 +13,9 @@
   </p>
 
 
-# ♾️ Tensor Fusion
-
 [![Contributors][contributors-shield]][contributors-url]
 [![Forks][forks-shield]][forks-url]
 [![Stargazers][stars-shield]][stars-url]
-[![Issues][issues-shield]][issues-url]
 [![MIT License][license-shield]][license-url]
 [![LinkedIn][linkedin-shield]][linkedin-url]
 [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/NexusGPU/tensor-fusion)
@@ -27,11 +24,11 @@ Tensor Fusion is a state-of-the-art **GPU virtualization and pooling solution**
 
 ## 🌟 Highlights
 
-#### 📐 Fractional GPU with Single TFlops/MiB Precision
-#### 🔄 Battle-tested GPU-over-IP Remote GPU Sharing 
+#### 📐 Fractional Virtual GPU
+#### 🔄 Remote GPU Sharing over Ethernet/InfiniBand
 #### ⚖️ GPU-first Scheduling and Auto-scaling
-#### 📊 Computing Oversubscription and GPU VRAM Expansion
-#### 🛫 GPU Pooling, Monitoring, Live Migration, AI Model Preloading and more
+#### 📊 GPU Oversubscription and VRAM Expansion
+#### 🛫 GPU Pooling, Monitoring, Live Migration, Model Preloading and more
 
 ## 🎬 Demo
 
@@ -88,27 +85,26 @@ https://cdn.tensor-fusion.ai/GPU_Content_Migration.mp4
 - [x] GPU compaction/bin-packing
 - [x] Seamless onboarding experience for Pytorch, TensorFlow, llama.cpp, vLLM, Tensor-RT, SGlang and all popular AI training/serving frameworks
 - [x] Centralized Dashboard & Control Plane
-- [ ] GPU-first autoscaling policies, auto set requests/limits/replicas
-- [ ] Request multiple vGPUs with group scheduling for large models
-- [ ] Support different QoS levels
+- [x] GPU-first autoscaling policies, auto set requests/limits/replicas
+- [x] Request multiple vGPUs with group scheduling for large models
+- [x] Support different QoS levels
 
 ### Enterprise Features
 
-- [x] GPU live-migration, snapshot/distribute/restore GPU context cross cluster, fastest in the world
+- [x] GPU live-migration, snapshot and restore GPU context across clusters
 - [ ] AI model registry and preloading, build your own private MaaS(Model-as-a-Service)
 - [ ] Advanced auto-scaling policies, scale to zero, rebalance of hot GPUs
 - [ ] Advanced observability features, detailed metrics & tracing/profiling of CUDA calls
 - [ ] Monetize your GPU cluster by multi-tenancy usage measurement & billing report
 - [ ] Enterprise level high availability and resilience, support topology aware scheduling, GPU node auto failover etc.
-- [ ] Enterprise level security, complete on-premise deployment support, encryption in-transit & at-rest
+- [ ] Enterprise level security, complete on-premise deployment support
 - [ ] Enterprise level compliance, SSO/SAML support, advanced audit, ReBAC control, SOC2 and other compliance reports available
 
 ### 🗳️ Platform Support
 
 - [x] Run on Linux Kubernetes clusters
 - [x] Run on Linux VMs or Bare Metal (one-click onboarding to Edge K3S)
-- [x] Run on Windows (Docs not ready, contact us for support)
-- [ ] Run on MacOS (Imagining mount a virtual NVIDIA GPU device on MacOS!)
+- [x] Run on Windows (Not open sourced, contact us for support)
 
 See the [open issues](https://github.com/NexusGPU/tensor-fusion/issues) for a full list of proposed features (and known issues).
 
@@ -131,12 +127,13 @@ Don't forget to give the project a star! Thanks again!
   <img src="https://contrib.rocks/image?repo=NexusGPU/tensor-fusion" alt="contrib.rocks image" />
 </a>
 
-<!-- LICENSE -->
 ## 🔷 License
 
-1. This repo is open sourced with [Apache 2.0 License](./LICENSE), which includes **GPU pooling, scheduling, management features**, you can use it for free and modify it.
-2. **GPU virtualization and GPU-over-IP features** are also free to use as the part of **Community Plan**, the implementation is not fully open sourced
-3. Features mentioned in "**Enterprise Features**" above are paid, **licensed users can automatically unlock these features**.
+1. The [TensorFusion main repo](https://github.com/NexusGPU/tensor-fusion) is open-sourced under the [Apache 2.0 License](./LICENSE) and includes the **GPU pooling, scheduling, and management features**; you can use it for free and customize it as you want.
+2. The [vgpu.rs repo](https://github.com/NexusGPU/vgpu.rs) is open-sourced under the [Apache 2.0 License](./LICENSE) and includes the **Fractional GPU** and **vGPU hypervisor features**; you can use it for free and customize it as you want.
+3. **Advanced GPU virtualization and GPU-over-IP sharing features** are also free to use when **your organization has fewer than 10 GPUs in total**, but the implementation is not fully open-sourced; please [contact us](mailto:support@tensor-fusion.com) for more details.
+4. Features listed under "**Enterprise Features**" above are paid; **licensed users can use these features in the [TensorFusion Console](https://app.tensor-fusion.ai)**.
+5. For large-scale deployments that involve the non-free features in items 3 and 4, please [contact us](mailto:support@tensor-fusion.com); pricing details are available [here](https://tensor-fusion.ai/pricing).
 
 [![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2FNexusGPU%2Ftensor-fusion.svg?type=large&issueType=license)](https://app.fossa.com/projects/git%2Bgithub.com%2FNexusGPU%2Ftensor-fusion?ref=badge_large&issueType=license)
 
diff --git a/internal/constants/env.go b/internal/constants/env.go
@@ -69,8 +69,9 @@ const (
 	LdPreloadFileName = "ld.so.preload"
 	LdPreloadFile     = "/etc/ld.so.preload"
 
-	TFLibsVolumeName      = "tf-libs"
-	TFLibsVolumeMountPath = "/tensor-fusion"
+	TFLibsVolumeName       = "tf-libs"
+	TFLibsVolumeMountPath  = "/tensor-fusion"
+	TFConnectionNamePrefix = "tf-vgpu-"
 
 	HostIPFieldRef       = "status.hostIP"
 	NodeNameFieldRef     = "spec.nodeName"
diff --git a/internal/controller/pod_controller.go b/internal/controller/pod_controller.go
@@ -90,6 +90,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R
 			return ctrl.Result{}, err
 		}
 		delete(pod.Annotations, constants.SetPendingOwnedWorkloadAnnotation)
+		log.Info("Pending owned workload set", "pod", pod.Name, "ownedWorkload", ownedWorkloadName)
 		if err := r.Update(ctx, pod); err != nil {
 			return ctrl.Result{}, err
 		}
diff --git a/internal/controller/pod_controller_test.go b/internal/controller/pod_controller_test.go
@@ -31,6 +31,7 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/utils/ptr"
 	"sigs.k8s.io/controller-runtime/pkg/client"
 	"sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
 )
@@ -158,6 +159,14 @@ var _ = Describe("Pod Controller", func() {
 				},
 			}
 			Expect(k8sClient.Create(ctx, workload)).To(Succeed())
+			Eventually(func() error {
+				updatedWorkload := &tfv1.TensorFusionWorkload{}
+				err := k8sClient.Get(ctx, client.ObjectKeyFromObject(workload), updatedWorkload)
+				if err != nil {
+					return err
+				}
+				return nil
+			}).Should(Succeed())
 
 			clientPod = &corev1.Pod{
 				ObjectMeta: metav1.ObjectMeta{
@@ -191,17 +200,35 @@ var _ = Describe("Pod Controller", func() {
 							},
 						},
 					},
+					TerminationGracePeriodSeconds: ptr.To(int64(0)),
 				},
 			}
 		})
 
 		AfterEach(func() {
 			if workload != nil {
 				_ = k8sClient.Delete(ctx, workload)
+				Eventually(func() error { // poll until Get on the workload returns NotFound
+					return k8sClient.Get(ctx, client.ObjectKeyFromObject(workload), workload)
+				}).Should(Satisfy(errors.IsNotFound))
 			}
 			if clientPod != nil {
 				_ = k8sClient.Delete(ctx, clientPod)
+				Eventually(func() error { // poll until Get on the client pod returns NotFound
+					return k8sClient.Get(ctx, client.ObjectKeyFromObject(clientPod), clientPod)
+				}).Should(Satisfy(errors.IsNotFound))
 			}
+
+			connection := &tfv1.TensorFusionConnection{ // addressed by fixed name/namespace, not by a handle kept from the test
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      "test-connection-pod-controller",
+					Namespace: "default",
+				},
+			}
+			_ = k8sClient.Delete(ctx, connection) // error discarded; presumably the connection does not exist in every test - confirm
+			Eventually(func() error { // poll until Get on the connection returns NotFound
+				return k8sClient.Get(ctx, client.ObjectKeyFromObject(connection), connection)
+			}).Should(Satisfy(errors.IsNotFound))
 		})
 
 		It("should successfully create TensorFusion connection for client pod", func() {
@@ -331,6 +358,14 @@ var _ = Describe("Pod Controller", func() {
 				},
 			}
 			Expect(k8sClient.Create(ctx, workload)).To(Succeed())
+			Eventually(func() error {
+				updatedWorkload := &tfv1.TensorFusionWorkload{}
+				err := k8sClient.Get(ctx, client.ObjectKeyFromObject(workload), updatedWorkload)
+				if err != nil {
+					return err
+				}
+				return nil
+			}).Should(Succeed())
 
 			pod = &corev1.Pod{
 				ObjectMeta: metav1.ObjectMeta{
@@ -351,6 +386,7 @@ var _ = Describe("Pod Controller", func() {
 							Image: "test-image",
 						},
 					},
+					TerminationGracePeriodSeconds: ptr.To(int64(0)),
 				},
 			}
 		})
@@ -426,6 +462,7 @@ var _ = Describe("Pod Controller", func() {
 							},
 						},
 					},
+					TerminationGracePeriodSeconds: ptr.To(int64(0)),
 				},
 			}
 
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
@@ -388,7 +388,11 @@ func assignPodLabelsAndAnnotations(isLocalGPU bool, pod *corev1.Pod, pool *tfv1.
 }
 
 func addConnectionForRemoteFixedReplicaVirtualGPU(pod *corev1.Pod, container *corev1.Container, clientConfig *tfv1.ClientConfig) {
-	connectionName := fmt.Sprintf("%s%s", pod.GenerateName, utils.NewShortID(10))
+	prefix := pod.GenerateName
+	if pod.GenerateName == "" {
+		prefix = pod.Name + constants.TFConnectionNamePrefix
+	}
+	connectionName := fmt.Sprintf("%s%s", prefix, utils.NewShortID(10))
 	connectionNamespace := pod.Namespace
 
 	// metadata TF_POD_NAME and TF_CONNECTION_NAMESPACE
diff --git a/scripts/generate-crd.sh b/scripts/generate-crd.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+CRD_DIR="./charts/tensor-fusion/crds"
+OUTPUT_FILE="./tmp.tensor-fusion-crds.yaml"
+
+echo "Generating combined CRD file..."
+> "$OUTPUT_FILE"
+for file in "$CRD_DIR"/*.yaml; do
+    [ -s "$OUTPUT_FILE" ]
+    cat "$file" >> "$OUTPUT_FILE"
+done
+echo "Generated: $OUTPUT_FILE"

Original file line number	Diff line number	Diff line change
`@@ -90,6 +90,7 @@ func (r *PodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.R`
`90`	`90`	`return ctrl.Result{}, err`
`91`	`91`	`}`
`92`	`92`	`delete(pod.Annotations, constants.SetPendingOwnedWorkloadAnnotation)`
	`93`	`+ log.Info("Pending owned workload set", "pod", pod.Name, "ownedWorkload", ownedWorkloadName)`
`93`	`94`	`if err := r.Update(ctx, pod); err != nil {`
`94`	`95`	`return ctrl.Result{}, err`
`95`	`96`	`}`