Update ray-finetune-demo test for the renamed cluster resource parameters and poll the Ray job status until it completes

abhijeet-dhumal · abhijeet-dhumal · commit 81de12f77513 · 2024-08-27T12:59:35.000+05:30
diff --git a/tests/odh/ray_finetune_llm_deepspeed_test.go b/tests/odh/ray_finetune_llm_deepspeed_test.go
@@ -21,10 +21,12 @@ import (
 	"os"
 	"strings"
 	"testing"
+	"time"
 
 	. "github.com/onsi/gomega"
 	. "github.com/project-codeflare/codeflare-common/support"
 	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 )
 
 func TestRayFinetuneLlmDeepspeedDemo(t *testing.T) {
@@ -55,19 +57,17 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int) {
 		"server = ''":                            fmt.Sprintf("server = '%s'", GetOpenShiftApiUrl(test)),
 		"namespace='ray-finetune-llm-deepspeed'": fmt.Sprintf("namespace='%s'", namespace.Name),
 		"head_cpus=16":                           "head_cpus=2",
-		"head_gpus=1":                            "head_gpus=0",
+		"head_extended_resource_requests=1":      "head_extended_resource_requests=0",
 		"num_workers=7":                          "num_workers=1",
-		"min_cpus=16":                            "min_cpus=4",
-		"max_cpus=16":                            "max_cpus=4",
-		"min_memory=128":                         "min_memory=48",
-		"max_memory=256":                         "max_memory=48",
+		"worker_cpu_requests=16":                 "worker_cpu_requests=4",
+		"worker_cpu_limits=16":                   "worker_cpu_limits=4",
+		"worker_memory_requests=128":             "worker_memory_requests=60",
+		"worker_memory_limits=256":               "worker_memory_limits=60",
 		"head_memory=128":                        "head_memory=48",
-		"num_gpus=1":                             fmt.Sprintf("worker_extended_resource_requests={'nvidia.com/gpu': %d},\\n\",\n\t\"    write_to_file=True,\\n\",\n\t\"    verify_tls=False", numGpus),
-		"image='quay.io/rhoai/ray:2.23.0-py39-cu121'":            fmt.Sprintf("image='%s'", GetRayImage()),
-		"client = cluster.job_client":                            "ray_dashboard = cluster.cluster_dashboard_uri()\\n\",\n\t\"header = {\\\"Authorization\\\": \\\"Bearer " + userToken + "\\\"}\\n\",\n\t\"client = RayJobClient(address=ray_dashboard, headers=header, verify=False)\\n",
-		"--num-devices=8":                                        fmt.Sprintf("--num-devices=%d", numGpus),
-		"--num-epochs=3":                                         fmt.Sprintf("--num-epochs=%d", 1),
-		"--ds-config=./deepspeed_configs/zero_3_llama_2_7b.json": "--ds-config=./zero_3_llama_2_7b.json \\\"\\n\",\n\t\"               \\\"--lora-config=./lora.json \\\"\\n\",\n\t\"               \\\"--as-test",
+		"client = cluster.job_client":            "ray_dashboard = cluster.cluster_dashboard_uri()\\n\",\n\t\"header = {\\\"Authorization\\\": \\\"Bearer " + userToken + "\\\"}\\n\",\n\t\"client = RayJobClient(address=ray_dashboard, headers=header, verify=False)\\n",
+		"--num-devices=8":                        fmt.Sprintf("--num-devices=%d", numGpus),
+		"--num-epochs=3":                         fmt.Sprintf("--num-epochs=%d", 1),
+		"--ds-config=./deepspeed_configs/zero_3_llama_2_7b.json": "--ds-config=./zero_3_llama_2_7b.json \\\"\\n\",\n\t\" \\\"--lora-config=./lora.json \\\"\\n\",\n\t\" \\\"--as-test",
 		"'pip': 'requirements.txt'":                              "'pip': '/opt/app-root/src/requirements.txt'",
 		"'working_dir': './'":                                    "'working_dir': '/opt/app-root/src'",
 		"client.stop_job(submission_id)":                         "finished = False\\n\",\n\t\"while not finished:\\n\",\n\t\"    time.sleep(1)\\n\",\n\t\"    status = client.get_job_status(submission_id)\\n\",\n\t\"    finished = (status == \\\"SUCCEEDED\\\")\\n\",\n\t\"if finished:\\n\",\n\t\"    print(\\\"Job completed Successfully !\\\")\\n\",\n\t\"else:\\n\",\n\t\"    print(\\\"Job failed !\\\")\\n\",\n\t\"time.sleep(10)\\n",
@@ -111,6 +111,47 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int) {
 			),
 		)
 
+	time.Sleep(30 * time.Second)
+
+	// Fetch created raycluster
+	rayClusterName := "ray"
+	rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Get(test.Ctx(), rayClusterName, metav1.GetOptions{})
+	test.Expect(err).ToNot(HaveOccurred())
+
+	// Initialise a RayCluster client to fetch RayJob details through the dashboard REST API
+	dashboardUrl := GetDashboardUrl(test, namespace, rayCluster)
+	rayClusterClientConfig := RayClusterClientConfig{Address: dashboardUrl.String(), Client: nil, SkipTlsVerification: true}
+	rayClient, err := NewRayClusterClient(rayClusterClientConfig, test.Config().BearerToken)
+	if err != nil {
+		test.T().Errorf("%s", err)
+	}
+
+	jobID := GetTestJobId(test, rayClient, dashboardUrl.Host)
+	test.Expect(jobID).ToNot(Equal(nil))
+
+	// Wait for the job to reach a terminal state (SUCCEEDED or FAILED)
+	var rayJobStatus string
+	fmt.Printf("Waiting for job to be Succeeded...\n")
+	test.Eventually(func() string {
+		resp, err := rayClient.GetJobDetails(jobID)
+		test.Expect(err).ToNot(HaveOccurred())
+		rayJobStatusVal := resp.Status
+		if rayJobStatusVal == "SUCCEEDED" || rayJobStatusVal == "FAILED" {
+			fmt.Printf("JobStatus : %s\n", rayJobStatusVal)
+			rayJobStatus = rayJobStatusVal
+			WriteRayJobAPILogs(test, rayClient, jobID)
+			return rayJobStatus
+		}
+		if rayJobStatus != rayJobStatusVal && rayJobStatusVal != "SUCCEEDED" {
+			fmt.Printf("JobStatus : %s...\n", rayJobStatusVal)
+			rayJobStatus = rayJobStatusVal
+		}
+		return rayJobStatus
+	}, TestTimeoutDouble, 3*time.Second).Should(Or(Equal("SUCCEEDED"), Equal("FAILED")), "Job did not complete within the expected time")
+	// Store job logs in output directory
+	WriteRayJobAPILogs(test, rayClient, jobID)
+	test.Expect(rayJobStatus).To(Equal("SUCCEEDED"), "RayJob failed !")
+
 	// Make sure the RayCluster finishes and is deleted
 	test.Eventually(RayClusters(test, namespace.Name), TestTimeoutMedium).
 		Should(HaveLen(0))
diff --git a/tests/odh/support.go b/tests/odh/support.go
@@ -20,6 +20,7 @@ import (
 	"embed"
 	"net/http"
 	"net/url"
+	"os"
 
 	. "github.com/onsi/gomega"
 	gomega "github.com/onsi/gomega"
@@ -39,6 +40,13 @@ func ReadFile(t support.Test, fileName string) []byte {
 	return file
 }
 
+// ReadFileExt returns the contents of fileName as read by os.ReadFile.
+// A read error is reported through t.Expect, so no error value is returned.
+func ReadFileExt(t support.Test, fileName string) []byte {
+	t.T().Helper()
+	contents, readErr := os.ReadFile(fileName)
+	t.Expect(readErr).ToNot(gomega.HaveOccurred())
+	return contents
+}
+
 func GetDashboardUrl(test support.Test, namespace *v1.Namespace, rayCluster *rayv1.RayCluster) *url.URL {
 	dashboardName := "ray-dashboard-" + rayCluster.Name
 	test.T().Logf("Raycluster created : %s\n", rayCluster.Name)