@@ -21,10 +21,12 @@ import (
2121 "os"
2222 "strings"
2323 "testing"
24+ "time"
2425
2526 . "github.com/onsi/gomega"
2627 . "github.com/project-codeflare/codeflare-common/support"
2728 rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
29+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2830)
2931
3032func TestRayFinetuneLlmDeepspeedDemo (t * testing.T ) {
@@ -55,40 +57,39 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int) {
5557 "server = ''" : fmt .Sprintf ("server = '%s'" , GetOpenShiftApiUrl (test )),
5658 "namespace='ray-finetune-llm-deepspeed'" : fmt .Sprintf ("namespace='%s'" , namespace .Name ),
5759 "head_cpus=16" : "head_cpus=2" ,
58- "head_gpus =1" : "head_gpus =0" ,
60+ "head_extended_resource_requests =1" : "head_extended_resource_requests =0" ,
5961 "num_workers=7" : "num_workers=1" ,
60- "min_cpus =16" : "min_cpus =4" ,
61- "max_cpus =16" : "max_cpus =4" ,
62- "min_memory =128" : "min_memory=48 " ,
63- "max_memory =256" : "max_memory=48 " ,
62+ "worker_cpu_requests =16" : "worker_cpu_requests =4" ,
63+ "worker_cpu_limits =16" : "worker_cpu_limits =4" ,
64+ "worker_memory_requests =128" : "worker_memory_requests=60 " ,
65+ "worker_memory_limits =256" : "worker_memory_limits=60 " ,
6466 "head_memory=128" : "head_memory=48" ,
65- "num_gpus=1" : fmt .Sprintf ("worker_extended_resource_requests={'nvidia.com/gpu': %d},\\ n\" ,\n \t \" write_to_file=True,\\ n\" ,\n \t \" verify_tls=False" , numGpus ),
66- "image='quay.io/rhoai/ray:2.23.0-py39-cu121'" : fmt .Sprintf ("image='%s'" , GetRayImage ()),
67- "client = cluster.job_client" : "ray_dashboard = cluster.cluster_dashboard_uri()\\ n\" ,\n \t \" header = {\\ \" Authorization\\ \" : \\ \" Bearer " + userToken + "\\ \" }\\ n\" ,\n \t \" client = RayJobClient(address=ray_dashboard, headers=header, verify=False)\\ n" ,
68- "--num-devices=8" : fmt .Sprintf ("--num-devices=%d" , numGpus ),
69- "--num-epochs=3" : fmt .Sprintf ("--num-epochs=%d" , 1 ),
70- "--ds-config=./deepspeed_configs/zero_3_llama_2_7b.json" : "--ds-config=./zero_3_llama_2_7b.json \\ \" \\ n\" ,\n \t \" \\ \" --lora-config=./lora.json \\ \" \\ n\" ,\n \t \" \\ \" --as-test" ,
71- "'pip': 'requirements.txt'" : "'pip': '/opt/app-root/src/requirements.txt'" ,
72- "'working_dir': './'" : "'working_dir': '/opt/app-root/src'" ,
73- "client.stop_job(submission_id)" : "finished = False\\ n\" ,\n \t \" while not finished:\\ n\" ,\n \t \" time.sleep(1)\\ n\" ,\n \t \" status = client.get_job_status(submission_id)\\ n\" ,\n \t \" finished = (status == \\ \" SUCCEEDED\\ \" )\\ n\" ,\n \t \" if finished:\\ n\" ,\n \t \" print(\\ \" Job completed Successfully !\\ \" )\\ n\" ,\n \t \" else:\\ n\" ,\n \t \" print(\\ \" Job failed !\\ \" )\\ n\" ,\n \t \" time.sleep(10)\\ n" ,
67+ "client = cluster.job_client" : "ray_dashboard = cluster.cluster_dashboard_uri()\\ n\" ,\n \t \" header = {\\ \" Authorization\\ \" : \\ \" Bearer " + userToken + "\\ \" }\\ n\" ,\n \t \" client = RayJobClient(address=ray_dashboard, headers=header, verify=False)\\ n" ,
68+ "--num-devices=8" : fmt .Sprintf ("--num-devices=%d" , numGpus ),
69+ "--num-epochs=3" : fmt .Sprintf ("--num-epochs=%d" , 1 ),
70+ "--ds-config=./deepspeed_configs/zero_3_offload_optim+param.json" : "--ds-config=./zero_3_offload_optim_param.json \\ \" \\ n\" ,\n \t \" \\ \" --lora-config=./lora.json \\ \" \\ n\" ,\n \t \" \\ \" --as-test" ,
71+ "'pip': 'requirements.txt'" : "'pip': '/opt/app-root/src/requirements.txt'" ,
72+ "'working_dir': './'" : "'working_dir': '/opt/app-root/src'" ,
73+ "client.stop_job(submission_id)" : "finished = False\\ n\" ,\n \t \" while not finished:\\ n\" ,\n \t \" time.sleep(1)\\ n\" ,\n \t \" status = client.get_job_status(submission_id)\\ n\" ,\n \t \" finished = (status == \\ \" SUCCEEDED\\ \" )\\ n\" ,\n \t \" if finished:\\ n\" ,\n \t \" print(\\ \" Job completed Successfully !\\ \" )\\ n\" ,\n \t \" else:\\ n\" ,\n \t \" print(\\ \" Job failed !\\ \" )\\ n\" ,\n \t \" time.sleep(10)\\ n" ,
7474 }
7575
7676 updatedNotebookContent := string (ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb" ))
7777 for oldValue , newValue := range requiredChangesInNotebook {
7878 updatedNotebookContent = strings .Replace (updatedNotebookContent , oldValue , newValue , - 1 )
7979 }
8080 updatedNotebook := []byte (updatedNotebookContent )
81+ os .WriteFile ("demo.ipynb" , updatedNotebook , 0644 )
8182
8283 // Test configuration
8384 jupyterNotebookConfigMapFileName := "ray_finetune_llm_deepspeed.ipynb"
8485 configMap := map [string ][]byte {
85- jupyterNotebookConfigMapFileName : updatedNotebook ,
86- "ray_finetune_llm_deepspeed.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.py" ),
87- "requirements.txt" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/requirements.txt" ),
88- "create_dataset.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/create_dataset.py" ),
89- "lora.json" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/lora_configs/lora.json" ),
90- "zero_3_llama_2_7b .json" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_7b .json" ),
91- "utils.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/utils.py" ),
86+ jupyterNotebookConfigMapFileName : updatedNotebook ,
87+ "ray_finetune_llm_deepspeed.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.py" ),
88+ "requirements.txt" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/requirements.txt" ),
89+ "create_dataset.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/create_dataset.py" ),
90+ "lora.json" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/lora_configs/lora.json" ),
91+ "zero_3_offload_optim_param .json" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_offload_optim+param .json" ),
92+ "utils.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/utils.py" ),
9293 }
9394
9495 config := CreateConfigMap (test , namespace .Name , configMap )
@@ -111,6 +112,47 @@ func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int) {
111112 ),
112113 )
113114
115+ time .Sleep (30 * time .Second )
116+
117+ // Fetch created raycluster
118+ rayClusterName := "ray"
119+ rayCluster , err := test .Client ().Ray ().RayV1 ().RayClusters (namespace .Name ).Get (test .Ctx (), rayClusterName , metav1.GetOptions {})
120+ test .Expect (err ).ToNot (HaveOccurred ())
121+
122+ // Initialise raycluster client to interact with raycluster to get rayjob details using REST-API
123+ dashboardUrl := GetDashboardUrl (test , namespace , rayCluster )
124+ rayClusterClientConfig := RayClusterClientConfig {Address : dashboardUrl .String (), Client : nil , SkipTlsVerification : true }
125+ rayClient , err := NewRayClusterClient (rayClusterClientConfig , test .Config ().BearerToken )
126+ if err != nil {
127+ test .T ().Errorf ("%s" , err )
128+ }
129+
130+ jobID := GetTestJobId (test , rayClient , dashboardUrl .Host )
131+ test .Expect (jobID ).ToNot (Equal (nil ))
132+
133+ // Wait for the job to be succeeded or failed
134+ var rayJobStatus string
135+ fmt .Printf ("Waiting for job to be Succeeded...\n " )
136+ test .Eventually (func () string {
137+ resp , err := rayClient .GetJobDetails (jobID )
138+ test .Expect (err ).ToNot (HaveOccurred ())
139+ rayJobStatusVal := resp .Status
140+ if rayJobStatusVal == "SUCCEEDED" || rayJobStatusVal == "FAILED" {
141+ fmt .Printf ("JobStatus : %s\n " , rayJobStatusVal )
142+ rayJobStatus = rayJobStatusVal
143+ WriteRayJobAPILogs (test , rayClient , jobID )
144+ return rayJobStatus
145+ }
146+ if rayJobStatus != rayJobStatusVal && rayJobStatusVal != "SUCCEEDED" {
147+ fmt .Printf ("JobStatus : %s...\n " , rayJobStatusVal )
148+ rayJobStatus = rayJobStatusVal
149+ }
150+ return rayJobStatus
151+ }, TestTimeoutDouble , 3 * time .Second ).Should (Or (Equal ("SUCCEEDED" ), Equal ("FAILED" )), "Job did not complete within the expected time" )
152+ // Store job logs in output directory
153+ WriteRayJobAPILogs (test , rayClient , jobID )
154+ test .Expect (rayJobStatus ).To (Equal ("SUCCEEDED" ), "RayJob failed !" )
155+
114156 // Make sure the RayCluster finishes and is deleted
115157 test .Eventually (RayClusters (test , namespace .Name ), TestTimeoutMedium ).
116158 Should (HaveLen (0 ))
0 commit comments