@@ -17,33 +17,86 @@ limitations under the License.
17
17
package odh
18
18
19
19
import (
20
+ "fmt"
20
21
"os"
22
+ "strings"
21
23
"testing"
22
24
"time"
23
25
24
26
. "github.com/onsi/gomega"
25
27
. "github.com/project-codeflare/codeflare-common/support"
26
28
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
29
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27
30
)
28
31
29
- func TestRayFinetuneDemo (t * testing.T ) {
30
- mnistRayLlmFinetune (t , 1 )
32
+ func TestRayFinetuneDemo2 (t * testing.T ) {
33
+ mnistRayLlmFinetune2 (t , 1 )
31
34
}
32
35
33
- func mnistRayLlmFinetune (t * testing.T , numGpus int ) {
36
+ func mnistRayLlmFinetune2 (t * testing.T , numGpus int ) {
34
37
test := With (t )
35
38
36
39
// Create a namespace
37
40
namespace := test .NewTestNamespace ()
38
- var workingDirectory , _ = os .Getwd ()
41
+ var workingDirectory , err = os .Getwd ()
42
+ test .Expect (err ).ToNot (HaveOccurred ())
43
+
44
+ // Define the regular (non-admin) user
45
+ userName := GetNotebookUserName (test )
46
+ userToken := GetNotebookUserToken (test )
47
+
48
+ // Create a role binding that grants the user the "admin" cluster role in the test namespace
49
+ CreateUserRoleBindingWithClusterRole (test , userName , namespace .Name , "admin" )
50
+
51
+ s3BucketName , _ := GetStorageBucketName ()
52
+ s3AccessKeyId , _ := GetStorageBucketAccessKeyId ()
53
+ s3SecretAccessKey , _ := GetStorageBucketSecretKey ()
54
+ s3DefaultRegion , _ := GetStorageBucketDefaultRegion ()
55
+
56
+ // Substitutions applied to the ray_finetune_llm_deepspeed.ipynb example notebook: each key is a snippet in the notebook, each value is its test-specific replacement
57
+ requiredChangesInNotebook := map [string ]string {
58
+ "import os" : "import os,time,sys" ,
59
+ "import sys" : "!cp /opt/app-root/notebooks/* ./" ,
60
+ "from codeflare_sdk.cluster.auth import TokenAuthentication" : "from codeflare_sdk.cluster.auth import TokenAuthentication\\ n\" ,\n \t \" from codeflare_sdk.job import RayJobClient" ,
61
+ "token = ''" : fmt .Sprintf ("token = '%s'" , userToken ),
62
+ "server = ''" : fmt .Sprintf ("server = '%s'" , GetOpenShiftApiUrl (test )),
63
+ "namespace='ray-finetune-llm-deepspeed'" : fmt .Sprintf ("namespace='%s'" , namespace .Name ),
64
+ "head_cpus=16" : "head_cpus=2" ,
65
+ "head_gpus=1" : "head_gpus=0" ,
66
+ "num_workers=7" : "num_workers=1" ,
67
+ "min_cpus=16" : "min_cpus=4" ,
68
+ "max_cpus=16" : "max_cpus=4" ,
69
+ "min_memory=128" : "min_memory=48" ,
70
+ "max_memory=256" : "max_memory=48" ,
71
+ "head_memory=128" : "head_memory=48" ,
72
+ "num_gpus=1" : fmt .Sprintf ("worker_extended_resource_requests={'nvidia.com/gpu': %d},\\ n\" ,\n \t \" write_to_file=True,\\ n\" ,\n \t \" verify_tls=False" , numGpus ),
73
+ "image='quay.io/rhoai/ray:2.23.0-py39-cu121'" : fmt .Sprintf ("image='%s'" , GetRayImage ()),
74
+ "client = cluster.job_client" : "ray_dashboard = cluster.cluster_dashboard_uri()\\ n\" ,\n \t \" header = {\\ \" Authorization\\ \" : \\ \" Bearer " + userToken + "\\ \" }\\ n\" ,\n \t \" client = RayJobClient(address=ray_dashboard, headers=header, verify=False)\\ n" ,
75
+ "s3_bucket = ''" : fmt .Sprintf ("s3_bucket = '%s'" , s3BucketName ),
76
+ "--num-devices=8" : fmt .Sprintf ("--num-devices=%d" , numGpus ),
77
+ "--num-epochs=3" : fmt .Sprintf ("--num-epochs=%d" , 1 ),
78
+ "--ds-config=./deepspeed_configs/zero_3_llama_2_7b.json" : "--ds-config=./zero_3_llama_2_7b.json \\ \" \\ n\" ,\n \t \" \\ \" --lora-config=./lora.json \\ \" \\ n\" ,\n \t \" \\ \" --as-test" ,
79
+ "'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID')" : fmt .Sprintf ("'AWS_ACCESS_KEY_ID': '%s'" , s3AccessKeyId ),
80
+ "'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY')" : fmt .Sprintf ("'AWS_SECRET_ACCESS_KEY': '%s'" , s3SecretAccessKey ),
81
+ "'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION')" : fmt .Sprintf ("'AWS_DEFAULT_REGION': '%s'" , s3DefaultRegion ),
82
+ "'pip': 'requirements.txt'" : "'pip': '/opt/app-root/src/ray_finetune_requirements.txt'" ,
83
+ "'working_dir': './'" : "'working_dir': '/opt/app-root/src'" ,
84
+ "client.stop_job(submission_id)" : "finished = False\\ n\" ,\n \t \" while not finished:\\ n\" ,\n \t \" time.sleep(1)\\ n\" ,\n \t \" status = client.get_job_status(submission_id)\\ n\" ,\n \t \" finished = (status == \\ \" SUCCEEDED\\ \" )\\ n\" ,\n \t \" if finished:\\ n\" ,\n \t \" print(\\ \" Job completed Successfully !\\ \" )\\ n\" ,\n \t \" else:\\ n\" ,\n \t \" print(\\ \" Job failed !\\ \" )\\ n\" ,\n \t \" time.sleep(10)\\ n" ,
85
+ }
86
+
87
+ updatedNotebookContent := string (ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb" ))
88
+ for oldValue , newValue := range requiredChangesInNotebook {
89
+ updatedNotebookContent = strings .Replace (updatedNotebookContent , oldValue , newValue , - 1 )
90
+ }
91
+ updatedNotebook := []byte (updatedNotebookContent )
39
92
40
93
// Test configuration
41
94
jupyterNotebookConfigMapFileName := "ray_finetune_llm_deepspeed.ipynb"
42
95
configMap := map [string ][]byte {
43
96
// Ray LLM fine-tuning (DeepSpeed) notebook
44
- jupyterNotebookConfigMapFileName : ReadFile ( test , "resources/ray_finetune_demo/ray_finetune_llm_deepspeed.ipynb" ) ,
97
+ jupyterNotebookConfigMapFileName : updatedNotebook ,
45
98
"ray_finetune_llm_deepspeed.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.py" ),
46
- "ray_finetune_requirements.txt" : ReadRayFinetuneRequirementsTxt (test ),
99
+ "ray_finetune_requirements.txt" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/requirements.txt" ),
47
100
"create_dataset.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/create_dataset.py" ),
48
101
"lora.json" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/lora_configs/lora.json" ),
49
102
"zero_3_llama_2_7b.json" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_7b.json" ),
@@ -52,13 +105,6 @@ func mnistRayLlmFinetune(t *testing.T, numGpus int) {
52
105
53
106
config := CreateConfigMap (test , namespace .Name , configMap )
54
107
55
- // Define the regular(non-admin) user
56
- userName := GetNotebookUserName (test )
57
- userToken := GetNotebookUserToken (test )
58
-
59
- // Create role binding with Namespace specific admin cluster role
60
- CreateUserRoleBindingWithClusterRole (test , userName , namespace .Name , "admin" )
61
-
62
108
// Create Notebook CR
63
109
createNotebook (test , namespace , userToken , config .Name , jupyterNotebookConfigMapFileName , numGpus )
64
110
@@ -78,30 +124,16 @@ func mnistRayLlmFinetune(t *testing.T, numGpus int) {
78
124
)
79
125
time .Sleep (30 * time .Second )
80
126
81
- jobStatus := ReadJobLogs (test , namespace )
127
+ rayClusters , err := test .Client ().Ray ().RayV1 ().RayClusters (namespace .Name ).List (test .Ctx (), metav1.ListOptions {})
128
+ test .Expect (err ).ToNot (HaveOccurred ())
129
+ test .Expect (len (rayClusters .Items )).To (BeNumerically (">" , 0 ))
130
+ test .Expect (len (rayClusters .Items )).To (Equal (1 ))
131
+ rayCluster := rayClusters .Items [0 ]
132
+
133
+ jobStatus := ReadJobLogs (test , namespace , rayCluster )
82
134
test .Expect (jobStatus ).To (Equal ("SUCCEEDED" ))
83
135
84
136
// Make sure the RayCluster finishes and is deleted
85
137
test .Eventually (RayClusters (test , namespace .Name ), TestTimeoutMedium ).
86
138
Should (HaveLen (0 ))
87
139
}
88
-
89
- func ReadRayFinetuneRequirementsTxt (test Test ) []byte {
90
- // Read the requirements.txt from resources and perform replacements for custom values using go template
91
- props := struct {
92
- PipIndexUrl string
93
- PipTrustedHost string
94
- }{
95
- PipIndexUrl : "--index " + string (GetPipIndexURL ()),
96
- }
97
-
98
- // Provide trusted host only if defined
99
- if len (GetPipTrustedHost ()) > 0 {
100
- props .PipTrustedHost = "--trusted-host " + GetPipTrustedHost ()
101
- }
102
-
103
- template , err := files .ReadFile ("resources/ray_finetune_demo/ray_finetune_requirements.txt" )
104
- test .Expect (err ).NotTo (HaveOccurred ())
105
-
106
- return ParseTemplate (test , template , props )
107
- }
0 commit comments