Skip to content

Commit 61d72f9

Browse files
Updated ray-finetune-test flow to reuse existing demo files from examples directory and updated mnist-raytune-hpo script to run in disconnected environment
1 parent 4f8c0bb commit 61d72f9

15 files changed

+260
-457
lines changed

examples/ray-finetune-llm-deepspeed/create_dataset.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,10 @@
22
import json
33
import os
44

5-
datasets_dir="../../datasets"
6-
if os.path.exists(datasets_dir):
7-
dataset = load_dataset("gsm8k", "main", cache_dir=datasets_dir)
8-
else:
9-
dataset = load_dataset("gsm8k", "main")
5+
cache_dir="../../datasets"
6+
if not os.path.exists(cache_dir):
7+
cache_dir=""
8+
dataset = load_dataset("gsm8k", "main", cache_dir=cache_dir)
109

1110
dataset_splits = {"train": dataset["train"], "test": dataset["test"]}
1211

examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
"# On OpenShift, you can retrieve the token by running `oc whoami -t`,\n",
4343
"# and the server with `oc cluster-info`.\n",
4444
"auth = TokenAuthentication(\n",
45-
" token = \"\",\n",
46-
" server = \"\",\n",
45+
" token = '',\n",
46+
" server = '',\n",
4747
" skip_tls=False\n",
4848
")\n",
4949
"auth.login()"
@@ -69,7 +69,7 @@
6969
" head_memory=128,\n",
7070
" head_gpus=1,\n",
7171
" num_gpus=1,\n",
72-
" image=\"quay.io/rhoai/ray:2.23.0-py39-cu121\",\n",
72+
" image='quay.io/rhoai/ray:2.23.0-py39-cu121',\n",
7373
"))"
7474
]
7575
},
@@ -126,7 +126,7 @@
126126
"source": [
127127
"# The S3 bucket where to store checkpoint.\n",
128128
"# It can be set manually, otherwise it's retrieved from the configured data connection.\n",
129-
"s3_bucket = \"\"\n",
129+
"s3_bucket = ''\n",
130130
"if not s3_bucket:\n",
131131
" s3_bucket = os.environ.get('AWS_S3_BUCKET')\n",
132132
"assert s3_bucket, \"An S3 bucket must be provided to store checkpoints\""
@@ -153,12 +153,12 @@
153153
" \"--eval-batch-size-per-device=32 \",\n",
154154
" runtime_env={\n",
155155
" \"env_vars\": {\n",
156-
" \"AWS_ACCESS_KEY_ID\": os.environ.get('AWS_ACCESS_KEY_ID'),\n",
157-
" \"AWS_SECRET_ACCESS_KEY\": os.environ.get('AWS_SECRET_ACCESS_KEY'),\n",
158-
" \"AWS_DEFAULT_REGION\": os.environ.get('AWS_DEFAULT_REGION')\n",
156+
" 'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID'),\n",
157+
" 'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY'),\n",
158+
" 'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION')\n",
159159
" },\n",
160-
" \"pip\": \"requirements.txt\",\n",
161-
" \"working_dir\": \"./\",\n",
160+
" 'pip': 'requirements.txt',\n",
161+
" 'working_dir': './',\n",
162162
" \"excludes\": [\"/docs/\", \"*.ipynb\", \"*.md\"]\n",
163163
" },\n",
164164
")\n",

tests/odh/mnist_ray_test.go

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ func mnistRay(t *testing.T, numGpus int) {
8282

8383
// Test configuration
8484
jupyterNotebookConfigMapFileName := "mnist_ray_mini.ipynb"
85-
mnist := readMnistPy(test)
85+
mnist := readMnistPy(test, "resources/mnist.py")
8686
if numGpus > 0 {
8787
mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"gpu\""), 1)
8888
} else {
@@ -92,7 +92,7 @@ func mnistRay(t *testing.T, numGpus int) {
9292
// MNIST Ray Notebook
9393
jupyterNotebookConfigMapFileName: ReadFile(test, "resources/mnist_ray_mini.ipynb"),
9494
"mnist.py": mnist,
95-
"requirements.txt": readRequirementsTxt(test),
95+
"requirements.txt": ReadFile(test, "resources/requirements.txt"),
9696
})
9797

9898
// Define the regular(non-admin) user
@@ -131,35 +131,21 @@ func mnistRay(t *testing.T, numGpus int) {
131131

132132
time.Sleep(30 * time.Second)
133133

134-
jobStatus := ReadJobLogs(test, namespace)
134+
rayClusters, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).List(test.Ctx(), metav1.ListOptions{})
135+
test.Expect(err).ToNot(HaveOccurred())
136+
test.Expect(len(rayClusters.Items)).To(BeNumerically(">", 0))
137+
test.Expect(len(rayClusters.Items)).To(Equal(1))
138+
rayCluster := rayClusters.Items[0]
139+
140+
jobStatus := ReadJobLogs(test, namespace, rayCluster)
135141
test.Expect(jobStatus).To(Equal("SUCCEEDED"))
136142

137143
// Make sure the RayCluster finishes and is deleted
138144
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
139145
Should(HaveLen(0))
140146
}
141147

142-
func readRequirementsTxt(test Test) []byte {
143-
// Read the requirements.txt from resources and perform replacements for custom values using go template
144-
props := struct {
145-
PipIndexUrl string
146-
PipTrustedHost string
147-
}{
148-
PipIndexUrl: "--index " + string(GetPipIndexURL()),
149-
}
150-
151-
// Provide trusted host only if defined
152-
if len(GetPipTrustedHost()) > 0 {
153-
props.PipTrustedHost = "--trusted-host " + GetPipTrustedHost()
154-
}
155-
156-
template, err := files.ReadFile("resources/requirements.txt")
157-
test.Expect(err).NotTo(HaveOccurred())
158-
159-
return ParseTemplate(test, template, props)
160-
}
161-
162-
func readMnistPy(test Test) []byte {
148+
func readMnistPy(test Test, filePath string) []byte {
163149
// Read the template at filePath from resources and perform replacements for custom values using go template
164150
storage_bucket_endpoint, storage_bucket_endpoint_exists := GetStorageBucketDefaultEndpoint()
165151
storage_bucket_access_key_id, storage_bucket_access_key_id_exists := GetStorageBucketAccessKeyId()
@@ -190,7 +176,7 @@ func readMnistPy(test Test) []byte {
190176
StorageBucketMnistDir: storage_bucket_mnist_dir,
191177
StorageBucketMnistDirExists: storage_bucket_mnist_dir_exists,
192178
}
193-
template, err := files.ReadFile("resources/mnist.py")
179+
template, err := files.ReadFile(filePath)
194180
test.Expect(err).NotTo(HaveOccurred())
195181

196182
return ParseTemplate(test, template, props)

tests/odh/mnist_raytune_hpo_test.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
8181

8282
// Test configuration
8383
jupyterNotebookConfigMapFileName := "mnist_hpo_raytune.ipynb"
84-
mnist_hpo := ReadFile(test, "resources/mnist_hpo.py")
84+
mnist_hpo := readMnistPy(test, "resources/mnist_hpo.py")
8585

8686
if numGpus > 0 {
8787
mnist_hpo = bytes.Replace(mnist_hpo, []byte("gpu_value=\"has to be specified\""), []byte("gpu_value=\"1\""), 1)
@@ -132,7 +132,13 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
132132

133133
time.Sleep(30 * time.Second)
134134

135-
jobStatus := ReadJobLogs(test, namespace)
135+
rayClusters, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).List(test.Ctx(), metav1.ListOptions{})
136+
test.Expect(err).ToNot(HaveOccurred())
137+
test.Expect(len(rayClusters.Items)).To(BeNumerically(">", 0))
138+
test.Expect(len(rayClusters.Items)).To(Equal(1))
139+
rayCluster := rayClusters.Items[0]
140+
141+
jobStatus := ReadJobLogs(test, namespace, rayCluster)
136142
test.Expect(jobStatus).To(Equal("SUCCEEDED"))
137143

138144
// Make sure the RayCluster finishes and is deleted

tests/odh/notebook.go

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -45,29 +45,27 @@ type NotebookProps struct {
4545
NotebookConfigMapFileName string
4646
NotebookPVC string
4747
NumGpus int
48+
PipIndexUrl string
49+
PipTrustedHost string
4850
S3BucketName string
49-
S3BucketNameExists bool
5051
S3AccessKeyId string
51-
S3AccessKeyIdExists bool
5252
S3SecretAccessKey string
53-
S3SecretAccessKeyExists bool
5453
S3DefaultRegion string
5554
}
5655

5756
func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int) {
5857
// Create PVC for Notebook
5958
notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", corev1.ReadWriteOnce)
60-
s3BucketName, exists := GetStorageBucketName()
59+
s3BucketName, s3BucketNameExists := GetStorageBucketName()
6160
s3AccessKeyId, _ := GetStorageBucketAccessKeyId()
6261
s3SecretAccessKey, _ := GetStorageBucketSecretKey()
6362
s3DefaultRegion, _ := GetStorageBucketDefaultRegion()
6463

65-
if !exists {
66-
println("Storage bucket doesn't exists!")
67-
s3BucketName = "\"\""
68-
s3AccessKeyId = "\"\""
69-
s3SecretAccessKey = "\"\""
70-
s3DefaultRegion = "\"\""
64+
if !s3BucketNameExists {
65+
s3BucketName = "''"
66+
s3AccessKeyId = "''"
67+
s3SecretAccessKey = "''"
68+
s3DefaultRegion = "''"
7169
}
7270

7371
// Read the Notebook CR from resources and perform replacements for custom values using go template
@@ -87,6 +85,8 @@ func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, j
8785
S3AccessKeyId: s3AccessKeyId,
8886
S3SecretAccessKey: s3SecretAccessKey,
8987
S3DefaultRegion: s3DefaultRegion,
88+
PipIndexUrl: GetPipIndexURL(),
89+
PipTrustedHost: GetPipTrustedHost(),
9090
}
9191
notebookTemplate, err := files.ReadFile("resources/custom-nb-small.yaml")
9292
test.Expect(err).NotTo(gomega.HaveOccurred())

tests/odh/ray_finetune_llm_deepspeed_test.go

Lines changed: 66 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -17,33 +17,86 @@ limitations under the License.
1717
package odh
1818

1919
import (
20+
"fmt"
2021
"os"
22+
"strings"
2123
"testing"
2224
"time"
2325

2426
. "github.com/onsi/gomega"
2527
. "github.com/project-codeflare/codeflare-common/support"
2628
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
29+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2730
)
2831

29-
func TestRayFinetuneDemo(t *testing.T) {
30-
mnistRayLlmFinetune(t, 1)
32+
func TestRayFinetuneDemo2(t *testing.T) {
33+
mnistRayLlmFinetune2(t, 1)
3134
}
3235

33-
func mnistRayLlmFinetune(t *testing.T, numGpus int) {
36+
func mnistRayLlmFinetune2(t *testing.T, numGpus int) {
3437
test := With(t)
3538

3639
// Create a namespace
3740
namespace := test.NewTestNamespace()
38-
var workingDirectory, _ = os.Getwd()
41+
var workingDirectory, err = os.Getwd()
42+
test.Expect(err).ToNot(HaveOccurred())
43+
44+
// Define the regular(non-admin) user
45+
userName := GetNotebookUserName(test)
46+
userToken := GetNotebookUserToken(test)
47+
48+
// Create role binding with Namespace specific admin cluster role
49+
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
50+
51+
s3BucketName, _ := GetStorageBucketName()
52+
s3AccessKeyId, _ := GetStorageBucketAccessKeyId()
53+
s3SecretAccessKey, _ := GetStorageBucketSecretKey()
54+
s3DefaultRegion, _ := GetStorageBucketDefaultRegion()
55+
56+
// List the changes required in the ray_finetune_llm_deepspeed.ipynb notebook and apply them
57+
requiredChangesInNotebook := map[string]string{
58+
"import os": "import os,time,sys",
59+
"import sys": "!cp /opt/app-root/notebooks/* ./",
60+
"from codeflare_sdk.cluster.auth import TokenAuthentication": "from codeflare_sdk.cluster.auth import TokenAuthentication\\n\",\n\t\"from codeflare_sdk.job import RayJobClient",
61+
"token = ''": fmt.Sprintf("token = '%s'", userToken),
62+
"server = ''": fmt.Sprintf("server = '%s'", GetOpenShiftApiUrl(test)),
63+
"namespace='ray-finetune-llm-deepspeed'": fmt.Sprintf("namespace='%s'", namespace.Name),
64+
"head_cpus=16": "head_cpus=2",
65+
"head_gpus=1": "head_gpus=0",
66+
"num_workers=7": "num_workers=1",
67+
"min_cpus=16": "min_cpus=4",
68+
"max_cpus=16": "max_cpus=4",
69+
"min_memory=128": "min_memory=48",
70+
"max_memory=256": "max_memory=48",
71+
"head_memory=128": "head_memory=48",
72+
"num_gpus=1": fmt.Sprintf("worker_extended_resource_requests={'nvidia.com/gpu': %d},\\n\",\n\t\" write_to_file=True,\\n\",\n\t\" verify_tls=False", numGpus),
73+
"image='quay.io/rhoai/ray:2.23.0-py39-cu121'": fmt.Sprintf("image='%s'", GetRayImage()),
74+
"client = cluster.job_client": "ray_dashboard = cluster.cluster_dashboard_uri()\\n\",\n\t\"header = {\\\"Authorization\\\": \\\"Bearer " + userToken + "\\\"}\\n\",\n\t\"client = RayJobClient(address=ray_dashboard, headers=header, verify=False)\\n",
75+
"s3_bucket = ''": fmt.Sprintf("s3_bucket = '%s'", s3BucketName),
76+
"--num-devices=8": fmt.Sprintf("--num-devices=%d", numGpus),
77+
"--num-epochs=3": fmt.Sprintf("--num-epochs=%d", 1),
78+
"--ds-config=./deepspeed_configs/zero_3_llama_2_7b.json": "--ds-config=./zero_3_llama_2_7b.json \\\"\\n\",\n\t\" \\\"--lora-config=./lora.json \\\"\\n\",\n\t\" \\\"--as-test",
79+
"'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID')": fmt.Sprintf("'AWS_ACCESS_KEY_ID': '%s'", s3AccessKeyId),
80+
"'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY')": fmt.Sprintf("'AWS_SECRET_ACCESS_KEY': '%s'", s3SecretAccessKey),
81+
"'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION')": fmt.Sprintf("'AWS_DEFAULT_REGION': '%s'", s3DefaultRegion),
82+
"'pip': 'requirements.txt'": "'pip': '/opt/app-root/src/ray_finetune_requirements.txt'",
83+
"'working_dir': './'": "'working_dir': '/opt/app-root/src'",
84+
"client.stop_job(submission_id)": "finished = False\\n\",\n\t\"while not finished:\\n\",\n\t\" time.sleep(1)\\n\",\n\t\" status = client.get_job_status(submission_id)\\n\",\n\t\" finished = (status == \\\"SUCCEEDED\\\")\\n\",\n\t\"if finished:\\n\",\n\t\" print(\\\"Job completed Successfully !\\\")\\n\",\n\t\"else:\\n\",\n\t\" print(\\\"Job failed !\\\")\\n\",\n\t\"time.sleep(10)\\n",
85+
}
86+
87+
updatedNotebookContent := string(ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb"))
88+
for oldValue, newValue := range requiredChangesInNotebook {
89+
updatedNotebookContent = strings.Replace(updatedNotebookContent, oldValue, newValue, -1)
90+
}
91+
updatedNotebook := []byte(updatedNotebookContent)
3992

4093
// Test configuration
4194
jupyterNotebookConfigMapFileName := "ray_finetune_llm_deepspeed.ipynb"
4295
configMap := map[string][]byte{
4396
// Ray finetune LLM DeepSpeed notebook
44-
jupyterNotebookConfigMapFileName: ReadFile(test, "resources/ray_finetune_demo/ray_finetune_llm_deepspeed.ipynb"),
97+
jupyterNotebookConfigMapFileName: updatedNotebook,
4598
"ray_finetune_llm_deepspeed.py": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.py"),
46-
"ray_finetune_requirements.txt": ReadRayFinetuneRequirementsTxt(test),
99+
"ray_finetune_requirements.txt": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/requirements.txt"),
47100
"create_dataset.py": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/create_dataset.py"),
48101
"lora.json": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/lora_configs/lora.json"),
49102
"zero_3_llama_2_7b.json": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_7b.json"),
@@ -52,13 +105,6 @@ func mnistRayLlmFinetune(t *testing.T, numGpus int) {
52105

53106
config := CreateConfigMap(test, namespace.Name, configMap)
54107

55-
// Define the regular(non-admin) user
56-
userName := GetNotebookUserName(test)
57-
userToken := GetNotebookUserToken(test)
58-
59-
// Create role binding with Namespace specific admin cluster role
60-
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
61-
62108
// Create Notebook CR
63109
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
64110

@@ -78,30 +124,16 @@ func mnistRayLlmFinetune(t *testing.T, numGpus int) {
78124
)
79125
time.Sleep(30 * time.Second)
80126

81-
jobStatus := ReadJobLogs(test, namespace)
127+
rayClusters, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).List(test.Ctx(), metav1.ListOptions{})
128+
test.Expect(err).ToNot(HaveOccurred())
129+
test.Expect(len(rayClusters.Items)).To(BeNumerically(">", 0))
130+
test.Expect(len(rayClusters.Items)).To(Equal(1))
131+
rayCluster := rayClusters.Items[0]
132+
133+
jobStatus := ReadJobLogs(test, namespace, rayCluster)
82134
test.Expect(jobStatus).To(Equal("SUCCEEDED"))
83135

84136
// Make sure the RayCluster finishes and is deleted
85137
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutMedium).
86138
Should(HaveLen(0))
87139
}
88-
89-
func ReadRayFinetuneRequirementsTxt(test Test) []byte {
90-
// Read the requirements.txt from resources and perform replacements for custom values using go template
91-
props := struct {
92-
PipIndexUrl string
93-
PipTrustedHost string
94-
}{
95-
PipIndexUrl: "--index " + string(GetPipIndexURL()),
96-
}
97-
98-
// Provide trusted host only if defined
99-
if len(GetPipTrustedHost()) > 0 {
100-
props.PipTrustedHost = "--trusted-host " + GetPipTrustedHost()
101-
}
102-
103-
template, err := files.ReadFile("resources/ray_finetune_demo/ray_finetune_requirements.txt")
104-
test.Expect(err).NotTo(HaveOccurred())
105-
106-
return ParseTemplate(test, template, props)
107-
}

tests/odh/resources/custom-nb-small.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ spec:
5151
- name: JUPYTER_NOTEBOOK_PORT
5252
value: "8888"
5353
image: {{.NotebookImage}}
54-
command: ["/bin/sh", "-c", "pip install papermill && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} -p openshift_api_url {{.OpenShiftApiUrl}} -p kubernetes_user_bearer_token {{.KubernetesUserBearerToken}} -p num_gpus {{ .NumGpus }} -p s3_bucket_name {{.S3BucketName}} -p s3_access_key_id {{.S3AccessKeyId}} -p s3_secret_access_key {{.S3SecretAccessKey}} -p s3_default_region {{.S3DefaultRegion}} --log-output && sleep infinity"]
54+
command: ["/bin/sh", "-c", "pip install papermill && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} -p openshift_api_url {{.OpenShiftApiUrl}} -p kubernetes_user_bearer_token {{.KubernetesUserBearerToken}} -p num_gpus {{ .NumGpus }} -p s3_bucket_name {{.S3BucketName}} -p s3_access_key_id {{.S3AccessKeyId}} -p s3_secret_access_key {{.S3SecretAccessKey}} -p s3_default_region {{.S3DefaultRegion}} -p pip_index_url {{.PipIndexUrl}} -p pip_trusted_host {{.PipTrustedHost}} --log-output && sleep infinity"]
5555
# args: ["pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/mcad.ipynb /opt/app-root/src/mcad-out.ipynb" ]
5656
imagePullPolicy: Always
5757
# livenessProbe:
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
torchvision==0.18.0
2+
minio

0 commit comments

Comments
 (0)