Skip to content

Commit 0eb5dc2

Browse files
Updated ray-finetune-test flow to reuse existing demo files from examples directory
1 parent 44c7064 commit 0eb5dc2

File tree

11 files changed

+72
-1248
lines changed

11 files changed

+72
-1248
lines changed

examples/ray-finetune-llm-deepspeed/create_dataset.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,10 @@
22
import json
33
import os
44

5+
cache_dir="../../datasets"
6+
if not os.path.exists(cache_dir):
7+
cache_dir=""
8+
dataset = load_dataset("gsm8k", "main", cache_dir=cache_dir)
59

610
def gsm8k_qa_tokens_template():
711
dataset = load_dataset("gsm8k", "main")

examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@
4242
"# On OpenShift, you can retrieve the token by running `oc whoami -t`,\n",
4343
"# and the server with `oc cluster-info`.\n",
4444
"auth = TokenAuthentication(\n",
45-
" token = \"\",\n",
46-
" server = \"\",\n",
45+
" token = '',\n",
46+
" server = '',\n",
4747
" skip_tls=False\n",
4848
")\n",
4949
"auth.login()"
@@ -125,7 +125,7 @@
125125
"source": [
126126
"# The S3 bucket where to store checkpoint.\n",
127127
"# It can be set manually, otherwise it's retrieved from the configured data connection.\n",
128-
"s3_bucket = \"\"\n",
128+
"s3_bucket = ''\n",
129129
"if not s3_bucket:\n",
130130
" s3_bucket = os.environ.get('AWS_S3_BUCKET')\n",
131131
"assert s3_bucket, \"An S3 bucket must be provided to store checkpoints\""
@@ -152,12 +152,12 @@
152152
" \"--eval-batch-size-per-device=32 \",\n",
153153
" runtime_env={\n",
154154
" \"env_vars\": {\n",
155-
" \"AWS_ACCESS_KEY_ID\": os.environ.get('AWS_ACCESS_KEY_ID'),\n",
156-
" \"AWS_SECRET_ACCESS_KEY\": os.environ.get('AWS_SECRET_ACCESS_KEY'),\n",
157-
" \"AWS_DEFAULT_REGION\": os.environ.get('AWS_DEFAULT_REGION')\n",
155+
" 'AWS_ACCESS_KEY_ID': os.environ.get('AWS_ACCESS_KEY_ID'),\n",
156+
" 'AWS_SECRET_ACCESS_KEY': os.environ.get('AWS_SECRET_ACCESS_KEY'),\n",
157+
" 'AWS_DEFAULT_REGION': os.environ.get('AWS_DEFAULT_REGION')\n",
158158
" },\n",
159-
" \"pip\": \"requirements.txt\",\n",
160-
" \"working_dir\": \"./\",\n",
159+
" 'pip': 'requirements.txt',\n",
160+
" 'working_dir': './',\n",
161161
" \"excludes\": [\"/docs/\", \"*.ipynb\", \"*.md\"]\n",
162162
" },\n",
163163
")\n",

examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -601,6 +601,9 @@ def parse_args():
601601

602602
parser.add_argument("--lora", action="store_true", default=False,
603603
help="If passed, will enable parameter efficient fine-tuning with LoRA.")
604+
605+
parser.add_argument("--lora-config", type=str, default="./lora_configs/lora.json",
606+
help="Lora config json to use.")
604607

605608
parser.add_argument("--num-epochs", type=int, default=1,
606609
help="Number of epochs to train for.")
@@ -669,7 +672,7 @@ def main():
669672

670673
# Add LoRA config if needed
671674
if args.lora:
672-
with open("./lora_configs/lora.json", "r") as json_file:
675+
with open(args.lora_config, "r") as json_file:
673676
lora_config = json.load(json_file)
674677
config["lora_config"] = lora_config
675678

tests/odh/ray_finetune_llm_deepspeed_test.go

Lines changed: 56 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -17,39 +17,27 @@ limitations under the License.
1717
package odh
1818

1919
import (
20+
"fmt"
21+
"os"
22+
"strings"
2023
"testing"
2124

2225
. "github.com/onsi/gomega"
2326
. "github.com/project-codeflare/codeflare-common/support"
2427
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
2528
)
2629

27-
func TestRayFinetuneDemo(t *testing.T) {
28-
mnistRayLlmFinetune(t, 1)
30+
func TestRayFinetuneLlmDeepspeedDemo(t *testing.T) {
31+
rayFinetuneLlmDeepspeed(t, 1)
2932
}
3033

31-
func mnistRayLlmFinetune(t *testing.T, numGpus int) {
34+
func rayFinetuneLlmDeepspeed(t *testing.T, numGpus int) {
3235
test := With(t)
3336

3437
// Create a namespace
3538
namespace := test.NewTestNamespace()
36-
37-
// Test configuration
38-
jupyterNotebookConfigMapFileName := "ray_finetune_llm_deepspeed.ipynb"
39-
40-
// Test configuration
41-
configMap := map[string][]byte{
42-
// MNIST Ray Notebook
43-
jupyterNotebookConfigMapFileName: ReadFile(test, "resources/ray_finetune_demo/ray_finetune_llm_deepspeed.ipynb"),
44-
"ray_finetune_llm_deepspeed.py": ReadFile(test, "resources/ray_finetune_demo/ray_finetune_llm_deepspeed.py"),
45-
"ray_finetune_requirements.txt": ReadRayFinetuneRequirementsTxt(test),
46-
"create_dataset.py": ReadFile(test, "resources/ray_finetune_demo/create_dataset.py"),
47-
"lora.json": ReadFile(test, "resources/ray_finetune_demo/lora.json"),
48-
"zero_3_llama_2_7b.json": ReadFile(test, "resources/ray_finetune_demo/zero_3_llama_2_7b.json"),
49-
"utils.py": ReadFile(test, "resources/ray_finetune_demo/utils.py"),
50-
}
51-
52-
config := CreateConfigMap(test, namespace.Name, configMap)
39+
var workingDirectory, err = os.Getwd()
40+
test.Expect(err).ToNot(HaveOccurred())
5341

5442
// Define the regular(non-admin) user
5543
userName := GetNotebookUserName(test)
@@ -58,6 +46,53 @@ func mnistRayLlmFinetune(t *testing.T, numGpus int) {
5846
// Create role binding with Namespace specific admin cluster role
5947
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
6048

49+
// List the string replacements required in the ray_finetune_llm_deepspeed.ipynb notebook and apply them
50+
requiredChangesInNotebook := map[string]string{
51+
"import os": "import os,time,sys",
52+
"import sys": "!cp /opt/app-root/notebooks/* ./",
53+
"from codeflare_sdk.cluster.auth import TokenAuthentication": "from codeflare_sdk.cluster.auth import TokenAuthentication\\n\",\n\t\"from codeflare_sdk.job import RayJobClient",
54+
"token = ''": fmt.Sprintf("token = '%s'", userToken),
55+
"server = ''": fmt.Sprintf("server = '%s'", GetOpenShiftApiUrl(test)),
56+
"namespace='ray-finetune-llm-deepspeed'": fmt.Sprintf("namespace='%s'", namespace.Name),
57+
"head_cpus=16": "head_cpus=2",
58+
"head_gpus=1": "head_gpus=0",
59+
"num_workers=7": "num_workers=1",
60+
"min_cpus=16": "min_cpus=4",
61+
"max_cpus=16": "max_cpus=4",
62+
"min_memory=128": "min_memory=48",
63+
"max_memory=256": "max_memory=48",
64+
"head_memory=128": "head_memory=48",
65+
"num_gpus=1": fmt.Sprintf("worker_extended_resource_requests={'nvidia.com/gpu': %d},\\n\",\n\t\" write_to_file=True,\\n\",\n\t\" verify_tls=False", numGpus),
66+
"image='quay.io/rhoai/ray:2.23.0-py39-cu121'": fmt.Sprintf("image='%s'", GetRayImage()),
67+
"client = cluster.job_client": "ray_dashboard = cluster.cluster_dashboard_uri()\\n\",\n\t\"header = {\\\"Authorization\\\": \\\"Bearer " + userToken + "\\\"}\\n\",\n\t\"client = RayJobClient(address=ray_dashboard, headers=header, verify=False)\\n",
68+
"--num-devices=8": fmt.Sprintf("--num-devices=%d", numGpus),
69+
"--num-epochs=3": fmt.Sprintf("--num-epochs=%d", 1),
70+
"--ds-config=./deepspeed_configs/zero_3_llama_2_7b.json": "--ds-config=./zero_3_llama_2_7b.json \\\"\\n\",\n\t\" \\\"--lora-config=./lora.json \\\"\\n\",\n\t\" \\\"--as-test",
71+
"'pip': 'requirements.txt'": "'pip': '/opt/app-root/src/requirements.txt'",
72+
"'working_dir': './'": "'working_dir': '/opt/app-root/src'",
73+
"client.stop_job(submission_id)": "finished = False\\n\",\n\t\"while not finished:\\n\",\n\t\" time.sleep(1)\\n\",\n\t\" status = client.get_job_status(submission_id)\\n\",\n\t\" finished = (status == \\\"SUCCEEDED\\\")\\n\",\n\t\"if finished:\\n\",\n\t\" print(\\\"Job completed Successfully !\\\")\\n\",\n\t\"else:\\n\",\n\t\" print(\\\"Job failed !\\\")\\n\",\n\t\"time.sleep(10)\\n",
74+
}
75+
76+
updatedNotebookContent := string(ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb"))
77+
for oldValue, newValue := range requiredChangesInNotebook {
78+
updatedNotebookContent = strings.Replace(updatedNotebookContent, oldValue, newValue, -1)
79+
}
80+
updatedNotebook := []byte(updatedNotebookContent)
81+
82+
// Test configuration
83+
jupyterNotebookConfigMapFileName := "ray_finetune_llm_deepspeed.ipynb"
84+
configMap := map[string][]byte{
85+
jupyterNotebookConfigMapFileName: updatedNotebook,
86+
"ray_finetune_llm_deepspeed.py": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.py"),
87+
"requirements.txt": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/requirements.txt"),
88+
"create_dataset.py": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/create_dataset.py"),
89+
"lora.json": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/lora_configs/lora.json"),
90+
"zero_3_llama_2_7b.json": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_7b.json"),
91+
"utils.py": ReadFileExt(test, workingDirectory+"/../../examples/ray-finetune-llm-deepspeed/utils.py"),
92+
}
93+
94+
config := CreateConfigMap(test, namespace.Name, configMap)
95+
6196
// Create Notebook CR
6297
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
6398

@@ -77,26 +112,6 @@ func mnistRayLlmFinetune(t *testing.T, numGpus int) {
77112
)
78113

79114
// Make sure the RayCluster finishes and is deleted
80-
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutGpuProvisioning).
115+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutMedium).
81116
Should(HaveLen(0))
82117
}
83-
84-
func ReadRayFinetuneRequirementsTxt(test Test) []byte {
85-
// Read the requirements.txt from resources and perform replacements for custom values using go template
86-
props := struct {
87-
PipIndexUrl string
88-
PipTrustedHost string
89-
}{
90-
PipIndexUrl: "--index " + string(GetPipIndexURL()),
91-
}
92-
93-
// Provide trusted host only if defined
94-
if len(GetPipTrustedHost()) > 0 {
95-
props.PipTrustedHost = "--trusted-host " + GetPipTrustedHost()
96-
}
97-
98-
template, err := files.ReadFile("resources/ray_finetune_demo/ray_finetune_requirements.txt")
99-
test.Expect(err).NotTo(HaveOccurred())
100-
101-
return ParseTemplate(test, template, props)
102-
}

tests/odh/resources/ray_finetune_demo/create_dataset.py

Lines changed: 0 additions & 31 deletions
This file was deleted.

tests/odh/resources/ray_finetune_demo/lora.json

Lines changed: 0 additions & 11 deletions
This file was deleted.

0 commit comments

Comments
 (0)