@@ -17,11 +17,19 @@ limitations under the License.
17
17
package odh
18
18
19
19
import (
20
+ "crypto/tls"
21
+ "encoding/json"
22
+ "fmt"
23
+ "io"
24
+ "net/http"
25
+ "os"
20
26
"testing"
27
+ "time"
21
28
22
29
. "github.com/onsi/gomega"
23
30
. "github.com/project-codeflare/codeflare-common/support"
24
31
rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
32
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
25
33
)
26
34
27
35
func TestRayFinetuneDemo (t * testing.T ) {
@@ -33,20 +41,19 @@ func mnistRayLlmFinetune(t *testing.T, numGpus int) {
33
41
34
42
// Create a namespace
35
43
namespace := test .NewTestNamespace ()
44
+ var workingDirectory , _ = os .Getwd ()
36
45
37
46
// Test configuration
38
47
jupyterNotebookConfigMapFileName := "ray_finetune_llm_deepspeed.ipynb"
39
-
40
- // Test configuration
41
48
configMap := map [string ][]byte {
42
49
// Ray fine-tune LLM (DeepSpeed) notebook
43
50
jupyterNotebookConfigMapFileName : ReadFile (test , "resources/ray_finetune_demo/ray_finetune_llm_deepspeed.ipynb" ),
44
- "ray_finetune_llm_deepspeed.py" : ReadFile (test , "resources/ray_finetune_demo /ray_finetune_llm_deepspeed.py" ),
51
+ "ray_finetune_llm_deepspeed.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed /ray_finetune_llm_deepspeed.py" ),
45
52
"ray_finetune_requirements.txt" : ReadRayFinetuneRequirementsTxt (test ),
46
- "create_dataset.py" : ReadFile (test , "resources/ray_finetune_demo /create_dataset.py" ),
47
- "lora.json" : ReadFile (test , "resources/ray_finetune_demo /lora.json" ),
48
- "zero_3_llama_2_7b.json" : ReadFile (test , "resources/ray_finetune_demo /zero_3_llama_2_7b.json" ),
49
- "utils.py" : ReadFile (test , "resources/ray_finetune_demo /utils.py" ),
53
+ "create_dataset.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed /create_dataset.py" ),
54
+ "lora.json" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/lora_configs /lora.json" ),
55
+ "zero_3_llama_2_7b.json" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed/deepspeed_configs /zero_3_llama_2_7b.json" ),
56
+ "utils.py" : ReadFileExt (test , workingDirectory + "/../../examples/ray-finetune-llm-deepspeed /utils.py" ),
50
57
}
51
58
52
59
config := CreateConfigMap (test , namespace .Name , configMap )
@@ -75,9 +82,77 @@ func mnistRayLlmFinetune(t *testing.T, numGpus int) {
75
82
ContainElement (WithTransform (RayClusterState , Equal (rayv1 .Ready ))),
76
83
),
77
84
)
85
+ time .Sleep (30 * time .Second )
86
+
87
+ rayClusters , _ := test .Client ().Ray ().RayV1 ().RayClusters (namespace .Name ).List (test .Ctx (), metav1.ListOptions {})
88
+ test .Expect (len (rayClusters .Items )).To (BeNumerically (">" , 0 ))
89
+ dashboardName := "ray-dashboard-" + rayClusters .Items [0 ].Name
90
+ fmt .Printf ("Raycluster created : %s\n " , rayClusters .Items [0 ].Name )
91
+ route := GetRoute (test , namespace .Name , dashboardName )
92
+ hostname := route .Status .Ingress [0 ].Host
93
+
94
+ // Query the Ray dashboard jobs API once and require a non-503 response
95
+ fmt .Printf ("Waiting for Route %s/%s to be available...\n " , route .Namespace , route .Name )
96
+ tr := & http.Transport {
97
+ TLSClientConfig : & tls.Config {InsecureSkipVerify : true },
98
+ Proxy : http .ProxyFromEnvironment ,
99
+ }
100
+ client := & http.Client {Transport : tr }
101
+ req , err := http .NewRequest ("GET" , "https://" + hostname + "/api/jobs/" , nil )
102
+ if err != nil {
103
+ test .T ().Fatal (err )
104
+ }
105
+ req .Header .Add ("Authorization" , "Bearer " + test .Config ().BearerToken )
106
+
107
+ resp , err := client .Do (req )
108
+ test .Expect (err ).ToNot (HaveOccurred ())
109
+ test .Expect (resp .StatusCode ).ToNot (Equal (503 ))
110
+ defer resp .Body .Close ()
111
+ body , err := io .ReadAll (resp .Body )
112
+ test .Expect (err ).ToNot (HaveOccurred ())
113
+
114
+ var resp_json []map [string ]interface {}
115
+ err = json .Unmarshal (body , & resp_json )
116
+ test .Expect (err ).ToNot (HaveOccurred ())
117
+ if len (resp_json ) > 0 {
118
+ fmt .Printf ("Job is submitted in the raycluster!\n Submission-ID : %s\n " , resp_json [0 ]["submission_id" ])
119
+ }
120
+
121
+ var status string
122
+ var prevStatus string
123
+ fmt .Printf ("Waiting for job to be Succeeded...\n " )
124
+ for status != "SUCCEEDED" {
125
+ resp , err := client .Do (req )
126
+ test .Expect (err ).ToNot (HaveOccurred ())
127
+ body , err := io .ReadAll (resp .Body )
128
+ test .Expect (err ).ToNot (HaveOccurred ())
129
+ var result []map [string ]interface {}
130
+ if err := json .Unmarshal (body , & result ); err != nil {
131
+ time .Sleep (2 * time .Second )
132
+ break
133
+ }
134
+ if status , ok := result [0 ]["status" ].(string ); ok {
135
+ if prevStatus != status {
136
+ fmt .Printf ("JobStatus : %s...\n " , status )
137
+ prevStatus = status
138
+ }
139
+ if status == "SUCCEEDED" {
140
+ prevStatus = status
141
+ break
142
+ }
143
+ prevStatus = status
144
+ } else {
145
+ test .T ().Logf ("Status key not found or not a string" )
146
+ }
147
+ time .Sleep (3 * time .Second )
148
+ }
149
+ if prevStatus != "SUCCEEDED" {
150
+ fmt .Printf ("Job failed!" )
151
+ }
152
+ test .Expect (prevStatus ).To (Equal ("SUCCEEDED" ))
78
153
79
154
// Make sure the RayCluster finishes and is deleted
80
- test .Eventually (RayClusters (test , namespace .Name ), TestTimeoutGpuProvisioning ).
155
+ test .Eventually (RayClusters (test , namespace .Name ), TestTimeoutMedium ).
81
156
Should (HaveLen (0 ))
82
157
}
83
158
0 commit comments