Skip to content

Commit 6c84161

Browse files
Added provision to assert job status, updated HPO script for disconnected and removed local-queue parameter usage from cluster configuration to make it optional
1 parent fa7fe64 commit 6c84161

13 files changed

+660
-522
lines changed

tests/kfto/core/kfto_kueue_sft_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
7979
}
8080
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
8181
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
82-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
82+
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
8383

8484
// Create training PyTorch job
8585
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)
@@ -143,7 +143,7 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
143143
}
144144
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
145145
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
146-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
146+
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
147147

148148
// Create first training PyTorch job
149149
tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)

tests/kfto/upgrade/kfto_kueue_sft_upgrade_training_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ func TestSetupPytorchjob(t *testing.T) {
9999
clusterQueue, err = test.Client().Kueue().KueueV1beta1().ClusterQueues().Create(test.Ctx(), clusterQueue, metav1.CreateOptions{})
100100
test.Expect(err).NotTo(HaveOccurred())
101101

102-
localQueue := CreateKueueLocalQueue(test, namespaceName, clusterQueue.Name)
102+
localQueue := CreateKueueLocalQueue(test, namespaceName, clusterQueue.Name, AsDefaultQueue)
103103

104104
// Create training PyTorch job
105105
tuningJob := createPyTorchJob(test, namespaceName, localQueue.Name, *config)

tests/odh/mnist_ray_test.go

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"bytes"
2121
"fmt"
2222
"testing"
23+
"time"
2324

2425
. "github.com/onsi/gomega"
2526
. "github.com/project-codeflare/codeflare-common/support"
@@ -77,11 +78,11 @@ func mnistRay(t *testing.T, numGpus int) {
7778
}
7879
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
7980
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
80-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
81+
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
8182

8283
// Test configuration
8384
jupyterNotebookConfigMapFileName := "mnist_ray_mini.ipynb"
84-
mnist := readMnistPy(test)
85+
mnist := readMnistPy(test, "resources/mnist.py")
8586
if numGpus > 0 {
8687
mnist = bytes.Replace(mnist, []byte("accelerator=\"has to be specified\""), []byte("accelerator=\"gpu\""), 1)
8788
} else {
@@ -91,7 +92,7 @@ func mnistRay(t *testing.T, numGpus int) {
9192
// MNIST Ray Notebook
9293
jupyterNotebookConfigMapFileName: ReadFile(test, "resources/mnist_ray_mini.ipynb"),
9394
"mnist.py": mnist,
94-
"requirements.txt": readRequirementsTxt(test),
95+
"requirements.txt": ReadFile(test, "resources/requirements.txt"),
9596
})
9697

9798
// Define the regular(non-admin) user
@@ -102,7 +103,7 @@ func mnistRay(t *testing.T, numGpus int) {
102103
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
103104

104105
// Create Notebook CR
105-
createNotebook(test, namespace, userToken, localQueue.Name, config.Name, jupyterNotebookConfigMapFileName, numGpus)
106+
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
106107

107108
// Gracefully cleanup Notebook
108109
defer func() {
@@ -111,7 +112,7 @@ func mnistRay(t *testing.T, numGpus int) {
111112
}()
112113

113114
// Make sure the RayCluster is created and running
114-
test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
115+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
115116
Should(
116117
And(
117118
HaveLen(1),
@@ -128,32 +129,20 @@ func mnistRay(t *testing.T, numGpus int) {
128129
),
129130
)
130131

131-
// Make sure the RayCluster finishes and is deleted
132-
test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
133-
Should(HaveLen(0))
134-
}
135-
136-
func readRequirementsTxt(test Test) []byte {
137-
// Read the requirements.txt from resources and perform replacements for custom values using go template
138-
props := struct {
139-
PipIndexUrl string
140-
PipTrustedHost string
141-
}{
142-
PipIndexUrl: "--index " + string(GetPipIndexURL()),
143-
}
132+
time.Sleep(30 * time.Second)
144133

145-
// Provide trusted host only if defined
146-
if len(GetPipTrustedHost()) > 0 {
147-
props.PipTrustedHost = "--trusted-host " + GetPipTrustedHost()
148-
}
134+
rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Get(test.Ctx(), "mnisttest", metav1.GetOptions{})
135+
test.Expect(err).ToNot(HaveOccurred())
149136

150-
template, err := files.ReadFile("resources/requirements.txt")
151-
test.Expect(err).NotTo(HaveOccurred())
137+
jobStatus := ReadJobLogs(test, namespace, rayCluster)
138+
test.Expect(jobStatus).To(Equal("SUCCEEDED"))
152139

153-
return ParseTemplate(test, template, props)
140+
// Make sure the RayCluster finishes and is deleted
141+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
142+
Should(HaveLen(0))
154143
}
155144

156-
func readMnistPy(test Test) []byte {
145+
func readMnistPy(test Test, filePath string) []byte {
157146
// Read the mnist.py from resources and perform replacements for custom values using go template
158147
storage_bucket_endpoint, storage_bucket_endpoint_exists := GetStorageBucketDefaultEndpoint()
159148
storage_bucket_access_key_id, storage_bucket_access_key_id_exists := GetStorageBucketAccessKeyId()
@@ -184,7 +173,7 @@ func readMnistPy(test Test) []byte {
184173
StorageBucketMnistDir: storage_bucket_mnist_dir,
185174
StorageBucketMnistDirExists: storage_bucket_mnist_dir_exists,
186175
}
187-
template, err := files.ReadFile("resources/mnist.py")
176+
template, err := files.ReadFile(filePath)
188177
test.Expect(err).NotTo(HaveOccurred())
189178

190179
return ParseTemplate(test, template, props)

tests/odh/mnist_raytune_hpo_test.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"bytes"
2121
"fmt"
2222
"testing"
23+
"time"
2324

2425
. "github.com/onsi/gomega"
2526
. "github.com/project-codeflare/codeflare-common/support"
@@ -76,11 +77,11 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
7677
}
7778
clusterQueue := CreateKueueClusterQueue(test, cqSpec)
7879
defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
79-
localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
80+
CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)
8081

8182
// Test configuration
8283
jupyterNotebookConfigMapFileName := "mnist_hpo_raytune.ipynb"
83-
mnist_hpo := ReadFile(test, "resources/mnist_hpo.py")
84+
mnist_hpo := readMnistPy(test, "resources/mnist_hpo.py")
8485

8586
if numGpus > 0 {
8687
mnist_hpo = bytes.Replace(mnist_hpo, []byte("gpu_value=\"has to be specified\""), []byte("gpu_value=\"1\""), 1)
@@ -103,7 +104,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
103104
CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
104105

105106
// Create Notebook CR
106-
createNotebook(test, namespace, userToken, localQueue.Name, config.Name, jupyterNotebookConfigMapFileName, numGpus)
107+
createNotebook(test, namespace, userToken, config.Name, jupyterNotebookConfigMapFileName, numGpus)
107108

108109
// Gracefully cleanup Notebook
109110
defer func() {
@@ -112,7 +113,7 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
112113
}()
113114

114115
// Make sure the RayCluster is created and running
115-
test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
116+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
116117
Should(
117118
And(
118119
HaveLen(1),
@@ -128,8 +129,15 @@ func mnistRayTuneHpo(t *testing.T, numGpus int) {
128129
ContainElement(WithTransform(KueueWorkloadAdmitted, BeTrueBecause("Workload failed to be admitted"))),
129130
),
130131
)
132+
time.Sleep(30 * time.Second)
133+
134+
rayCluster, err := test.Client().Ray().RayV1().RayClusters(namespace.Name).Get(test.Ctx(), "mnisthpotest", metav1.GetOptions{})
135+
test.Expect(err).ToNot(HaveOccurred())
136+
137+
jobStatus := ReadJobLogs(test, namespace, rayCluster)
138+
test.Expect(jobStatus).To(Equal("SUCCEEDED"))
131139

132140
// Make sure the RayCluster finishes and is deleted
133-
test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
141+
test.Eventually(RayClusters(test, namespace.Name), TestTimeoutLong).
134142
Should(HaveLen(0))
135143
}

tests/odh/notebook.go

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,16 +41,32 @@ type NotebookProps struct {
4141
OpenDataHubNamespace string
4242
RayImage string
4343
NotebookImage string
44-
LocalQueue string
4544
NotebookConfigMapName string
4645
NotebookConfigMapFileName string
4746
NotebookPVC string
4847
NumGpus int
48+
PipIndexUrl string
49+
PipTrustedHost string
50+
S3BucketName string
51+
S3AccessKeyId string
52+
S3SecretAccessKey string
53+
S3DefaultRegion string
4954
}
5055

51-
func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, localQueue, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int) {
56+
func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, jupyterNotebookConfigMapName, jupyterNotebookConfigMapFileName string, numGpus int) {
5257
// Create PVC for Notebook
5358
notebookPVC := CreatePersistentVolumeClaim(test, namespace.Name, "10Gi", corev1.ReadWriteOnce)
59+
s3BucketName, s3BucketNameExists := GetStorageBucketName()
60+
s3AccessKeyId, _ := GetStorageBucketAccessKeyId()
61+
s3SecretAccessKey, _ := GetStorageBucketSecretKey()
62+
s3DefaultRegion, _ := GetStorageBucketDefaultRegion()
63+
64+
if !s3BucketNameExists {
65+
s3BucketName = "''"
66+
s3AccessKeyId = "''"
67+
s3SecretAccessKey = "''"
68+
s3DefaultRegion = "''"
69+
}
5470

5571
// Read the Notebook CR from resources and perform replacements for custom values using go template
5672
notebookProps := NotebookProps{
@@ -61,11 +77,16 @@ func createNotebook(test Test, namespace *corev1.Namespace, notebookUserToken, l
6177
OpenDataHubNamespace: GetOpenDataHubNamespace(test),
6278
RayImage: GetRayImage(),
6379
NotebookImage: GetNotebookImage(test),
64-
LocalQueue: localQueue,
6580
NotebookConfigMapName: jupyterNotebookConfigMapName,
6681
NotebookConfigMapFileName: jupyterNotebookConfigMapFileName,
6782
NotebookPVC: notebookPVC.Name,
6883
NumGpus: numGpus,
84+
S3BucketName: s3BucketName,
85+
S3AccessKeyId: s3AccessKeyId,
86+
S3SecretAccessKey: s3SecretAccessKey,
87+
S3DefaultRegion: s3DefaultRegion,
88+
PipIndexUrl: GetPipIndexURL(),
89+
PipTrustedHost: GetPipTrustedHost(),
6990
}
7091
notebookTemplate, err := files.ReadFile("resources/custom-nb-small.yaml")
7192
test.Expect(err).NotTo(gomega.HaveOccurred())

tests/odh/resources/custom-nb-small.yaml

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,20 @@ spec:
5050
value: {{.NotebookImage}}
5151
- name: JUPYTER_NOTEBOOK_PORT
5252
value: "8888"
53+
- name: AWS_ACCESS_KEY_ID
54+
value: {{.S3AccessKeyId}}
55+
- name: AWS_SECRET_ACCESS_KEY
56+
value: {{.S3SecretAccessKey}}
57+
- name: AWS_DEFAULT_REGION
58+
value: {{.S3DefaultRegion}}
59+
- name: AWS_S3_BUCKET
60+
value: {{.S3BucketName}}
61+
- name: PIP_INDEX_URL
62+
value: {{.PipIndexUrl}}
63+
- name: PIP_TRUSTED_HOST
64+
value: {{.PipTrustedHost}}
5365
image: {{.NotebookImage}}
54-
command: ["/bin/sh", "-c", "pip install papermill && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} -p local_queue {{.LocalQueue}} -p openshift_api_url {{.OpenShiftApiUrl}} -p kubernetes_user_bearer_token {{.KubernetesUserBearerToken}} -p num_gpus {{ .NumGpus }} --log-output && sleep infinity"]
66+
command: ["/bin/sh", "-c", "pip install papermill && papermill /opt/app-root/notebooks/{{.NotebookConfigMapFileName}} /opt/app-root/src/mcad-out.ipynb -p namespace {{.Namespace}} -p ray_image {{.RayImage}} -p openshift_api_url {{.OpenShiftApiUrl}} -p kubernetes_user_bearer_token {{.KubernetesUserBearerToken}} -p num_gpus {{ .NumGpus }} --log-output && sleep infinity"]
5567
# args: ["pip install papermill && oc login --token=${OCP_TOKEN} --server=${OCP_SERVER} --insecure-skip-tls-verify=true && papermill /opt/app-root/notebooks/mcad.ipynb /opt/app-root/src/mcad-out.ipynb" ]
5668
imagePullPolicy: Always
5769
# livenessProbe:
@@ -158,4 +170,4 @@ spec:
158170
secretName: jupyter-nb-kube-3aadmin-tls
159171
- name: {{.NotebookConfigMapName}}
160172
configMap:
161-
name: {{.NotebookConfigMapName}}
173+
name: {{.NotebookConfigMapName}}
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
torchvision==0.18.0
2+
minio

tests/odh/resources/mnist.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,11 +133,20 @@ def prepare_data(self):
133133
secret_key = "{{.StorageBucketSecretKey}}"
134134
bucket_name = "{{.StorageBucketName}}"
135135

136+
# remove prefix if specified in storage bucket endpoint url
137+
secure = True
138+
if endpoint.startswith("https://"):
139+
endpoint = endpoint[len("https://") :]
140+
elif endpoint.startswith("http://"):
141+
endpoint = endpoint[len("http://") :]
142+
secure = False
143+
136144
client = Minio(
137145
endpoint,
138146
access_key=access_key,
139147
secret_key=secret_key,
140148
cert_check=False,
149+
secure=secure
141150
)
142151

143152
if not os.path.exists(dataset_dir):

0 commit comments

Comments
 (0)