Add e2e Test for HyperParameter Optimisation with Ray Tune

Srihari1192 · openshift-merge-bot[bot] · commit 53ffaa6ea8cc · 2024-07-26T12:48:17.000Z
diff --git a/tests/odh/mnist_ray_test.go b/tests/odh/mnist_ray_test.go
@@ -189,18 +189,3 @@ func readMnistPy(test Test) []byte {
 
 	return ParseTemplate(test, template, props)
 }
-
-// TODO: This belongs on codeflare-common/support/ray.go
-func rayClusters(t Test, namespace *corev1.Namespace) func(g Gomega) []*rayv1.RayCluster {
-	return func(g Gomega) []*rayv1.RayCluster {
-		rcs, err := t.Client().Ray().RayV1().RayClusters(namespace.Name).List(t.Ctx(), metav1.ListOptions{})
-		g.Expect(err).NotTo(HaveOccurred())
-
-		rcsp := []*rayv1.RayCluster{}
-		for _, v := range rcs.Items {
-			rcsp = append(rcsp, &v)
-		}
-
-		return rcsp
-	}
-}
diff --git a/tests/odh/mnist_raytune_hpo_test.go b/tests/odh/mnist_raytune_hpo_test.go
@@ -0,0 +1,135 @@
+/*
+Copyright 2023.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package odh
+
+import (
+	"bytes"
+	"fmt"
+	"testing"
+
+	. "github.com/onsi/gomega"
+	. "github.com/project-codeflare/codeflare-common/support"
+	rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/kueue/apis/kueue/v1beta1"
+)
+
+func TestMnistRayTuneHpoCpu(t *testing.T) {
+	mnistRayTuneHpo(t, 0)
+}
+
+func TestMnistRayTuneHpoGpu(t *testing.T) {
+	mnistRayTuneHpo(t, 1)
+}
+
+// mnistRayTuneHpo drives the Ray Tune hyperparameter-optimisation scenario end
+// to end: it provisions the Kueue resources, publishes the notebook together
+// with its training script in a ConfigMap, creates a Notebook CR on behalf of a
+// regular user and then follows the RayCluster from creation to deletion.
+//
+// numGpus sizes the "nvidia.com/gpu" quota of the ClusterQueue and is forwarded
+// to createNotebook. Any positive value makes each Ray Tune trial request one
+// GPU; zero keeps the trials on CPU only.
+func mnistRayTuneHpo(t *testing.T, numGpus int) {
+	test := With(t)
+
+	// Creating a namespace
+	namespace := test.NewTestNamespace()
+
+	// Create Kueue resources
+	resourceFlavor := CreateKueueResourceFlavor(test, v1beta1.ResourceFlavorSpec{})
+	// Best-effort cleanup: the result of Delete is intentionally not inspected.
+	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
+	gpuResourceName := corev1.ResourceName("nvidia.com/gpu")
+	cqSpec := v1beta1.ClusterQueueSpec{
+		NamespaceSelector: &metav1.LabelSelector{},
+		ResourceGroups: []v1beta1.ResourceGroup{
+			{
+				CoveredResources: []corev1.ResourceName{corev1.ResourceCPU, corev1.ResourceMemory, gpuResourceName},
+				Flavors: []v1beta1.FlavorQuotas{
+					{
+						Name: v1beta1.ResourceFlavorReference(resourceFlavor.Name),
+						Resources: []v1beta1.ResourceQuota{
+							{
+								Name:         corev1.ResourceCPU,
+								NominalQuota: resource.MustParse("8"),
+							},
+							{
+								Name:         corev1.ResourceMemory,
+								NominalQuota: resource.MustParse("12Gi"),
+							},
+							{
+								Name:         gpuResourceName,
+								NominalQuota: resource.MustParse(fmt.Sprint(numGpus)),
+							},
+						},
+					},
+				},
+			},
+		},
+	}
+	clusterQueue := CreateKueueClusterQueue(test, cqSpec)
+	// Best-effort cleanup: the result of Delete is intentionally not inspected.
+	defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
+	localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
+
+	// Test configuration
+	jupyterNotebookConfigMapFileName := "mnist_hpo_raytune.ipynb"
+	mnistHpo := ReadFile(test, "resources/mnist_hpo.py")
+
+	// The training script ships with a placeholder for the per-trial GPU count.
+	// Verify it is present first: bytes.Replace would otherwise leave the script
+	// untouched without any indication that nothing was substituted.
+	gpuPlaceholder := []byte("gpu_value=\"has to be specified\"")
+	test.Expect(bytes.Contains(mnistHpo, gpuPlaceholder)).
+		To(BeTrueBecause("resources/mnist_hpo.py must contain the gpu_value placeholder"))
+
+	// Each trial asks for one GPU when GPUs are requested, none otherwise.
+	gpusPerTrial := 0
+	if numGpus > 0 {
+		gpusPerTrial = 1
+	}
+	mnistHpo = bytes.Replace(mnistHpo, gpuPlaceholder, []byte(fmt.Sprintf("gpu_value=\"%d\"", gpusPerTrial)), 1)
+
+	config := CreateConfigMap(test, namespace.Name, map[string][]byte{
+		// MNIST Raytune HPO Notebook
+		jupyterNotebookConfigMapFileName: ReadFile(test, "resources/mnist_hpo_raytune.ipynb"),
+		"mnist_hpo.py":                   mnistHpo,
+		"hpo_raytune_requirements.txt":   ReadFile(test, "resources/hpo_raytune_requirements.txt"),
+	})
+
+	// Define the regular(non-admin) user
+	userName := GetNotebookUserName(test)
+	userToken := GetNotebookUserToken(test)
+
+	// Create role binding with Namespace specific admin cluster role
+	CreateUserRoleBindingWithClusterRole(test, userName, namespace.Name, "admin")
+
+	// Create Notebook CR
+	createNotebook(test, namespace, userToken, localQueue.Name, config.Name, jupyterNotebookConfigMapFileName, numGpus)
+
+	// Gracefully cleanup Notebook
+	defer func() {
+		deleteNotebook(test, namespace)
+		test.Eventually(listNotebooks(test, namespace), TestTimeoutMedium).Should(HaveLen(0))
+	}()
+
+	// Make sure the RayCluster is created and running
+	test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
+		Should(
+			And(
+				HaveLen(1),
+				ContainElement(WithTransform(RayClusterState, Equal(rayv1.Ready))),
+			),
+		)
+
+	// Make sure the Workload is created and running
+	test.Eventually(GetKueueWorkloads(test, namespace.Name), TestTimeoutMedium).
+		Should(
+			And(
+				HaveLen(1),
+				ContainElement(WithTransform(KueueWorkloadAdmitted, BeTrueBecause("Workload failed to be admitted"))),
+			),
+		)
+
+	// Make sure the RayCluster finishes and is deleted
+	test.Eventually(rayClusters(test, namespace), TestTimeoutLong).
+		Should(HaveLen(0))
+}
diff --git a/tests/odh/resources/hpo_raytune_requirements.txt b/tests/odh/resources/hpo_raytune_requirements.txt
@@ -0,0 +1 @@
+torchvision==0.18.0
diff --git a/tests/odh/resources/mnist_hpo.py b/tests/odh/resources/mnist_hpo.py
@@ -0,0 +1,145 @@
+import os
+import tempfile
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+from filelock import FileLock
+from torchvision import datasets, transforms
+
+import ray
+from ray import train, tune
+from ray.train import Checkpoint
+from ray.tune.schedulers import AsyncHyperBandScheduler
+
+EPOCH_SIZE = 128
+TEST_SIZE = 64
+
+
+class ConvNet(nn.Module):
+    def __init__(self):
+        super(ConvNet, self).__init__()
+        self.conv1 = nn.Conv2d(1, 3, kernel_size=3)
+        self.fc = nn.Linear(192, 10)
+
+    def forward(self, x):
+        x = F.relu(F.max_pool2d(self.conv1(x), 3))
+        x = x.view(-1, 192)
+        x = self.fc(x)
+        return F.log_softmax(x, dim=1)
+
+
+def train_func(model, optimizer, train_loader, device=None):
+    device = device or torch.device("cpu")
+    model.train()
+    for batch_idx, (data, target) in enumerate(train_loader):
+        if batch_idx * len(data) > EPOCH_SIZE:
+            return
+        data, target = data.to(device), target.to(device)
+        optimizer.zero_grad()
+        output = model(data)
+        loss = F.nll_loss(output, target)
+        loss.backward()
+        optimizer.step()
+
+
+def test_func(model, data_loader, device=None):
+    device = device or torch.device("cpu")
+    model.eval()
+    correct = 0
+    total = 0
+    with torch.no_grad():
+        for batch_idx, (data, target) in enumerate(data_loader):
+            if batch_idx * len(data) > TEST_SIZE:
+                break
+            data, target = data.to(device), target.to(device)
+            outputs = model(data)
+            _, predicted = torch.max(outputs.data, 1)
+            total += target.size(0)
+            correct += (predicted == target).sum().item()
+
+    return correct / total
+
+
+def get_data_loaders(batch_size=128):
+    mnist_transforms = transforms.Compose(
+        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
+    )
+
+    # We add FileLock here because multiple workers will want to
+    # download data, and this may cause overwrites since
+    # DataLoader is not threadsafe.
+    with FileLock(os.path.expanduser("~/data.lock")):
+        train_loader = torch.utils.data.DataLoader(
+            datasets.MNIST(
+                "~/data", train=True, download=True, transform=mnist_transforms
+            ),
+            batch_size=batch_size,
+            shuffle=True,
+        )
+        test_loader = torch.utils.data.DataLoader(
+            datasets.MNIST(
+                "~/data", train=False, download=True, transform=mnist_transforms
+            ),
+            batch_size=batch_size,
+            shuffle=True,
+        )
+    return train_loader, test_loader
+
+
+def train_mnist(config):
+    should_checkpoint = config.get("should_checkpoint", False)
+    use_cuda = torch.cuda.is_available()
+    device = torch.device("cuda" if use_cuda else "cpu")
+    train_loader, test_loader = get_data_loaders()
+    model = ConvNet().to(device)
+
+    optimizer = optim.SGD(
+        model.parameters(), lr=config["lr"], momentum=config["momentum"]
+    )
+
+    while True:
+        train_func(model, optimizer, train_loader, device)
+        acc = test_func(model, test_loader, device)
+        metrics = {"mean_accuracy": acc}
+
+        # Report metrics (and possibly a checkpoint)
+        if should_checkpoint:
+            with tempfile.TemporaryDirectory() as tempdir:
+                torch.save(model.state_dict(), os.path.join(tempdir, "model.pt"))
+                train.report(metrics, checkpoint=Checkpoint.from_directory(tempdir))
+        else:
+            train.report(metrics)
+
+
+if __name__ == "__main__":
+    # for early stopping
+    sched = AsyncHyperBandScheduler()
+    gpu_value="has to be specified"
+    resources_per_trial = {"cpu": 1, "gpu": gpu_value}
+    tuner = tune.Tuner(
+        tune.with_resources(train_mnist, resources=resources_per_trial),
+        tune_config=tune.TuneConfig(
+            metric="mean_accuracy",
+            mode="max",
+            scheduler=sched,
+            num_samples=5,
+        ),
+        run_config=train.RunConfig(
+            name="exp",
+            stop={
+                "mean_accuracy": 0.98,
+                "training_iteration": 5,
+            },
+        ),
+        param_space={
+            "lr": tune.loguniform(1e-4, 1e-2),
+            "momentum": tune.uniform(0.1, 0.9),
+        },
+    )
+    results = tuner.fit()
+
+    print("Best hyperparameters config is:", results.get_best_result().config)
+
+    assert not results.errors
diff --git a/tests/odh/resources/mnist_hpo_raytune.ipynb b/tests/odh/resources/mnist_hpo_raytune.ipynb
diff --git a/tests/odh/support.go b/tests/odh/support.go