diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 086633cf043..d9e7b338cfd 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -32,7 +32,7 @@ bs4 awscliv2==2.1.1 flask spacy==3.4.1 -ray[tune]==2.7.2 +ray[tune]==2.52.1 tensorboard jinja2==3.1.3 pytorch-lightning diff --git a/.gitignore b/.gitignore index 3f1f927ee33..ea478ca180d 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,8 @@ beginner_source/hymenoptera_data/ intermediate_source/data/ *.zip MNIST/ +data/cifar-10-batches-py/* +*.tar.gz #builds _build/ @@ -132,3 +134,4 @@ dictionary.dic # linters /.lintbin + diff --git a/beginner_source/hyperparameter_tuning_tutorial.py b/beginner_source/hyperparameter_tuning_tutorial.py index dd3fe65699e..c1798cca8ac 100644 --- a/beginner_source/hyperparameter_tuning_tutorial.py +++ b/beginner_source/hyperparameter_tuning_tutorial.py @@ -1,44 +1,42 @@ -# -*- coding: utf-8 -*- """ Hyperparameter tuning with Ray Tune =================================== -Hyperparameter tuning can make the difference between an average model and a highly -accurate one. Often simple things like choosing a different learning rate or changing -a network layer size can have a dramatic impact on your model performance. +**Author:** `Ricardo Decal `_ -Fortunately, there are tools that help with finding the best combination of parameters. -`Ray Tune `_ is an industry standard tool for -distributed hyperparameter tuning. Ray Tune includes the latest hyperparameter search -algorithms, integrates with various analysis libraries, and natively -supports distributed training through `Ray's distributed machine learning engine -`_. +This tutorial shows how to integrate Ray Tune into your PyTorch training +workflow to perform scalable and efficient hyperparameter tuning. -In this tutorial, we will show you how to integrate Ray Tune into your PyTorch -training workflow. We will extend `this tutorial from the PyTorch documentation -`_ for training -a CIFAR10 image classifier. +`Ray `__, a project of the +PyTorch Foundation, is an open-source unified framework for scaling AI +and Python applications. It helps run distributed workloads by handling +the complexity of distributed computing. `Ray +Tune `__ is a library +built on Ray for hyperparameter tuning that enables you to scale a +hyperparameter sweep from your machine to a large cluster with no code +changes. -As you will see, we only need to add some slight modifications. In particular, we -need to +This tutorial extends the PyTorch tutorial for training a CIFAR10 image +classifier in the `CIFAR10 tutorial (PyTorch +documentation) `__. +Only minor modifications are needed to adapt the PyTorch tutorial for +Ray Tune. Specifically, this tutorial wraps the data loading and +training in functions, makes some network parameters configurable, adds +optional checkpointing, and defines the search space for model tuning. -1. wrap data loading and training in functions, -2. make some network parameters configurable, -3. add checkpointing (optional), -4. 
and define the search space for the model tuning +Setup +----- -| +To run this tutorial, install the dependencies: -To run this tutorial, please make sure the following packages are -installed: +""" -- ``ray[tune]``: Distributed hyperparameter tuning library -- ``torchvision``: For the data transformers +# %%bash +# pip install "ray[tune]" torchvision + +###################################################################### +# Then start with the imports: -Setup / Imports ---------------- -Let's start with the imports: -""" from functools import partial import os import tempfile @@ -50,33 +48,22 @@ from torch.utils.data import random_split import torchvision import torchvision.transforms as transforms -# sphinx_gallery_start_ignore -# Fixes ``AttributeError: '_LoggingTee' object has no attribute 'fileno'``. -# This is only needed to run with sphinx-build. -import sys -if not hasattr(sys.stdout, "encoding"): - sys.stdout.encoding = "latin1" - sys.stdout.fileno = lambda: 0 -# sphinx_gallery_end_ignore +# New: imports for Ray Tune +import ray from ray import tune -from ray import train -from ray.train import Checkpoint, get_checkpoint +from ray.tune import Checkpoint from ray.tune.schedulers import ASHAScheduler -import ray.cloudpickle as pickle ###################################################################### -# Most of the imports are needed for building the PyTorch model. Only the last -# imports are for Ray Tune. +# How to use PyTorch data loaders with Ray Tune +# --------------------------------------------- # -# Data loaders -# ------------ -# We wrap the data loaders in their own function and pass a global data directory. -# This way we can share a data directory between different trials. - +# Wrap the data loaders in a constructor function. Pass a global data +# directory here to reuse the dataset across different trials. def load_data(data_dir="./data"): transform = transforms.Compose( - [transforms.ToTensor(), transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))] + [transforms.ToTensor(), transforms.Normalize((0.4914, 0.48216, 0.44653), (0.2022, 0.19932, 0.20086))] ) trainset = torchvision.datasets.CIFAR10( @@ -89,18 +76,16 @@ def load_data(data_dir="./data"): return trainset, testset - ###################################################################### -# Configurable neural network -# --------------------------- -# We can only tune those parameters that are configurable. -# In this example, we can specify -# the layer sizes of the fully connected layers: - +# Configure the hyperparameters +# ----------------------------- +# +# In this example, we specify the layer sizes of the fully connected +# layers. class Net(nn.Module): def __init__(self, l1=120, l2=84): - super(Net, self).__init__() + super().__init__() self.conv1 = nn.Conv2d(3, 6, 5) self.pool = nn.MaxPool2d(2, 2) self.conv2 = nn.Conv2d(6, 16, 5) @@ -117,119 +102,133 @@ def forward(self, x): x = self.fc3(x) return x - ###################################################################### -# The train function -# ------------------ -# Now it gets interesting, because we introduce some changes to the example `from the PyTorch -# documentation `_. -# -# We wrap the training script in a function ``train_cifar(config, data_dir=None)``. -# The ``config`` parameter will receive the hyperparameters we would like to -# train with. The ``data_dir`` specifies the directory where we load and store the data, -# so that multiple runs can share the same data source. 
-# We also load the model and optimizer state at the start of the run, if a checkpoint -# is provided. Further down in this tutorial you will find information on how -# to save the checkpoint and what it is used for. +# Use a train function with Ray Tune +# ---------------------------------- +# +# Now it gets interesting, because we introduce some changes to the +# example `from the PyTorch +# documentation `__. +# +# We wrap the training script in a function +# ``train_cifar(config, data_dir=None)``. The ``config`` parameter +# receives the hyperparameters we want to train with. The ``data_dir`` +# specifies the directory where we load and store the data, allowing +# multiple runs to share the same data source. This is especially useful +# in cluster environments where you can mount shared storage (for example +# NFS) to prevent the data from being downloaded to each node separately. +# We also load the model and optimizer state at the start of the run if a +# checkpoint is provided. Further down in this tutorial, you will find +# information on how to save the checkpoint and how it is used. # # .. code-block:: python # -# net = Net(config["l1"], config["l2"]) +# net = Net(config["l1"], config["l2"]) # -# checkpoint = get_checkpoint() -# if checkpoint: -# with checkpoint.as_directory() as checkpoint_dir: -# data_path = Path(checkpoint_dir) / "data.pkl" -# with open(data_path, "rb") as fp: -# checkpoint_state = pickle.load(fp) -# start_epoch = checkpoint_state["epoch"] -# net.load_state_dict(checkpoint_state["net_state_dict"]) -# optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) -# else: -# start_epoch = 0 +# checkpoint = tune.get_checkpoint() +# if checkpoint: +# with checkpoint.as_directory() as checkpoint_dir: +# checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" +# checkpoint_state = torch.load(checkpoint_path) +# start_epoch = checkpoint_state["epoch"] +# net.load_state_dict(checkpoint_state["net_state_dict"]) +# optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) +# else: +# start_epoch = 0 # # The learning rate of the optimizer is made configurable, too: # # .. code-block:: python # -# optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) +# optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) # -# We also split the training data into a training and validation subset. We thus train on -# 80% of the data and calculate the validation loss on the remaining 20%. The batch sizes -# with which we iterate through the training and test sets are configurable as well. +# We also split the training data into a training and validation subset. +# We thus train on 80% of the data and calculate the validation loss on +# the remaining 20%. The batch sizes with which we iterate through the +# training and test sets are configurable as well. # -# Adding (multi) GPU support with DataParallel -# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -# Image classification benefits largely from GPUs. Luckily, we can continue to use -# PyTorch's abstractions in Ray Tune. Thus, we can wrap our model in ``nn.DataParallel`` -# to support data parallel training on multiple GPUs: +# Add multi-GPU support with DataParallel +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# +# Image classification benefits largely from GPUs. Luckily, you can +# continue to use PyTorch tools in Ray Tune. Thus, you can wrap the model +# in ``nn.DataParallel`` to support data-parallel training on multiple +# GPUs: # # .. 
code-block:: python # -# device = "cpu" -# if torch.cuda.is_available(): -# device = "cuda:0" -# if torch.cuda.device_count() > 1: -# net = nn.DataParallel(net) -# net.to(device) +# device = "cpu" +# if torch.cuda.is_available(): +# device = "cuda:0" +# if torch.cuda.device_count() > 1: +# net = nn.DataParallel(net) +# net.to(device) # -# By using a ``device`` variable we make sure that training also works when we have -# no GPUs available. PyTorch requires us to send our data to the GPU memory explicitly, -# like this: +# By using a ``device`` variable, we ensure that training works even +# without a GPU. PyTorch requires us to send our data to the GPU memory +# explicitly: # # .. code-block:: python # -# for i, data in enumerate(trainloader, 0): -# inputs, labels = data -# inputs, labels = inputs.to(device), labels.to(device) +# for i, data in enumerate(trainloader, 0): +# inputs, labels = data +# inputs, labels = inputs.to(device), labels.to(device) # -# The code now supports training on CPUs, on a single GPU, and on multiple GPUs. Notably, Ray -# also supports `fractional GPUs `_ -# so we can share GPUs among trials, as long as the model still fits on the GPU memory. We'll come back -# to that later. +# The code now supports training on CPUs, on a single GPU, and on multiple +# GPUs. Notably, Ray also supports `fractional +# GPUs `__ +# so we can share GPUs among trials, as long as the model still fits on +# the GPU memory. We will return to that later. # # Communicating with Ray Tune # ~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# The most interesting part is the communication with Ray Tune: +# The most interesting part is the communication with Ray Tune. As you’ll +# see, integrating Ray Tune into your training code requires only a few +# additional lines: # # .. code-block:: python # -# checkpoint_data = { -# "epoch": epoch, -# "net_state_dict": net.state_dict(), -# "optimizer_state_dict": optimizer.state_dict(), -# } -# with tempfile.TemporaryDirectory() as checkpoint_dir: -# data_path = Path(checkpoint_dir) / "data.pkl" -# with open(data_path, "wb") as fp: -# pickle.dump(checkpoint_data, fp) -# -# checkpoint = Checkpoint.from_directory(checkpoint_dir) -# train.report( -# {"loss": val_loss / val_steps, "accuracy": correct / total}, -# checkpoint=checkpoint, -# ) -# -# Here we first save a checkpoint and then report some metrics back to Ray Tune. Specifically, -# we send the validation loss and accuracy back to Ray Tune. Ray Tune can then use these metrics -# to decide which hyperparameter configuration lead to the best results. These metrics -# can also be used to stop bad performing trials early in order to avoid wasting -# resources on those trials. -# -# The checkpoint saving is optional, however, it is necessary if we wanted to use advanced -# schedulers like -# `Population Based Training `_. -# Also, by saving the checkpoint we can later load the trained models and validate them -# on a test set. Lastly, saving checkpoints is useful for fault tolerance, and it allows -# us to interrupt training and continue training later. 
+# checkpoint_data = { +# "epoch": epoch, +# "net_state_dict": net.state_dict(), +# "optimizer_state_dict": optimizer.state_dict(), +# } +# with tempfile.TemporaryDirectory() as checkpoint_dir: +# checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" +# torch.save(checkpoint_data, checkpoint_path) +# +# checkpoint = Checkpoint.from_directory(checkpoint_dir) +# tune.report( +# {"loss": val_loss / val_steps, "accuracy": correct / total}, +# checkpoint=checkpoint, +# ) +# +# Here we first save a checkpoint and then report some metrics back to Ray +# Tune. Specifically, we send the validation loss and accuracy back to Ray +# Tune. Ray Tune uses these metrics to determine the best hyperparameter +# configuration and to stop underperforming trials early, saving +# resources. +# +# The checkpoint saving is optional. However, it is necessary if we wanted +# to use advanced schedulers like `Population Based +# Training `__. +# Saving the checkpoint also allows us to later load the trained models +# for validation on a test set. Lastly, it provides fault tolerance, +# enabling us to pause and resume training. +# +# To summarize, integrating Ray Tune into your PyTorch training requires +# just a few key additions: use ``tune.report()`` to report metrics (and +# optionally checkpoints) to Ray Tune, ``tune.get_checkpoint()`` to load a +# model from a checkpoint, and ``Checkpoint.from_directory()`` to create a +# checkpoint object from saved state. The rest of your training code +# remains standard PyTorch. # # Full training function # ~~~~~~~~~~~~~~~~~~~~~~ # # The full code example looks like this: - def train_cifar(config, data_dir=None): net = Net(config["l1"], config["l2"]) @@ -243,12 +242,12 @@ def train_cifar(config, data_dir=None): criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9) - checkpoint = get_checkpoint() + # Load checkpoint if resuming training + checkpoint = tune.get_checkpoint() if checkpoint: with checkpoint.as_directory() as checkpoint_dir: - data_path = Path(checkpoint_dir) / "data.pkl" - with open(data_path, "rb") as fp: - checkpoint_state = pickle.load(fp) + checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" + checkpoint_state = torch.load(checkpoint_path) start_epoch = checkpoint_state["epoch"] net.load_state_dict(checkpoint_state["net_state_dict"]) optimizer.load_state_dict(checkpoint_state["optimizer_state_dict"]) @@ -263,10 +262,10 @@ def train_cifar(config, data_dir=None): ) trainloader = torch.utils.data.DataLoader( - train_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8 + train_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=2 ) valloader = torch.utils.data.DataLoader( - val_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=8 + val_subset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=2 ) for epoch in range(start_epoch, 10): # loop over the dataset multiple times @@ -315,34 +314,34 @@ def train_cifar(config, data_dir=None): val_loss += loss.cpu().numpy() val_steps += 1 + # Save checkpoint and report metrics checkpoint_data = { "epoch": epoch, "net_state_dict": net.state_dict(), "optimizer_state_dict": optimizer.state_dict(), } with tempfile.TemporaryDirectory() as checkpoint_dir: - data_path = Path(checkpoint_dir) / "data.pkl" - with open(data_path, "wb") as fp: - pickle.dump(checkpoint_data, fp) + checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" + torch.save(checkpoint_data, checkpoint_path) checkpoint = 
Checkpoint.from_directory(checkpoint_dir) - train.report( + tune.report( {"loss": val_loss / val_steps, "accuracy": correct / total}, checkpoint=checkpoint, ) - - print("Finished Training") + print("Finished Training") ###################################################################### -# As you can see, most of the code is adapted directly from the original example. +# As you can see, most of the code is adapted directly from the original +# example. # -# Test set accuracy -# ----------------- -# Commonly the performance of a machine learning model is tested on a hold-out test -# set with data that has not been used for training the model. We also wrap this in a -# function: - +# Compute test set accuracy +# ------------------------- +# +# Commonly the performance of a machine learning model is tested on a +# held-out test set with data that has not been used for training the +# model. We also wrap this in a function: def test_accuracy(net, device="cpu"): trainset, testset = load_data() @@ -364,69 +363,115 @@ def test_accuracy(net, device="cpu"): return correct / total - ###################################################################### -# The function also expects a ``device`` parameter, so we can do the -# test set validation on a GPU. +# The function also expects a ``device`` parameter so you can run the test +# set validation on a GPU. # -# Configuring the search space -# ---------------------------- -# Lastly, we need to define Ray Tune's search space. Here is an example: +# Configure the search space +# -------------------------- # -# .. code-block:: python +# Lastly, we need to define Ray Tune’s search space. Ray Tune offers a +# variety of `search space +# distributions `__ +# to suit different parameter types: ``loguniform``, ``uniform``, +# ``choice``, ``randint``, ``grid``, and more. It also lets you express +# complex dependencies between parameters with `conditional search +# spaces `__. # -# config = { -# "l1": tune.choice([2 ** i for i in range(9)]), -# "l2": tune.choice([2 ** i for i in range(9)]), -# "lr": tune.loguniform(1e-4, 1e-1), -# "batch_size": tune.choice([2, 4, 8, 16]) -# } +# Here is an example: # -# The ``tune.choice()`` accepts a list of values that are uniformly sampled from. -# In this example, the ``l1`` and ``l2`` parameters -# should be powers of 2 between 4 and 256, so either 4, 8, 16, 32, 64, 128, or 256. -# The ``lr`` (learning rate) should be uniformly sampled between 0.0001 and 0.1. Lastly, -# the batch size is a choice between 2, 4, 8, and 16. -# -# At each trial, Ray Tune will now randomly sample a combination of parameters from these -# search spaces. It will then train a number of models in parallel and find the best -# performing one among these. We also use the ``ASHAScheduler`` which will terminate bad -# performing trials early. +# .. code-block:: python # -# We wrap the ``train_cifar`` function with ``functools.partial`` to set the constant -# ``data_dir`` parameter. We can also tell Ray Tune what resources should be -# available for each trial: +# config = { +# "l1": tune.choice([2**i for i in range(9)]), +# "l2": tune.choice([2**i for i in range(9)]), +# "lr": tune.loguniform(1e-4, 1e-1), +# "batch_size": tune.choice([2, 4, 8, 16]), +# } +# +# The ``tune.choice()`` accepts a list of values that are uniformly +# sampled from. In this example, the ``l1`` and ``l2`` parameter values +# will be powers of 2 between 1 and 256. The learning rate is sampled on a +# log scale between 0.0001 and 0.1. 
Sampling on a log scale ensures that +# the search space is explored efficiently across different magnitudes. +# +# Smarter sampling and scheduling +# ------------------------------- +# +# To make the hyperparameter search process efficient, Ray Tune provides +# two main controls: +# +# 1. It can intelligently pick the next set of hyperparameters to test +# based on previous results using `advanced search +# algorithms `__ +# such as +# `Optuna `__ +# or +# ```bayesopt`` `__, +# instead of relying only on random or grid search. +# 2. It can detect underperforming trials and stop them early using +# `schedulers `__, +# enabling you to explore the parameter space more on the same compute +# budget. +# +# In this tutorial, we use the ``ASHAScheduler``, which aggressively +# terminates low-performing trials to save computational resources. +# +# Configure the resources +# ----------------------- +# +# Tell Ray Tune what resources should be available for each trial using +# ``tune.with_resources``: # # .. code-block:: python # -# gpus_per_trial = 2 -# # ... -# result = tune.run( -# partial(train_cifar, data_dir=data_dir), -# resources_per_trial={"cpu": 8, "gpu": gpus_per_trial}, -# config=config, -# num_samples=num_samples, -# scheduler=scheduler, -# checkpoint_at_end=True) +# tune.with_resources( +# partial(train_cifar, data_dir=data_dir), +# resources={"cpu": cpus_per_trial, "gpu": gpus_per_trial} +# ) # -# You can specify the number of CPUs, which are then available e.g. -# to increase the ``num_workers`` of the PyTorch ``DataLoader`` instances. The selected -# number of GPUs are made visible to PyTorch in each trial. Trials do not have access to -# GPUs that haven't been requested for them - so you don't have to care about two trials -# using the same set of resources. +# This tells Ray Tune to allocate ``cpus_per_trial`` CPUs and +# ``gpus_per_trial`` GPUs for each trial. Ray Tune automatically manages +# the placement of these trials and ensures they are isolated, so you +# don’t need to manually assign GPUs to processes. # -# Here we can also specify fractional GPUs, so something like ``gpus_per_trial=0.5`` is -# completely valid. The trials will then share GPUs among each other. -# You just have to make sure that the models still fit in the GPU memory. +# For example, if you are running this experiment on a cluster of 20 +# machines, each with 8 GPUs, you can set ``gpus_per_trial = 0.5`` to +# schedule 2 concurrent trials per GPU. This configuration runs 320 trials +# in parallel across the cluster. # -# After training the models, we will find the best performing one and load the trained -# network from the checkpoint file. We then obtain the test set accuracy and report -# everything by printing. +# Putting it together +# ------------------- # -# The full main function looks like this: - - -def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): +# The Ray Tune API is designed to be modular and composable: you pass your +# configurations to the ``tune.Tuner`` class to create a tuner object, +# then execute ``tuner.fit()`` to start training: +# +# .. 
code-block:: python +# +# tuner = tune.Tuner( +# tune.with_resources( +# partial(train_cifar, data_dir=data_dir), +# resources={"cpu": cpus_per_trial, "gpu": gpus_per_trial} +# ), +# tune_config=tune.TuneConfig( +# metric="loss", +# mode="min", +# scheduler=scheduler, +# num_samples=num_samples, +# ), +# param_space=config, +# ) +# results = tuner.fit() +# +# After training the models, we will find the best performing one and load +# the trained network from the checkpoint file. We then obtain the test +# set accuracy and report the results. + +def main(num_trials=10, max_num_epochs=10, gpus_per_trial=2): + print("Starting hyperparameter tuning.") + ray.init() + data_dir = os.path.abspath("./data") load_data(data_dir) config = { @@ -436,26 +481,32 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): "batch_size": tune.choice([2, 4, 8, 16]), } scheduler = ASHAScheduler( - metric="loss", - mode="min", max_t=max_num_epochs, grace_period=1, reduction_factor=2, ) - result = tune.run( - partial(train_cifar, data_dir=data_dir), - resources_per_trial={"cpu": 2, "gpu": gpus_per_trial}, - config=config, - num_samples=num_samples, - scheduler=scheduler, + + tuner = tune.Tuner( + tune.with_resources( + partial(train_cifar, data_dir=data_dir), + resources={"cpu": 2, "gpu": gpus_per_trial} + ), + tune_config=tune.TuneConfig( + metric="loss", + mode="min", + scheduler=scheduler, + num_samples=num_trials, + ), + param_space=config, ) + results = tuner.fit() - best_trial = result.get_best_trial("loss", "min", "last") - print(f"Best trial config: {best_trial.config}") - print(f"Best trial final validation loss: {best_trial.last_result['loss']}") - print(f"Best trial final validation accuracy: {best_trial.last_result['accuracy']}") + best_result = results.get_best_result("loss", "min") + print(f"Best trial config: {best_result.config}") + print(f"Best trial final validation loss: {best_result.metrics['loss']}") + print(f"Best trial final validation accuracy: {best_result.metrics['accuracy']}") - best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"]) + best_trained_model = Net(best_result.config["l1"], best_result.config["l2"]) device = "cpu" if torch.cuda.is_available(): device = "cuda:0" @@ -463,11 +514,10 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): best_trained_model = nn.DataParallel(best_trained_model) best_trained_model.to(device) - best_checkpoint = result.get_best_checkpoint(trial=best_trial, metric="accuracy", mode="max") + best_checkpoint = best_result.checkpoint with best_checkpoint.as_directory() as checkpoint_dir: - data_path = Path(checkpoint_dir) / "data.pkl" - with open(data_path, "rb") as fp: - best_checkpoint_data = pickle.load(fp) + checkpoint_path = Path(checkpoint_dir) / "checkpoint.pt" + best_checkpoint_data = torch.load(checkpoint_path) best_trained_model.load_state_dict(best_checkpoint_data["net_state_dict"]) test_acc = test_accuracy(best_trained_model, device) @@ -475,38 +525,74 @@ def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2): if __name__ == "__main__": - # You can change the number of GPUs per trial here: - main(num_samples=10, max_num_epochs=10, gpus_per_trial=0) - + # Set the number of trials, epochs, and GPUs per trial here: + # The following configuration is for a quick run (1 trial, 1 epoch, CPU only) for demonstration purposes. 
+ main(num_trials=1, max_num_epochs=1, gpus_per_trial=0) ###################################################################### -# If you run the code, an example output could look like this: -# -# .. code-block:: sh -# -# Number of trials: 10/10 (10 TERMINATED) -# +-----+--------------+------+------+-------------+--------+---------+------------+ -# | ... | batch_size | l1 | l2 | lr | iter | loss | accuracy | -# |-----+--------------+------+------+-------------+--------+---------+------------| -# | ... | 2 | 1 | 256 | 0.000668163 | 1 | 2.31479 | 0.0977 | -# | ... | 4 | 64 | 8 | 0.0331514 | 1 | 2.31605 | 0.0983 | -# | ... | 4 | 2 | 1 | 0.000150295 | 1 | 2.30755 | 0.1023 | -# | ... | 16 | 32 | 32 | 0.0128248 | 10 | 1.66912 | 0.4391 | -# | ... | 4 | 8 | 128 | 0.00464561 | 2 | 1.7316 | 0.3463 | -# | ... | 8 | 256 | 8 | 0.00031556 | 1 | 2.19409 | 0.1736 | -# | ... | 4 | 16 | 256 | 0.00574329 | 2 | 1.85679 | 0.3368 | -# | ... | 8 | 2 | 2 | 0.00325652 | 1 | 2.30272 | 0.0984 | -# | ... | 2 | 2 | 2 | 0.000342987 | 2 | 1.76044 | 0.292 | -# | ... | 4 | 64 | 32 | 0.003734 | 8 | 1.53101 | 0.4761 | -# +-----+--------------+------+------+-------------+--------+---------+------------+ -# -# Best trial config: {'l1': 64, 'l2': 32, 'lr': 0.0037339984519545164, 'batch_size': 4} -# Best trial final validation loss: 1.5310075663924216 -# Best trial final validation accuracy: 0.4761 -# Best trial test set accuracy: 0.4737 -# -# Most trials have been stopped early in order to avoid wasting resources. -# The best performing trial achieved a validation accuracy of about 47%, which could -# be confirmed on the test set. -# -# So that's it! You can now tune the parameters of your PyTorch models. +# Your Ray Tune trial summary output will look something like this: +# +# .. code-block:: bash +# +# Number of trials: 10/10 (10 TERMINATED) +# +-----+--------------+------+------+-------------+--------+---------+------------+ +# | ... | batch_size | l1 | l2 | lr | iter | loss | accuracy | +# |-----+--------------+------+------+-------------+--------+---------+------------| +# | ... | 2 | 1 | 256 | 0.000668163 | 1 | 2.31479 | 0.0977 | +# | ... | 4 | 64 | 8 | 0.0331514 | 1 | 2.31605 | 0.0983 | +# | ... | 4 | 2 | 1 | 0.000150295 | 1 | 2.30755 | 0.1023 | +# | ... | 16 | 32 | 32 | 0.0128248 | 10 | 1.66912 | 0.4391 | +# | ... | 4 | 8 | 128 | 0.00464561 | 2 | 1.7316 | 0.3463 | +# | ... | 8 | 256 | 8 | 0.00031556 | 1 | 2.19409 | 0.1736 | +# | ... | 4 | 16 | 256 | 0.00574329 | 2 | 1.85679 | 0.3368 | +# | ... | 8 | 2 | 2 | 0.00325652 | 1 | 2.30272 | 0.0984 | +# | ... | 2 | 2 | 2 | 0.000342987 | 2 | 1.76044 | 0.292 | +# | ... | 4 | 64 | 32 | 0.003734 | 8 | 1.53101 | 0.4761 | +# +-----+--------------+------+------+-------------+--------+---------+------------+ +# +# Best trial config: {'l1': 64, 'l2': 32, 'lr': 0.0037339984519545164, 'batch_size': 4} +# Best trial final validation loss: 1.5310075663924216 +# Best trial final validation accuracy: 0.4761 +# Best trial test set accuracy: 0.4737 +# +# Most trials were stopped early to conserve resources. The best +# performing trial achieved a validation accuracy of approximately 47%, +# which could be confirmed on the test set. +# +# You can now tune the parameters of your PyTorch models. +# +# Observability +# ------------- +# +# When running large-scale experiments, monitoring is crucial. Ray +# provides a +# `Dashboard `__ +# that lets you view the status of your trials, check cluster resource +# utilization, and inspect logs in real-time. 
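+#
+# For example, the dashboard address is printed to the console when Ray
+# starts, and you can also read it from the context object returned by
+# ``ray.init()``. The following is a minimal sketch (assuming a default
+# local Ray setup, where the dashboard serves on ``127.0.0.1:8265``):
+#
+# .. code-block:: python
+#
+#     context = ray.init()
+#     print(f"Ray dashboard: http://{context.dashboard_url}")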
+# +# For debugging, Ray also offers `Distributed +# Debugging `__ +# tools that let you attach a debugger to running trials across the +# cluster. +# +# Conclusion +# ---------- +# +# In this tutorial, you learned how to tune the hyperparameters of a +# PyTorch model using Ray Tune. You saw how to integrate Ray Tune into +# your PyTorch training loop, define a search space for your +# hyperparameters, use an efficient scheduler like ASHA to terminate bad +# trials early, save checkpoints and report metrics to Ray Tune, and run +# the hyperparameter search and analyze the results. +# +# Ray Tune makes it easy to scale your experiments from a single machine +# to a large cluster, helping you find the best model configuration +# efficiently. +# +# Further reading +# --------------- +# +# - `Ray Tune +# documentation `__ +# - `Ray Tune +# examples `__ diff --git a/ecosystem.rst b/ecosystem.rst index da2a926851a..ddd6c505561 100644 --- a/ecosystem.rst +++ b/ecosystem.rst @@ -33,7 +33,7 @@ to production deployment. :card_description: Learn how to use Ray Tune to find the best performing set of hyperparameters for your model. :image: _static/img/ray-tune.png :link: beginner/hyperparameter_tuning_tutorial.html - :tags: Model-Optimization,Best-Practice,Ecosystem + :tags: Model-Optimization,Best-Practice,Ecosystem,Ray-Distributed,Parallel-and-Distributed-Training .. customcarditem:: :header: Multi-Objective Neural Architecture Search with Ax diff --git a/index.rst b/index.rst index 5a5e80abfbb..f9a76296750 100644 --- a/index.rst +++ b/index.rst @@ -493,7 +493,7 @@ Welcome to PyTorch Tutorials :card_description: Learn how to use Ray Tune to find the best performing set of hyperparameters for your model. :image: _static/img/ray-tune.png :link: beginner/hyperparameter_tuning_tutorial.html - :tags: Model-Optimization,Best-Practice + :tags: Model-Optimization,Best-Practice,Ray-Distributed,Parallel-and-Distributed-Training .. customcarditem:: :header: Parametrizations Tutorial