
Commit 94bd6c9

syedazi and nghtm authored
Trainium llama3 (#363)
* First commit
* Updates to run 70B model
* Updated ReadMe for SMHP-Tranium Example, tested on dev cluster
* Updated ReadMe minor changes
* Updated ReadMe minor changes v2
* Updated ReadMe minor changes v3
* Updated ReadMe minor changes v4

Co-authored-by: nghtm <[email protected]>
1 parent e944da0 commit 94bd6c9

File tree

14 files changed: +2837 −0 lines changed
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
{
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 8192,
  "initializer_range": 0.02,
  "intermediate_size": 28672,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 64,
  "num_hidden_layers": 80,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.40.0.dev0",
  "use_cache": true,
  "vocab_size": 128256,
  "sequence_parallel_enabled": false,
  "selective_checkpoint_enabled": false,
  "move_model_to_device": false
}
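As a rough sanity check (a sketch, not part of this commit; the file name `70B_config.json` is illustrative), the config above can be loaded with Hugging Face transformers to estimate the parameter count it implies:

```python
# Rough sanity check (illustrative only; file name is assumed): estimate the
# parameter count implied by the config above.
from transformers import LlamaConfig

cfg = LlamaConfig.from_json_file("70B_config.json")

head_dim = cfg.hidden_size // cfg.num_attention_heads
# q and o projections are hidden_size x hidden_size; k and v use 8 grouped KV heads.
attn = 2 * cfg.hidden_size * cfg.hidden_size \
     + 2 * cfg.hidden_size * head_dim * cfg.num_key_value_heads
# SwiGLU MLP has gate, up, and down projections.
mlp = 3 * cfg.hidden_size * cfg.intermediate_size
# Input embedding plus untied output head.
embeddings = 2 * cfg.vocab_size * cfg.hidden_size

total = cfg.num_hidden_layers * (attn + mlp) + embeddings
print(f"~{total / 1e9:.1f}B parameters")  # about 70.6B with the values above
```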
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
{
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pad_token_id": 0,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.31.0",
  "use_cache": true,
  "vocab_size": 128256,
  "sequence_parallel_enabled": false,
  "selective_checkpoint_enabled": false,
  "move_model_to_device": false
}
Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@

# How to pre-train Llama3 with SageMaker Hyperpod using Amazon Trainium

## What is SageMaker Hyperpod?
[Amazon SageMaker Hyperpod](https://aws.amazon.com/sagemaker/hyperpod/) offers advanced training tools to help you accelerate scalable, reliable, and secure generative AI application development. It removes the undifferentiated heavy lifting involved in building and optimizing machine learning (ML) infrastructure for training foundation models (FMs), significantly reducing training time. SageMaker Hyperpod ensures customers can continue FM training uninterrupted by periodically saving checkpoints. When a hardware failure occurs during training, SageMaker Hyperpod automatically detects the failure, repairs or replaces the faulty instance, and resumes training from the last saved checkpoint, removing the need for customers to manage this process manually and helping them train for weeks or months in a distributed setting without disruption.

SageMaker Hyperpod also allows customers to run their FM training workloads on [AWS Trainium](https://aws.amazon.com/machine-learning/trainium/). AWS Trainium is the machine learning (ML) chip that AWS purpose built for deep learning (DL) training of 100B+ parameter models. Each Amazon Elastic Compute Cloud (Amazon EC2) [Trn1 instance](https://aws.amazon.com/ec2/instance-types/trn1) deploys up to 16 Trainium accelerators to deliver a high-performance, low-cost solution for DL training in the cloud. The [AWS Neuron SDK](https://aws.amazon.com/machine-learning/neuron/) helps developers train models on Trainium accelerators (and deploy them on [AWS Inferentia](https://aws.amazon.com/machine-learning/inferentia/) accelerators). It natively integrates popular frameworks such as PyTorch and TensorFlow, so you can keep your existing code and workflows while training on Trainium accelerators.
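As a minimal illustration of that integration (a sketch for orientation, not part of this example's scripts), torch-neuronx exposes each Trainium NeuronCore to PyTorch as an XLA device, so ordinary tensor code runs on it unchanged:

```python
# Minimal smoke test on a trn1 node (assumes the torch-neuronx environment from
# Section 1 is active); standard PyTorch ops execute on a NeuronCore via XLA.
import torch
import torch_xla.core.xla_model as xm

device = xm.xla_device()               # first available NeuronCore
x = torch.randn(4, 4, device=device)   # tensor allocated on Trainium
y = (x @ x.T).sum()
xm.mark_step()                         # materialize the lazily-traced XLA graph
print(y.cpu())
```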
## 0. Prerequisites
You will need a SageMaker Hyperpod cluster with 4 [trn1.32xlarge](https://aws.amazon.com/ec2/instance-types/trn1/) instances and a shared parallel filesystem such as [Amazon FSx for Lustre](https://docs.aws.amazon.com/fsx/latest/LustreGuide/getting-started.html). See the [SageMaker Hyperpod](https://github.com/aws-samples/awsome-distributed-training/tree/main/1.architectures/5.sagemaker-hyperpod) architectures folder for setup instructions.

## 1. Create Environment

1. Once the cluster is set up, SSH into the cluster head/controller node and switch to the `ubuntu` user:
```bash
sudo su - ubuntu
```
> [!NOTE]
> You will run the following steps from the head/controller node of your cluster.

2. Make sure your home directory is `/fsx/ubuntu`, as this allows us to install the required dependencies only once, on the head node:

```bash
pwd
```

3. Next, install and create a Python virtual environment:

```bash
# Install Python venv
sudo apt-get install -y python3.8-venv g++

# Create Python venv
python3.8 -m venv aws_neuron_venv_pytorch
```

Now let's activate the virtual environment:
```bash
# Activate Python venv
source aws_neuron_venv_pytorch/bin/activate
python -m pip install -U pip
```

4. Install PyTorch Neuron (an optional import check follows these commands):

```bash
# Install Jupyter notebook kernel
pip install ipykernel
python3.8 -m ipykernel install --user --name aws_neuron_venv_pytorch --display-name "Python (torch-neuronx)"
pip install jupyter notebook
pip install environment_kernels

# Set pip repository pointing to the Neuron repository
python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com

# Install wget, awscli
python -m pip install wget
python -m pip install awscli

# Install Neuron Compiler and Framework
python -m pip install neuronx-cc==2.* torch-neuronx torchvision
python -m pip install neuronx_distributed --extra-index-url https://pip.repos.neuron.amazonaws.com
```
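As an optional sanity check (a sketch assuming the installs above succeeded), you can confirm the Neuron packages import cleanly from inside the virtual environment:

```python
# Optional sanity check: run inside aws_neuron_venv_pytorch after the installs above.
import torch
import torch_neuronx          # PyTorch Neuron integration
import neuronx_distributed    # tensor/pipeline parallelism library used by this example

print("torch", torch.__version__)
print("Neuron packages imported successfully")
```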
On your cluster head node, clone this repo:
```bash
git clone https://github.com/aws-samples/awsome-distributed-training/
cd awsome-distributed-training/3.test_cases/22.SMHP-trainium-llama3
```

With the repo cloned, let's install the libraries defined in its `requirements.txt` file:

```bash
# Install requirements.txt
pip install -r requirements.txt
```

## 2. Prepare Dataset

Next, we need to tokenize our dataset. To tokenize the data, you must request the tokenizer from HuggingFace and Meta by following the instructions at [HuggingFace Llama 3 8B Model](https://huggingface.co/meta-llama/Meta-Llama-3-8B). Use of the Llama 3 model is governed by the Meta license; visit that page and accept the license before requesting access. After access has been granted, you can download the model weights and tokenizer to your cluster as described below.
1. Install the huggingface CLI:

```bash
pip install huggingface-hub
```

2. Authenticate with your [HuggingFace Access Token](https://huggingface.co/settings/tokens).
> [!IMPORTANT]
> Ensure your HuggingFace Access Token has permissions to public gated repos. You can configure your token to access public gated repos with: (*Edit Access Token Permissions* > check the box *Read access to contents of all public gated repos you can access* > Save).
```bash
huggingface-cli login
```
3. Download the [Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) repo from HuggingFace:
> [!NOTE]
> We pass the `--include "*"` flag to ensure we clone the entire repo, including the sub-directory `original/`, which contains the `tokenizer.model` file we will copy in the next step.
```bash
huggingface-cli download meta-llama/Meta-Llama-3-8B --include "*" --local-dir Meta-Llama-3-8B
```

4. Within the current working directory `22.SMHP-trainium-llama3`, copy `config.json`, `tokenizer_config.json`, `tokenizer.json`, and `original/tokenizer.model` from the cloned `Meta-Llama-3-8B` directory into the working directory so they can be picked up by our scripts:

```bash
cp Meta-Llama-3-8B/tokenizer_config.json Meta-Llama-3-8B/config.json Meta-Llama-3-8B/tokenizer.json Meta-Llama-3-8B/original/tokenizer.model .
```

5. Run the `get_dataset.py` script to prepare the dataset for training (a conceptual sketch of what such a script does follows this list). We run it via `srun` so that it executes on a compute (trn1) node:

```bash
srun --job-name=get_dataset_job --output=get_dataset_output.log --nodes=1 python get_dataset.py &
```

>[!IMPORTANT]
>The `get_dataset.py` job will take several minutes to run; do not proceed until it has completed. You can monitor the job logs with:
>```bash
>tail -f get_dataset_output.log
>```
> Once `squeue` shows the job has completed and `sinfo` shows all nodes as idle, you can proceed to the next section, **Compile the Model**.
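Conceptually, dataset preparation for this example looks like the following (a hypothetical sketch, not the committed `get_dataset.py`; the corpus and output path are illustrative):

```python
# Hypothetical sketch of dataset preparation (not the committed get_dataset.py).
# It tokenizes a public text corpus with the Llama 3 tokenizer files copied in
# step 4 and saves the result to the shared FSx filesystem for the training job.
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(".")  # tokenizer files live in the working dir

raw = load_dataset("wikitext", "wikitext-103-raw-v1", split="train")  # illustrative corpus

def tokenize(batch):
    return tokenizer(batch["text"])

tokenized = raw.map(tokenize, batched=True, remove_columns=raw.column_names)
tokenized.save_to_disk("/fsx/ubuntu/llama3_tokenized")  # illustrative output location
```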
## 3. Compile the Model

Next, we will compile the model graphs using the [neuron parallel compile](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/frameworks/torch/torch-neuronx/api-reference-guide/training/pytorch-neuron-parallel-compile.html#pytorch-neuronx-parallel-compile-cli) tool. `neuron_parallel_compile` runs the training script just long enough to extract its XLA graphs and compiles them ahead of time, populating the Neuron cache so the actual training run does not pay the compilation cost.

```bash
sbatch --exclusive \
--nodes 4 \
--cpus-per-task 64 \
--wrap="srun neuron_parallel_compile bash $(pwd)/run_llama_8B_tp_pp.sh"
```
## 4. Run Training

Once the graphs are compiled, we can run model training:

```bash
sbatch --exclusive \
--nodes 4 \
--cpus-per-task 64 \
--wrap="srun bash $(pwd)/run_llama_8B_tp_pp.sh"
```

## Running the 70B model

If you would like to compile and train the 70B model instead, run the `run_llama_70B_tp_pp.sh` script as below:

- Model Compilation

```bash
sbatch --exclusive \
--nodes 4 \
--cpus-per-task 64 \
--wrap="srun neuron_parallel_compile bash $(pwd)/run_llama_70B_tp_pp.sh"
```

- Model Training

```bash
sbatch --exclusive \
--nodes 4 \
--cpus-per-task 64 \
--wrap="srun bash $(pwd)/run_llama_70B_tp_pp.sh"
```
Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
from typing import Any, Dict, Iterator, Tuple
import torch.nn as nn

import torch
from torch_xla.utils.checkpoint import checkpoint as torch_checkpoint
from neuronx_distributed.parallel_layers.parallel_state import rmsg
from neuronx_distributed.utils.logger import get_logger
from torch.distributed.utils import _replace_by_prefix

logger = get_logger()

_CHECKPOINT_WRAPPED_MODULE = "mod"
_CHECKPOINT_PREFIX = _CHECKPOINT_WRAPPED_MODULE + "."


class CheckPointWrapper(torch.nn.Module):
    def __init__(self, mod) -> None:
        super().__init__()
        self.mod = mod
        # state_dict post hook to remove prefix to allow loading into a
        # non-checkpoint wrapped module.
        self._register_state_dict_hook(self._post_state_dict_hook)
        # load_state_dict pre-hook to allow loading back into
        # checkpoint-wrapped module.
        self._register_load_state_dict_pre_hook(
            self._pre_load_state_dict_hook, with_module=True
        )

    def forward(self, *args, **kwargs):
        ordered_args = list(args)
        for value in kwargs.values():
            ordered_args += [value]

        # Note: checkpoint cannot accept kwargs
        return torch_checkpoint(self.mod, *ordered_args, use_reentrant=True)

    def named_parameters(
        self,
        *args,
        **kwargs,
    ) -> Iterator[Tuple[str, torch.nn.Parameter]]:
        """
        Overrides :meth:`named_parameters()` to intercept parameter names and
        remove all occurrences of ``_CHECKPOINT_PREFIX``.
        """
        for param_name, param in super().named_parameters(*args, **kwargs):
            updated_name = param_name.replace(_CHECKPOINT_PREFIX, "")
            yield updated_name, param

    def named_modules(self, *args, **kwargs):
        for module_name, module in super().named_modules(*args, **kwargs):
            updated_name = module_name.replace(_CHECKPOINT_PREFIX, "")
            yield updated_name, module

    @staticmethod
    def _post_state_dict_hook(
        module: nn.Module,
        state_dict: Dict[str, Any],
        prefix: str,
        *args: Any,
    ) -> Dict[str, Any]:
        """
        _post_state_dict_hook() is called after the state_dict() of this
        FSDP module is executed. For ``checkpoint_wrapper``, it will strip the
        checkpoint-wrapped module prefix so that this module can be loaded into
        non-checkpointed modules. It would still be able to be loaded into
        checkpoint-wrapped modules as this class adds the prefix back before
        loading the state_dict.
        """
        _replace_by_prefix(state_dict, f"{prefix}{_CHECKPOINT_PREFIX}", prefix)
        return state_dict

    @staticmethod
    def _pre_load_state_dict_hook(
        module: nn.Module,
        state_dict: Dict[str, Any],
        prefix: str,
        *args: Any,
    ) -> None:
        """
        ``_pre_load_state_dict_hook`` is called before ``self._load_from_state_dict()``
        is called. For ``checkpoint_wrapper``, it will add back the module
        prefix so that non-checkpointed modules can be loaded into
        checkpoint_wrapper modules properly.
        """
        _replace_by_prefix(state_dict, prefix, prefix + f"{_CHECKPOINT_PREFIX}")


def apply_checkpoint(dist_model, layers_to_checkpoint=None):
    checkpoint_wrapper_added = False
    if layers_to_checkpoint is not None and len(layers_to_checkpoint) == 0:
        raise RuntimeError(
            rmsg(f"invalid input layers_to_checkpoint {layers_to_checkpoint}, can't be empty")
        )
    for name, module in dist_model.local_module.named_children():
        # checkpoint layers that are provided in input
        # if layers are not provided in input, then checkpoint if it is a transformer layer
        if (layers_to_checkpoint and name in layers_to_checkpoint) or (
            not layers_to_checkpoint and type(module) == dist_model.transformer_layer_cls
        ):
            # add_module replaces old module with our own custom module.
            # https://pytorch.org/docs/stable/_modules/torch/nn/modules/module.html#Module.add_module
            dist_model.local_module.add_module(name, CheckPointWrapper(module))
            checkpoint_wrapper_added = True
    if layers_to_checkpoint is not None and not checkpoint_wrapper_added:
        logger.warning(
            rmsg(f"layers_to_checkpoint {layers_to_checkpoint} do not exist in the graph")
        )
    elif layers_to_checkpoint is None and not checkpoint_wrapper_added:
        logger.warning(
            rmsg(
                f"During applying activation checkpointing, transformer_layer_cls "
                f"{dist_model.transformer_layer_cls.__name__} can not be found in stage "
                f"{dist_model.pipeline_parallel_rank}, skipping..."
            )
        )
