From ebb79c86a0749de9de5a78b2611f2a2735d9a2f9 Mon Sep 17 00:00:00 2001 From: Risto Luukkonen Date: Wed, 5 Oct 2022 09:24:52 +0300 Subject: [PATCH 01/37] Squash 3 commits to 1 Patch HIP-support to fused kernels from Microsoft/Megatron-DeepSpeed-fork Path HIP-support to fused kernels from Microsoft/Megatron-DeepSpeed-fork remove some local files --- .gitignore | 10 +++ megatron/fused_kernels/__init__.py | 80 +++++++++++-------- .../fused_kernels/layer_norm_cuda_kernel.cu | 21 ++++- .../scaled_masked_softmax_cuda.cu | 2 + .../scaled_upper_triang_masked_softmax.h | 3 +- ...scaled_upper_triang_masked_softmax_cuda.cu | 2 + 6 files changed, 84 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index 6b6db06fb..267995984 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,13 @@ +## HIP-compiled kernels etc. +*hip* +# +local_examples/ +logs/ +trash/ +kb-runs-gpt/ +ds_configs/ +gpt2-tokenizer/ +smi-output/ # tests # megatron autogenerated indices tests/data/*/*npy diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index e2ac2567b..bdc654c39 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -17,81 +17,97 @@ import pathlib import subprocess +import torch from torch.utils import cpp_extension +# Setting this param to a list has a problem of generating different +# compilation commands (with diferent order of architectures) and +# leading to recompilation of fused kernels. Set it to empty string +# to avoid recompilation and assign arch flags explicity in +# extra_cuda_cflags below +os.environ["TORCH_CUDA_ARCH_LIST"] = "" + def load(args): - # Setting this param to a list has a problem of generating different - # compilation commands (with diferent order of architectures) and - # leading to recompilation of fused kernels. Set it to empty string - # to avoid recompilation and assign arch flags explicity in - # extra_cuda_cflags below - # - # but if a user wants to set an explicit list of archs to compile to, then let that list - # through: - arch_list = os.environ.get('TORCH_CUDA_ARCH_LIST', None) - if arch_list is None: - os.environ["TORCH_CUDA_ARCH_LIST"] = "" - - # # Check if cuda 11 is installed for compute capability 8.0 - # cc_flag = [] - # _, bare_metal_major, _ = _get_cuda_bare_metal_version( - # cpp_extension.CUDA_HOME) - # if int(bare_metal_major) >= 11: - # cc_flag.append('-gencode') - # cc_flag.append('arch=compute_80,code=sm_80') + # Check if cuda 11 is installed for compute capability 8.0 + cc_flag = [] + if torch.version.hip is None: + _, bare_metal_major, _ = _get_cuda_bare_metal_version( + cpp_extension.CUDA_HOME) + if int(bare_metal_major) >= 11: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_80,code=sm_80') # Build path srcpath = pathlib.Path(__file__).parent.absolute() buildpath = srcpath / 'build' - buildpath.mkdir(parents=True, exist_ok=True) + _create_build_dir(buildpath) # Helper function to build the kernels. 
- def _cpp_extention_load_helper(name, sources, extra_cuda_flags): + def _cpp_extention_load_helper(name, sources, extra_cuda_flags, extra_include_paths): + if torch.version.hip is not None: + extra_cuda_cflags=['-O3'] + extra_cuda_flags + cc_flag + else: + extra_cuda_cflags=['-O3', + '-gencode', 'arch=compute_70,code=sm_70', + '--use_fast_math'] + extra_cuda_flags + cc_flag + return cpp_extension.load( name=name, sources=sources, build_directory=buildpath, extra_cflags=['-O3',], - extra_cuda_cflags=['-O3', - '--use_fast_math'] + extra_cuda_flags, + extra_cuda_cflags=extra_cuda_cflags, + extra_include_paths=extra_include_paths, verbose=(args.rank == 0) ) - # '-gencode', 'arch=compute_70,code=sm_70', # ============== # Fused softmax. # ============== + if torch.version.hip is not None: + extra_include_paths=[os.path.abspath(srcpath)] + else: + extra_include_paths=[] + if args.masked_softmax_fusion: - extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda'] + if torch.version.hip is not None: + extra_cuda_flags = ['-D__HIP_NO_HALF_OPERATORS__=1', + '-D__HIP_NO_HALF_CONVERSIONS__=1'] + else: + extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda'] # Upper triangular softmax. sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( "scaled_upper_triang_masked_softmax_cuda", - sources, extra_cuda_flags) + sources, extra_cuda_flags, extra_include_paths) # Masked softmax. sources=[srcpath / 'scaled_masked_softmax.cpp', srcpath / 'scaled_masked_softmax_cuda.cu'] scaled_masked_softmax_cuda = _cpp_extention_load_helper( - "scaled_masked_softmax_cuda", sources, extra_cuda_flags) + "scaled_masked_softmax_cuda", sources, extra_cuda_flags, extra_include_paths) # ================================= # Mixed precision fused layer norm. 
# ================================= - extra_cuda_flags = ['-maxrregcount=50'] + if torch.version.hip is not None: + extra_cuda_flags = [] + else: + extra_cuda_flags = ['-maxrregcount=50'] + sources=[srcpath / 'layer_norm_cuda.cpp', srcpath / 'layer_norm_cuda_kernel.cu'] fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper( - "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags) + "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags, extra_include_paths) def _get_cuda_bare_metal_version(cuda_dir): diff --git a/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/layer_norm_cuda_kernel.cu index 28a579e1a..ea14f80f7 100644 --- a/megatron/fused_kernels/layer_norm_cuda_kernel.cu +++ b/megatron/fused_kernels/layer_norm_cuda_kernel.cu @@ -246,14 +246,25 @@ void cuWelfordMuSigma2( } } } - +#ifndef __HIP_PLATFORM_HCC__ template U rsqrt(U v) { +#else +template __device__ U rsqrt(U v) { +#endif return U(1) / sqrt(v); } +#ifndef __HIP_PLATFORM_HCC__ template<> float rsqrt(float v) { +#else +template<> __device__ float rsqrt(float v) { +#endif return rsqrtf(v); } +#ifndef __HIP_PLATFORM_HCC__ template<> double rsqrt(double v) { +#else +template<> __device__ double rsqrt(double v) { +#endif return rsqrt(v); } @@ -304,7 +315,11 @@ void cuApplyLayerNorm( // 1) blockDim.x == warpSize // 2) Tensors are contiguous // +#ifndef __HIP_PLATFORM_HCC__ for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#else + for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#endif SharedMemory shared; U* buf = shared.getPointer(); U mu,sigma2; @@ -543,7 +558,11 @@ void cuComputeGradInput( const V* gamma, T* grad_input) { +#ifndef __HIP_PLATFORM_HCC__ for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#else + for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#endif U sum_loss1 = U(0); U sum_loss2 = U(0); const U c_mean = mean[i1]; diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu index 2efee39a6..0c068c7cb 100644 --- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu @@ -18,7 +18,9 @@ #include #include #include +#ifndef __HIP_PLATFORM_HCC__ #include +#endif #include #include #include "scaled_masked_softmax.h" diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h index 6df83fc10..ee140c037 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h @@ -17,7 +17,8 @@ #pragma once #include -#include +#include +// #include #include #include #include diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu index 5efc3d412..59e452584 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu @@ -18,7 +18,9 @@ #include #include #include +#ifndef __HIP_PLATFORM_HCC__ #include +#endif #include #include #include "scaled_upper_triang_masked_softmax.h" From 21c90de18f702731c96fbdf6e70a7435e2f486f7 Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Fri, 21 Oct 2022 14:54:16 +0300 Subject: [PATCH 02/37] Add --no-layer-norm-fusion argument --- megatron/arguments.py | 3 +++ megatron/model/fused_layer_norm.py | 11 +++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/megatron/arguments.py 
b/megatron/arguments.py index c18235a78..78f4724d8 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -549,6 +549,9 @@ def _add_training_args(parser): group.add_argument('--no-bias-dropout-fusion', action='store_false', help='Disable bias and dropout fusion.', dest='bias_dropout_fusion') + group.add_argument('--no-layer-norm-fusion', action='store_true', + help='Disable fused layer norm.', + dest='layer_norm_fusion') group.add_argument('--optimizer', type=str, default='adam', choices=['adam', 'sgd'], help='Optimizer function') diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 55e9c9dd8..7b1d7eaa7 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -86,10 +86,13 @@ def __init__(self, normalized_shape, eps=1e-5): args = get_args() self.layernorm_tp_auto_sync = args.sync_tp_duplicated_parameters - self.use_meg_ds_fused_layer_norm = ( - args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm - or version.parse(torch.__version__) >= version.parse("1.11.0") # https://github.com/pytorch/pytorch/pull/66920 - ) + if not args.layer_norm_fusion: + self.use_meg_ds_fused_layer_norm = False + else: + self.use_meg_ds_fused_layer_norm = ( + args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm + or version.parse(torch.__version__) >= version.parse("1.11.0") # https://github.com/pytorch/pytorch/pull/66920 + ) def reset_parameters(self): From e0487132140df9d7fec727689864d2d03b022a3e Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Fri, 21 Oct 2022 15:19:15 +0300 Subject: [PATCH 03/37] Add --no-optimizer-fusion argument --- megatron/arguments.py | 3 +++ megatron/optimizer/__init__.py | 22 +++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 78f4724d8..ca416670d 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -552,6 +552,9 @@ def _add_training_args(parser): group.add_argument('--no-layer-norm-fusion', action='store_true', help='Disable fused layer norm.', dest='layer_norm_fusion') + group.add_argument('--no-optimizer-fusion', action='store_true', + help='Disable FusedAdam/FusedSGD norm.', + dest='optimizer_fusion') group.add_argument('--optimizer', type=str, default='adam', choices=['adam', 'sgd'], help='Optimizer function') diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 738717d55..b492abfb4 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -13,8 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from apex.optimizers import FusedAdam as Adam -from apex.optimizers import FusedSGD as SGD +from torch.optim import AdamW +from torch.optim import SGD +from apex.optimizers import FusedAdam +from apex.optimizers import FusedSGD from megatron import get_args from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm @@ -72,18 +74,24 @@ def get_megatron_optimizer(model): if args.use_bnb_optimizer: import bitsandbytes as bnb adam_optimizer = bnb.optim.Adam8bit + elif args.optimizer_fusion: + adam_optimizer = FusedAdam else: - adam_optimizer = Adam + adam_optimizer = AdamW optimizer = adam_optimizer(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps) elif args.optimizer == 'sgd': - optimizer = SGD(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum) + if args.optimizer_fusion: + sgd_optimizer = FusedSGD + else: + sgd_optimizer = SGD + optimizer = sgd_optimizer(param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum) else: raise Exception('{} optimizer is not supported.'.format( args.optimizer)) From 18e2c65becf98e73744fd23b4e3b38fc5ccb1e0b Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Fri, 21 Oct 2022 15:57:59 +0300 Subject: [PATCH 04/37] Bugfix (thanks to Thomas Wang for catching this) --- megatron/arguments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index ca416670d..d2499d149 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -549,10 +549,10 @@ def _add_training_args(parser): group.add_argument('--no-bias-dropout-fusion', action='store_false', help='Disable bias and dropout fusion.', dest='bias_dropout_fusion') - group.add_argument('--no-layer-norm-fusion', action='store_true', + group.add_argument('--no-layer-norm-fusion', action='store_false', help='Disable fused layer norm.', dest='layer_norm_fusion') - group.add_argument('--no-optimizer-fusion', action='store_true', + group.add_argument('--no-optimizer-fusion', action='store_false', help='Disable FusedAdam/FusedSGD norm.', dest='optimizer_fusion') group.add_argument('--optimizer', type=str, default='adam', From 9b7cd052ed2bb93c96b40dc7f34c4556655fed6e Mon Sep 17 00:00:00 2001 From: Hubert Lu <55214931+hubertlu-tw@users.noreply.github.com> Date: Fri, 18 Nov 2022 07:20:46 -0800 Subject: [PATCH 05/37] Fix the bug of FusedLayerNorm on ROCm (#96) --- .../fused_kernels/layer_norm_cuda_kernel.cu | 57 ++++++++++++------- megatron/model/fused_layer_norm.py | 17 +++--- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/layer_norm_cuda_kernel.cu index ea14f80f7..aae0c993c 100644 --- a/megatron/fused_kernels/layer_norm_cuda_kernel.cu +++ b/megatron/fused_kernels/layer_norm_cuda_kernel.cu @@ -76,7 +76,8 @@ void cuWelfordMuSigma2( const int i1, U& mu, U& sigma2, - U* buf) + U* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -106,12 +107,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1<(muB,sigma2B,countB,mu,sigma2,count); + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + U sigma2B = WARP_SHFL_DOWN(sigma2, stride); + U muB = WARP_SHFL_DOWN(mu, stride); + U countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, 
count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -160,7 +160,8 @@ void cuWelfordMuSigma2( const int i1, float& mu, float& sigma2, - float* buf) + float* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -201,12 +202,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1< 0; stride /= 2) { + float sigma2B = WARP_SHFL_DOWN(sigma2, stride); + float muB = WARP_SHFL_DOWN(mu, stride); + float countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -308,7 +308,8 @@ void cuApplyLayerNorm( const int n2, const U epsilon, const V* __restrict__ gamma, - const V* __restrict__ beta + const V* __restrict__ beta, + const int GPU_WARP_SIZE ) { // Assumptions: @@ -323,7 +324,7 @@ void cuApplyLayerNorm( SharedMemory shared; U* buf = shared.getPointer(); U mu,sigma2; - cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf); + cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf,GPU_WARP_SIZE); const T* lvals = vals + i1*n2; V* ovals = output_vals + i1*n2; U c_invvar = rsqrt(sigma2 + epsilon); @@ -686,7 +687,11 @@ void HostApplyLayerNorm( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); - const dim3 threads(32,4,1); + const int warp_size = at::cuda::warp_size(); + dim3 threads(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads.y = 1; +#endif const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1); @@ -701,7 +706,9 @@ void HostApplyLayerNorm( input, n1,n2, U(epsilon), - gamma,beta); + gamma, + beta, + warp_size); } @@ -754,11 +761,16 @@ void HostLayerNormGradient( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int warp_size = at::cuda::warp_size(); if (gamma != NULL && beta != NULL) { // compute grad_gamma(j) and grad_beta(j) +#ifndef __HIP_PLATFORM_HCC__ + const int part_size = warp_size; +#else const int part_size = 16; - const dim3 threads2(32,4,1); +#endif + const dim3 threads2(warp_size,4,1); const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1); const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1); @@ -777,7 +789,7 @@ void HostLayerNormGradient( part_grad_gamma.DATA_PTR(), part_grad_beta.DATA_PTR()); - const dim3 threads3(32,8,1); + const dim3 threads3(warp_size,8,1); const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1); const int nshared3 = threads3.x * threads3.y * sizeof(U); cuComputeGradGammaBeta<<>>( @@ -793,7 +805,10 @@ void HostLayerNormGradient( const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); - const dim3 threads1(32,4,1); + dim3 threads1(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads1.y = 2; +#endif int nshared = threads1.y > 1 ? 
threads1.y*threads1.x*sizeof(U) : diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 7b1d7eaa7..cadf45561 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -28,7 +28,7 @@ from torch.nn.parameter import Parameter import importlib import torch -import torch.nn.functional as F +from torch.nn import functional as F global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = None @@ -102,13 +102,14 @@ def reset_parameters(self): def forward(self, input): - if self.layernorm_tp_auto_sync: torch.distributed.all_reduce(self.weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) - - if self.use_meg_ds_fused_layer_norm: - return FusedLayerNormAffineFunction.apply( - input, self.weight, self.bias, self.normalized_shape, self.eps) - else: - return F.layer_norm(input, self.normalized_shape, self.weight, self.bias) += + # CPU path is here for unittest sake. + if not input.is_cuda: + print("WARNING! The input of FusedLayerNorm should be on the GPU." + "This warning should only be triggered in the FusedLayerNorm unit tests.") + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) + return FusedLayerNormAffineFunction.apply( + input, self.weight, self.bias, self.normalized_shape,self.eps) From 277e1d38760375f1b0356cc84b25ead74da5a0cb Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Fri, 18 Nov 2022 21:20:06 +0200 Subject: [PATCH 06/37] Revert cherry-picked changes to .py --- megatron/model/fused_layer_norm.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index cadf45561..7b1d7eaa7 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -28,7 +28,7 @@ from torch.nn.parameter import Parameter import importlib import torch -from torch.nn import functional as F +import torch.nn.functional as F global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = None @@ -102,14 +102,13 @@ def reset_parameters(self): def forward(self, input): + if self.layernorm_tp_auto_sync: torch.distributed.all_reduce(self.weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) -= - # CPU path is here for unittest sake. - if not input.is_cuda: - print("WARNING! The input of FusedLayerNorm should be on the GPU." 
- "This warning should only be triggered in the FusedLayerNorm unit tests.") - return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) - return FusedLayerNormAffineFunction.apply( - input, self.weight, self.bias, self.normalized_shape,self.eps) + + if self.use_meg_ds_fused_layer_norm: + return FusedLayerNormAffineFunction.apply( + input, self.weight, self.bias, self.normalized_shape, self.eps) + else: + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias) From 2963caeaf1c55cfc6aee2eae39d999fcc7ed1ecf Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 22 Nov 2022 14:18:39 +0400 Subject: [PATCH 07/37] Add LUMI eval compat --- examples/run_evalharness_deepspeed.md | 1 + examples/run_evalharness_lumi.sh | 113 ++++++++++++++++++ tasks/eval_harness/download.py | 1 + tasks/eval_harness/evaluate.py | 2 +- .../deepspeed_to_megatron.py | 2 +- 5 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 examples/run_evalharness_lumi.sh diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 695d9d0aa..60f380d9c 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -15,6 +15,7 @@ Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `b start-prod pip install best-download==0.0.7 pip install git+https://github.com/EleutherAI/lm-evaluation-harness +pip install --upgrade scipy ``` 2. Pre-download needed datasets diff --git a/examples/run_evalharness_lumi.sh b/examples/run_evalharness_lumi.sh new file mode 100644 index 000000000..71fcd1e6d --- /dev/null +++ b/examples/run_evalharness_lumi.sh @@ -0,0 +1,113 @@ +#!/bin/bash +#SBATCH --exclude=nid005159 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p eap +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:1 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest_eval.out and logs/latest_eval.err +ln -f -s $SLURM_JOB_ID.out logs/latest_eval.out +ln -f -s $SLURM_JOB_ID.err logs/latest_eval.err + +# Data +CHECKPOINT_PATH=/scratch/project_462000119/muennighoff/nov-2022-optimization/checkpoints/global_step10 +VARIANT=global_step10 + +export HF_DATASETS_OFFLINE=1 +export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache + +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +PP_SIZE=1 +TP_SIZE=1 +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 +MICRO_BS_MULTIPLIER=1 + +# Model parameters +SEQ_LEN=2048 + +# Dummy arguments +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + 
+CMD="Megatron-DeepSpeed/tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --bf16 \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list piqa \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier $MICRO_BS_MULTIPLIER \ + $MEGATRON_REQUIRED_ARGS \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" + diff --git a/tasks/eval_harness/download.py b/tasks/eval_harness/download.py index d2abcd83a..be5d5c303 100644 --- a/tasks/eval_harness/download.py +++ b/tasks/eval_harness/download.py @@ -1,6 +1,7 @@ # Downloads the specified taks in the evaluation harness # This is particularly useful when running in environments where the GPU nodes # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. +# May want to set a cache before, e.g. export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache from lm_eval import tasks from lm_eval.tasks import ALL_TASKS diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 68dd649fd..7e0cdb60e 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -260,7 +260,7 @@ def tokenizer_encode(self, text): from megatron.initialize import initialize_megatron import megatron -from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint +from deepspeed.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint def override_args(args, override_args, skip_keys, skip_if_specified_keys): diff --git a/tools/convert_checkpoint/deepspeed_to_megatron.py b/tools/convert_checkpoint/deepspeed_to_megatron.py index 74e5ca7c9..08471d0bb 100755 --- a/tools/convert_checkpoint/deepspeed_to_megatron.py +++ b/tools/convert_checkpoint/deepspeed_to_megatron.py @@ -4,7 +4,7 @@ import os import torch from collections import OrderedDict -from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint +from deepspeed.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint MODEL_KEY = 'model' ARGS_KEY = 'args' From 32f039c2d08da415d49559cad907d50a6ed971a9 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 22 Nov 2022 15:37:24 +0400 Subject: [PATCH 08/37] Update tasks --- examples/run_evalharness_lumi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_evalharness_lumi.sh b/examples/run_evalharness_lumi.sh index 71fcd1e6d..1721d91d2 100644 --- a/examples/run_evalharness_lumi.sh +++ b/examples/run_evalharness_lumi.sh @@ -95,7 +95,7 @@ CMD="Megatron-DeepSpeed/tasks/eval_harness/evaluate.py \ --bf16 \ --inference \ --seq-length $SEQ_LEN \ - --task_list piqa \ + --task_list 
copa,piqa,rte,winogrande,hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions \ --intermed_results \ --adaptive_seq_len \ --micro_bs_multiplier $MICRO_BS_MULTIPLIER \ From 2ca2338c5633d0e5e529bbf7a613030564aa442b Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Tue, 22 Nov 2022 17:52:05 +0200 Subject: [PATCH 09/37] add inverse_sqrt lr decay style --- megatron/arguments.py | 2 +- megatron/learning_rates.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index d2499d149..f06ff6dbd 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -610,7 +610,7 @@ def _add_learning_rate_args(parser): 'and initial warmup, the learing rate at each ' 'iteration would be different.') group.add_argument('--lr-decay-style', type=str, default='linear', - choices=['constant', 'linear', 'cosine'], + choices=['constant', 'linear', 'cosine', 'inverse_sqrt'], help='Learning rate decay function.') group.add_argument('--lr-decay-iters', type=int, default=None, help='number of iterations to decay learning rate over,' diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index ae1fcdb2b..8fa6f40af 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -68,6 +68,9 @@ def get_lr(self): if self.num_steps == self.warmup_steps and \ self.decay_tokens is not None: self.warmup_tokens = self.num_tokens + # if self.decay_style == 'inverse_sqrt': + # # use constant warmup for inverse_sqrt + # return 1e-2 return self.max_lr * float(self.num_steps) / \ float(self.warmup_steps) @@ -102,6 +105,9 @@ def get_lr(self): coeff = (1.0 - decay_ratio) elif self.decay_style == 'cosine': coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + elif self.decay_style == 
'inverse_sqrt': + return self.max_lr * math.sqrt(float(self.warmup_steps)) / \ + math.sqrt(float(num_steps_)) else: raise Exception('{} decay style is not supported.'.format( self.decay_style)) From ad60932feb79513fdfbbd5f4d82dfe25fc08eedd Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Tue, 22 Nov 2022 17:54:55 +0200 Subject: [PATCH 10/37] fix no warmup case --- megatron/learning_rates.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 8fa6f40af..e23758d95 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -106,8 +106,8 @@ def get_lr(self): elif self.decay_style == 'cosine': coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) elif self.decay_style == 'inverse_sqrt': - return self.max_lr * math.sqrt(float(self.warmup_steps)) / \ - math.sqrt(float(num_steps_)) + return self.max_lr * math.sqrt(float(max(self.warmup_steps, 1))) / \ + math.sqrt(float(max(num_steps_,1))) else: raise Exception('{} decay style is not supported.'.format( self.decay_style)) From 0823ad8c1dc8a1b444dfc679648658892980b0fa Mon Sep 17 00:00:00 2001 From: Nouamane Tazi Date: Wed, 23 Nov 2022 15:05:27 +0100 Subject: [PATCH 11/37] use t5x formula Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com> --- megatron/learning_rates.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index e23758d95..73e16d519 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -106,8 +106,7 @@ def get_lr(self): elif self.decay_style == 'cosine': coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) elif self.decay_style == 'inverse_sqrt': - return self.max_lr * math.sqrt(float(max(self.warmup_steps, 1))) / \ - math.sqrt(float(max(num_steps_,1))) + return self.max_lr / math.sqrt(float(max(num_steps_, 1))) else: raise Exception('{} decay style is not supported.'.format( self.decay_style)) From a093db6fe567eae48d379b992b8fb16c97b9ef9f Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Wed, 23 Nov 2022 15:11:54 +0100 Subject: [PATCH 12/37] avoid num_steps > decay_steps case --- megatron/learning_rates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 73e16d519..9a46d5ccd 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -78,6 +78,9 @@ def get_lr(self): if self.decay_style == 'constant': return self.max_lr + if self.decay_style == 'inverse_sqrt': + return self.max_lr / math.sqrt(float(max(num_steps_, 1))) + if self.decay_tokens is None: # step-based decay @@ -105,8 +108,6 @@ def get_lr(self): coeff = (1.0 - decay_ratio) elif self.decay_style == 'cosine': coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - elif self.decay_style == 'inverse_sqrt': - return self.max_lr / math.sqrt(float(max(num_steps_, 1))) else: raise Exception('{} decay style is not supported.'.format( self.decay_style)) From b4601b9ef327025f678d01ef0ebf0f6d3c9be1a5 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Wed, 23 Nov 2022 15:15:17 +0100 Subject: [PATCH 13/37] remove casting as math.sqrt does that --- megatron/learning_rates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 9a46d5ccd..777133ce8 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -79,7 +79,7 @@ def get_lr(self): return self.max_lr if self.decay_style == 
'inverse_sqrt': - return self.max_lr / math.sqrt(float(max(num_steps_, 1))) + return self.max_lr / math.sqrt(max(num_steps_, 1)) if self.decay_tokens is None: # step-based decay From 4dae1399e4b5a2fa2f1659ccf8b6b4668f9ded62 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Wed, 23 Nov 2022 15:28:42 +0100 Subject: [PATCH 14/37] add lr-warmup-style argument taking "constant" or "linear" values --- megatron/arguments.py | 2 ++ megatron/learning_rates.py | 20 ++++++++++++-------- megatron/training.py | 1 + 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index f06ff6dbd..946ccda9a 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -621,6 +621,8 @@ def _add_learning_rate_args(parser): group.add_argument('--lr-decay-tokens', type=int, default=None, help='number of tokens to decay learning rate over,' ' If not None will override iter/sample-based decay') + group.add_argument('--lr-warmup-style', type=str, default='linear', + choices=['constant', 'linear']), group.add_argument('--lr-warmup-fraction', type=float, default=None, help='fraction of lr-warmup-(iters/samples) to use ' 'for warmup (as a float)') diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 777133ce8..77a74c683 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -23,7 +23,7 @@ class AnnealingLR(object): """Anneals the learning rate.""" def __init__(self, optimizer, max_lr, min_lr, - warmup_steps, decay_steps, decay_style, + warmup_steps, decay_steps, decay_style, warmup_style, use_checkpoint_lr_scheduler=True, override_lr_scheduler=False): args = get_args() @@ -46,6 +46,7 @@ def __init__(self, optimizer, max_lr, min_lr, self.warmup_tokens = 0 self.decay_style = decay_style + self.warmup_style = warmup_style self.override_lr_scheduler = override_lr_scheduler self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler @@ -63,23 +64,26 @@ def get_lr(self): """Learning rate decay functions from: https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" - # Use linear warmup for the initial part. + # Use warmup for the initial part. if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: if self.num_steps == self.warmup_steps and \ self.decay_tokens is not None: self.warmup_tokens = self.num_tokens - # if self.decay_style == 'inverse_sqrt': - # # use constant warmup for inverse_sqrt - # return 1e-2 - return self.max_lr * float(self.num_steps) / \ - float(self.warmup_steps) + if self.warmup_style == 'linear': + return self.max_lr * float(self.num_steps) / \ + float(self.warmup_steps) + elif self.warmup_style == 'constant': + return self.max_lr + else: + raise ValueError('Unknown warmup style: {}'.format( + self.warmup_style)) # If the learning rate is constant, just return the initial value. 
if self.decay_style == 'constant': return self.max_lr if self.decay_style == 'inverse_sqrt': - return self.max_lr / math.sqrt(max(num_steps_, 1)) + return self.max_lr / math.sqrt(max(self.num_steps - self.warmup_steps, 1)) if self.decay_tokens is None: # step-based decay diff --git a/megatron/training.py b/megatron/training.py index bd00bc77e..d86c51440 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -361,6 +361,7 @@ def get_learning_rate_scheduler(optimizer): warmup_steps=warmup_steps, decay_steps=decay_steps, decay_style=args.lr_decay_style, + warmup_style=args.lr_warmup_style, use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler, override_lr_scheduler=args.override_lr_scheduler) From 5fbb1dd54c39d98059dc8ee688225981cb4466c1 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Wed, 23 Nov 2022 19:28:13 +0100 Subject: [PATCH 15/37] refactor num_steps_ --- megatron/learning_rates.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 77a74c683..b21890dab 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -83,7 +83,8 @@ def get_lr(self): return self.max_lr if self.decay_style == 'inverse_sqrt': - return self.max_lr / math.sqrt(max(self.num_steps - self.warmup_steps, 1)) + num_steps_ = self.num_steps - self.warmup_steps + return self.max_lr / math.sqrt(max(num_steps_, 1)) if self.decay_tokens is None: # step-based decay From 6299fb24bb169b58581d103403cd10b489ec69de Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Wed, 23 Nov 2022 19:30:10 +0100 Subject: [PATCH 16/37] docs --- megatron/arguments.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 946ccda9a..ed5cf2db6 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -622,7 +622,8 @@ def _add_learning_rate_args(parser): help='number of tokens to decay learning rate over,' ' If not None will override iter/sample-based decay') group.add_argument('--lr-warmup-style', type=str, default='linear', - choices=['constant', 'linear']), + choices=['constant', 'linear'], help='Learning rate ' + 'warmup function.') group.add_argument('--lr-warmup-fraction', type=float, default=None, help='fraction of lr-warmup-(iters/samples) to use ' 'for warmup (as a float)') From 4e8665093377eaf1713edfb8abbcf062a8034ca1 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Thu, 24 Nov 2022 16:17:28 +0100 Subject: [PATCH 17/37] fix formulas --- megatron/learning_rates.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index b21890dab..5bc0c02ea 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -64,6 +64,19 @@ def get_lr(self): """Learning rate decay functions from: https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" + + if self.decay_style == 'inverse_sqrt': + if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: + if self.warmup_style == 'linear': + return self.max_lr * self.num_steps / (self.warmup_steps * self.warmup_steps**0.5) + elif self.warmup_style == 'constant': + return self.max_lr / self.warmup_steps**0.5 + else: + raise ValueError('Unknown warmup style: {}'.format( + self.warmup_style)) + + return self.max_lr / (max(self.num_steps, 1))**0.5 + # Use warmup for the initial part. 
if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: if self.num_steps == self.warmup_steps and \ @@ -82,9 +95,6 @@ def get_lr(self): if self.decay_style == 'constant': return self.max_lr - if self.decay_style == 'inverse_sqrt': - num_steps_ = self.num_steps - self.warmup_steps - return self.max_lr / math.sqrt(max(num_steps_, 1)) if self.decay_tokens is None: # step-based decay From 50c693599725bbab9d5416a880c822652d5f8d2c Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Thu, 24 Nov 2022 20:04:32 +0100 Subject: [PATCH 18/37] fix formula Co-authored-by: thomasw21 <24695242+thomasw21@users.noreply.github.com> --- megatron/learning_rates.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 5bc0c02ea..4ae63d3ca 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -65,18 +65,6 @@ def get_lr(self): https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" - if self.decay_style == 'inverse_sqrt': - if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: - if self.warmup_style == 'linear': - return self.max_lr * self.num_steps / (self.warmup_steps * self.warmup_steps**0.5) - elif self.warmup_style == 'constant': - return self.max_lr / self.warmup_steps**0.5 - else: - raise ValueError('Unknown warmup style: {}'.format( - self.warmup_style)) - - return self.max_lr / (max(self.num_steps, 1))**0.5 - # Use warmup for the initial part. if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: if self.num_steps == self.warmup_steps and \ @@ -96,6 +84,12 @@ def get_lr(self): return self.max_lr + # If linear + # In warmup phase: lr = max_lr + # In decay phase: lr = max_lr * sqrt(warmup_steps) / sqrt(num_steps) + if self.decay_style == 'inverse_sqrt': + return self.max_lr * (max(self.warmup_steps, 1) / max(self.num_steps, 1))**0.5 + if self.decay_tokens is None: # step-based decay From 5c642dd30eceab840e186f44f06ac221ffc277a7 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Mon, 28 Nov 2022 13:38:00 +0100 Subject: [PATCH 19/37] correct comment --- megatron/learning_rates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 4ae63d3ca..e74534775 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -84,7 +84,7 @@ def get_lr(self): return self.max_lr - # If linear + # If constant decay style # In warmup phase: lr = max_lr # In decay phase: lr = max_lr * sqrt(warmup_steps) / sqrt(num_steps) if self.decay_style == 'inverse_sqrt': From 1b14a28cfc95dcb05be0bf79b4ba72394a490f1d Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Mon, 28 Nov 2022 15:26:41 +0100 Subject: [PATCH 20/37] note about replicating t5x --- megatron/learning_rates.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index e74534775..c886b9814 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -87,6 +87,7 @@ def get_lr(self): # If constant decay style # In warmup phase: lr = max_lr # In decay phase: lr = max_lr * sqrt(warmup_steps) / sqrt(num_steps) + # Note: To replicate t5x check https://github.com/TurkuNLP/Megatron-DeepSpeed/pull/2 if self.decay_style == 'inverse_sqrt': return self.max_lr * (max(self.warmup_steps, 1) / max(self.num_steps, 1))**0.5 From 5365f41f37b8f8d8df9be8a52d2595c760de08ef Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Tue, 6 Dec 2022 20:26:44 +0100 Subject: [PATCH 21/37] quick fix for upper triang 
masked softmax cuda kernel for seq_len < 8192 --- .../scaled_upper_triang_masked_softmax.h | 20 +++++++++++++++++-- ...scaled_upper_triang_masked_softmax_cuda.cu | 2 +- .../fused_kernels/tests/test_fused_kernels.py | 4 ++++ megatron/initialize.py | 2 +- megatron/model/fused_softmax.py | 4 ++-- 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h index ee140c037..4e6a467ce 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h @@ -341,7 +341,7 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -416,6 +416,14 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( scaled_upper_triang_masked_softmax_warp_forward <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } @@ -432,7 +440,7 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -507,6 +515,14 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( scaled_upper_triang_masked_softmax_warp_backward <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu index 59e452584..4aa9a702a 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu @@ -37,7 +37,7 @@ torch::Tensor fwd_cuda( // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] const int attn_batches = input.size(0); const int seq_len = input.size(1); - TORCH_INTERNAL_ASSERT(seq_len <= 2048); + TORCH_INTERNAL_ASSERT(seq_len <= 8192); // Output auto act_options = input.options().requires_grad(false); diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index f8d5027a1..3e7deb913 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -3,6 +3,10 @@ import torch from torch.nn import LayerNorm +import sys +# add to path +sys.path.append("/home/nouamane/projects/Megatron-DeepSpeed/") +import megatron 
from megatron.model.enums import AttnMaskType from megatron.model.fused_layer_norm import MixedFusedLayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax diff --git a/megatron/initialize.py b/megatron/initialize.py index b967a228a..2e92b15af 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -194,7 +194,7 @@ def _compile_dependencies(): args.micro_batch_size # Constraints on sequence length and attn_batch_size to enable warp based # optimization and upper triangular optimization (for causal mask) - custom_kernel_constraint = seq_len > 16 and seq_len <=2048 and \ + custom_kernel_constraint = seq_len > 16 and seq_len <=8192 and \ seq_len % 4 == 0 and attn_batch_size % 4 == 0 # Print a warning. if not ((args.fp16 or args.bf16) and diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 07192e2bf..57c6c4769 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -167,11 +167,11 @@ def is_kernel_available(self, mask, b, np, sq, sk): if ( self.scaled_masked_softmax_fusion # user want to fuse and self.input_in_float16 # input must be fp16 - and 16 < sk <= 4096 # sk must be 16 ~ 4096 + and 16 < sk <= 8192 # sk must be 16 ~ 8192 and sq % 4 == 0 # sq must be divisor of 4 and attn_batches % 4 == 0 # np * b must be divisor of 4 ): - if 0 <= sk <= 4096: + if 0 <= sk <= 8192: batch_per_block = self.get_batch_per_block(sq, sk, b, np) if self.attn_mask_type == AttnMaskType.causal: From c41cc5e0e2cbc4046dd9646c925f162a0d16ee41 Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Wed, 7 Dec 2022 14:04:33 +0200 Subject: [PATCH 22/37] Use torch.multiprocessing.set_start_method('spawn') Resolves DataLoader segfaults seen in some multinode runs with PP > 1. --- megatron/data/indexed_dataset.py | 2 +- pretrain_gpt.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index d0d312544..0322cba86 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -513,7 +513,7 @@ def __getstate__(self): return self._path def __setstate__(self, state): - self._do_init(state) + self._do_init(state, False) def _do_init(self, path, skip_warmup): self._path = path diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 5f8b104fc..2d0691f4e 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -228,6 +228,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): @record def main(): + torch.multiprocessing.set_start_method('spawn') pretrain(train_valid_test_datasets_provider, model_provider, forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From 6732bc9b568e568498fd291895579a359afd080b Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Fri, 9 Dec 2022 08:44:46 +0200 Subject: [PATCH 23/37] skip_warmup on __setstate__ --- megatron/data/indexed_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 0322cba86..403e4b5d2 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -513,7 +513,7 @@ def __getstate__(self): return self._path def __setstate__(self, state): - self._do_init(state, False) + self._do_init(state, True) def _do_init(self, path, skip_warmup): self._path = path From ab29faf5e6106ece378019e1712820095e75e5a7 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Wed, 28 Dec 2022 18:09:06 +0400 Subject: [PATCH 24/37] Copy preliminary UL2 --- megatron/arguments.py | 50 
+++++- megatron/data/dataset_utils.py | 61 ++++++- megatron/data/t5_dataset.py | 52 ++++-- megatron/data/ul2_dataset.py | 281 ++++++++++++++++++++++++++++++++ megatron/enums.py | 5 + megatron/model/fused_softmax.py | 2 +- megatron/tokenizer/tokenizer.py | 12 ++ pretrain_ul2.py | 185 +++++++++++++++++++++ 8 files changed, 628 insertions(+), 20 deletions(-) create mode 100644 megatron/data/ul2_dataset.py create mode 100644 pretrain_ul2.py diff --git a/megatron/arguments.py b/megatron/arguments.py index ed5cf2db6..cca7bd57d 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -24,7 +24,7 @@ import torch import deepspeed -from megatron.enums import PositionEmbeddingType +from megatron.enums import PositionEmbeddingType, UL2ModelType import megatron from megatron.logging import log_levels @@ -49,6 +49,7 @@ def parse_args(extra_args_provider=None, defaults={}, parser = _add_autoresume_args(parser) parser = _add_biencoder_args(parser) parser = _add_vit_args(parser) + parser = _add_ul2_args(parser) parser = _add_logging_args(parser) parser = _add_zero_args(parser) parser = _add_memoryopt_args(parser) @@ -309,6 +310,17 @@ def parse_args(extra_args_provider=None, defaults={}, "skip train iterations should be specified as two numbers, i.e. start-end" ) args.skip_train_iteration_range = skip_train_iteration_range + + args.ul2_model_type = UL2ModelType(args.ul2_model_type) + if ( + args.ul2_model_type is not UL2ModelType.ENCODER_DECODER + and args.decoder_seq_length is not None + ): + print( + f'WARNING: `--decoder_seq_length` is ignored when ' + f'`--ul2-model-type` is not ' + f'"{UL2ModelType.ENCODER_DECODER.value}"!' + ) if args.use_bnb_optimizer: try: @@ -1032,6 +1044,42 @@ def _add_vit_args(parser): return parser +def _add_ul2_args(parser): + group = parser.add_argument_group(title="UL2") + + group.add_argument('--ul2-model-type', type=str, default='ED', + choices=['ED', 'ND', 'CD'], + help='What type of model to use for UL2 pretraining. ' + 'ED = encoder-decoder; ND = non-causal decoder-only; ' + 'CD = causal decoder-only') + group.add_argument('--ul2-denoiser-ratios', nargs='+', type=float, + default=None, + help='Probability of each denoising objective to be ' + 'selected. Uniform distribution by default.') + group.add_argument('--ul2-denoisers', nargs='+', type=str, + default=['R', 'R', 'S', 'X', 'X', 'X', 'X'], + choices=['R', 'S', 'X'], + help='What type of UL2 denoising objective the other ' + 'UL2 configurations refer to.') + group.add_argument('--ul2-mean-span-lengths', nargs='+', type=float, + default=[3, 8, 0.25, 3, 8, 64, 64], + help='Mean length for sampling span lengths. 
' + 'Numbers < 1 indicate a mean length of the sequence ' + 'length times that number.') + group.add_argument('--ul2-mask-ratios', nargs='+', type=float, + default=[0.15, 0.15, 0.25, 0.5, 0.5, 0.15, 0.5], + help='Ratio of masked token in the full sequence.') + group.add_argument('--ul2-r-denoiser-token', type=str, default='[R]', + help='What token to prepend for the UL2 R-denoising ' + 'objective.') + group.add_argument('--ul2-s-denoiser-token', type=str, default='[S]', + help='What token to prepend for the UL2 S-denoising ' + 'objective.') + group.add_argument('--ul2-x-denoiser-token', type=str, default='[X]', + help='What token to prepend for the UL2 X-denoising ' + 'objective.') + + return parser def _add_zero_args(parser): """Text generate arguments.""" diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 3841e263e..60d4e0d90 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -18,6 +18,7 @@ # https://github.com/google-research/albert/blob/master/create_pretraining_data.py # with some modifications. +from enum import Enum import math import os import time @@ -37,8 +38,16 @@ DSET_TYPE_BERT = 'standard_bert' DSET_TYPE_ICT = 'ict' DSET_TYPE_T5 = 't5' +DSET_TYPE_UL2 = 'ul2' -DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] +DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5, DSET_TYPE_UL2] + + +class SamplingStyle(Enum): + POISSON = 'poisson' + GEOMETRIC = 'geometric' + UNIFORM = 'uniform' + NORMAL = 'normal' def analyze_data_prefix(data_prefix): @@ -194,9 +203,16 @@ def create_masked_lm_predictions(tokens, favor_longer_ngram=False, do_permutation=False, geometric_dist=False, - masking_style="bert"): + masking_style="bert", + sampling_style=SamplingStyle.POISSON, + prefix_lm=False): """Creates the predictions for the masked LM objective. Note: Tokens here are vocab ids and not text tokens.""" + if not isinstance(sampling_style, SamplingStyle): + sampling_style = SamplingStyle(sampling_style) + # Backward-compatibility + if geometric_dist: + sampling_style = SamplingStyle.GEOMETRIC cand_indexes = [] # Note(mingdachen): We create a list for recording if the piece is @@ -235,18 +251,24 @@ def create_masked_lm_predictions(tokens, max(1, int(round(len(tokens) * masked_lm_prob)))) ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) - if not geometric_dist: + if sampling_style is SamplingStyle.POISSON: # Note(mingdachen): # By default, we set the probilities to favor shorter ngram sequences. pvals = 1. / np.arange(1, max_ngrams + 1) pvals /= pvals.sum(keepdims=True) if favor_longer_ngram: pvals = pvals[::-1] + elif sampling_style is SamplingStyle.NORMAL: + normal_mean = (max_ngrams + 1) / 2 ngram_indexes = [] for idx in range(len(cand_indexes)): ngram_index = [] for n in ngrams: + if prefix_lm: + last_cand_index_index = min(idx + n - 1, len(cand_indexes) - 1) + if cand_indexes[last_cand_index_index][-1] < len(tokens) - 1: + continue ngram_index.append(cand_indexes[idx:idx + n]) ngram_indexes.append(ngram_index) @@ -266,15 +288,25 @@ def create_masked_lm_predictions(tokens, if index in covered_indexes: continue - if not geometric_dist: + if sampling_style is SamplingStyle.POISSON: n = np_rng.choice(ngrams[:len(cand_index_set)], p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) - else: + elif sampling_style is SamplingStyle.GEOMETRIC: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. 
Using p=0.2 default from the SpanBERT paper # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) n = min(np_rng.geometric(0.2), max_ngrams) + elif sampling_style is SamplingStyle.UNIFORM: + n = np_rng.choice(ngrams[:len(cand_index_set)]) + elif sampling_style is SamplingStyle.NORMAL: + n = round(np.clip( + np_rng.normal(loc=normal_mean), + 1, + len(cand_index_set), + )) + else: + raise ValueError('unknown sampling style') index_set = sum(cand_index_set[n - 1], []) n -= 1 @@ -522,6 +554,7 @@ def build_dataset(index, name): from megatron.data.bert_dataset import BertDataset from megatron.data.ict_dataset import ICTDataset from megatron.data.t5_dataset import T5Dataset + from megatron.data.ul2_dataset import UL2Dataset dataset = None if splits[index + 1] > splits[index]: # Get the pointer to the original doc-idx so we can set it later. @@ -560,6 +593,24 @@ def build_dataset(index, name): short_seq_prob=short_seq_prob, **kwargs ) + elif dataset_type == DSET_TYPE_UL2: + args = get_args() + dataset = UL2Dataset( + indexed_dataset=indexed_dataset, + model_type=args.ul2_model_type, + denoiser_ratios=args.ul2_denoiser_ratios, + denoisers=args.ul2_denoisers, + mean_span_lengths=args.ul2_mean_span_lengths, + mask_ratios=args.ul2_mask_ratios, + denoiser_tokens={ + 'R': args.ul2_r_denoiser_token, + 'S': args.ul2_s_denoiser_token, + 'X': args.ul2_x_denoiser_token, + }, + max_seq_length_dec=max_seq_length_dec, + short_seq_prob=short_seq_prob, + **kwargs, + ) elif dataset_type == DSET_TYPE_BERT: dataset = BertDataset( indexed_dataset=indexed_dataset, diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 42110b923..af3374da6 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -26,6 +26,27 @@ get_samples_mapping ) + +class LengthExceededError(ValueError): + def __init__(self, msg=None): + if msg is None: + msg = ( + 'The sequence input became too long. ' + 'Try to increase `--seq-length` or `--encoder-seq-length`.' + ) + super().__init__(msg) + + +class DecoderLengthExceededError(ValueError): + def __init__(self, msg=None): + if msg is None: + msg = ( + 'The sequence input for the decoder became too long. ' + 'Try to increase `--decoder-seq-length`.' 
+ ) + super().__init__(msg) + + class T5Dataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, @@ -157,38 +178,41 @@ def build_training_sample(sample, target_seq_length, return train_sample -def pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, - max_seq_length, max_seq_length_dec, - masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): - """Pad sequences and convert them to numpy.""" - +def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None): sentinel_tokens = collections.deque(sentinel_tokens) t5_input = [] (t5_decoder_in, t5_decoder_out) = ([bos_id], []) (start_index, end_index) = (0, None) for span in masked_spans: flag = sentinel_tokens.popleft() - # Append the same tokens in decoder input and output t5_decoder_in.append(flag) t5_decoder_in.extend(span.label) t5_decoder_out.append(flag) t5_decoder_out.extend(span.label) - end_index = span.index[0] t5_input.extend(tokens[start_index: end_index]) t5_input.append(flag) - # the next start index is the token after the last span token start_index = span.index[-1] + 1 - # Add token to the t5_decoder_out t5_decoder_out.append(eos_id) # Add the remaining tokens to the t5 input t5_input.extend(tokens[start_index:]) + return t5_input, t5_decoder_in, t5_decoder_out + + +def pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, + max_seq_length, max_seq_length_dec, + masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None): + """Pad sequences and convert them to numpy.""" + + t5_input, t5_decoder_in, t5_decoder_out = merge_subsequent_masks( + tokens, masked_spans, bos_id, eos_id, sentinel_tokens) # assert (len(t5_input) - len(masked_spans)) + \ # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) @@ -198,7 +222,8 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Encoder-side padding mask. num_tokens = len(t5_input) padding_length = max_seq_length - num_tokens - assert padding_length >= 0 + if padding_length < 0: + raise LengthExceededError() assert len(masked_positions) == len(masked_labels) # Tokens.. @@ -208,7 +233,8 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Decoder-side padding mask. num_tokens_dec = len(t5_decoder_in) padding_length_dec = max_seq_length_dec - num_tokens_dec - assert padding_length_dec >= 0 + if padding_length_dec < 0: + raise DecoderLengthExceededError() filler_dec = [pad_id] * padding_length_dec tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py new file mode 100644 index 000000000..4f80c973b --- /dev/null +++ b/megatron/data/ul2_dataset.py @@ -0,0 +1,281 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
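To make the span-merging step in the t5_dataset.py hunk above concrete, the following is a minimal, self-contained sketch of the same logic with made-up token ids and sentinel values (the real merge_subsequent_masks consumes the span namedtuples produced by create_masked_lm_predictions):

import collections

def merge_spans(tokens, spans, bos_id, eos_id, sentinels):
    # spans: list of (indices, labels) pairs in left-to-right order.
    sentinels = collections.deque(sentinels)
    enc_in, dec_in, dec_out = [], [bos_id], []
    start = 0
    for indices, labels in spans:
        flag = sentinels.popleft()
        dec_in += [flag] + labels                    # decoder input repeats the span
        dec_out += [flag] + labels                   # decoder target repeats it as well
        enc_in += tokens[start:indices[0]] + [flag]  # span replaced by one sentinel
        start = indices[-1] + 1
    dec_out.append(eos_id)
    enc_in += tokens[start:]                         # keep the unmasked tail
    return enc_in, dec_in, dec_out

tokens = [11, 12, 13, 14, 15, 16]
spans = [([1, 2], [12, 13]), ([4], [15])]            # mask positions 1-2 and 4
print(merge_spans(tokens, spans, bos_id=0, eos_id=1, sentinels=[900, 901]))
# ([11, 900, 14, 901, 16], [0, 900, 12, 13, 901, 15], [900, 12, 13, 901, 15, 1])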
+ +"""UL2-style dataset.""" + +import math + +import numpy as np + +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping, + SamplingStyle +) +from megatron.data.t5_dataset import ( + LengthExceededError, + make_history_mask, + merge_subsequent_masks, + pad_and_convert_to_numpy, + T5Dataset, +) +from megatron.enums import UL2ModelType + + +def is_decoder_only(ul2_model_type): + """Return whether we use a decoder-only model.""" + assert isinstance(ul2_model_type, UL2ModelType) + return ul2_model_type is not UL2ModelType.ENCODER_DECODER + + +def is_prefix_lm(ul2_model_type): + """Return whether we use a non-causal decoder-only model.""" + assert isinstance(ul2_model_type, UL2ModelType) + return ul2_model_type is UL2ModelType.NON_CAUSAL_DECODER + + +class UL2Dataset(T5Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, model_type, + denoiser_ratios, denoisers, mean_span_lengths, + mask_ratios, denoiser_tokens, max_seq_length, + max_seq_length_dec, short_seq_prob, seed): + + if denoiser_ratios is None: + # Uniform distribution by default. + denoiser_ratios = [1 / len(denoisers)] * len(denoisers) + + assert ( + len(denoiser_ratios) == len(denoisers) + == len(mean_span_lengths) == len(mask_ratios) + ), ( + 'some UL2 configurations do not correspond to the amount of ' + 'denoising objectives' + ) + + super().__init__(name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, None, + max_seq_length, max_seq_length_dec, + short_seq_prob, seed) + + # Params to store. + self.model_type = model_type + self.denoiser_ratios = [ + denoiser_ratio / sum(denoiser_ratios) + for denoiser_ratio in denoiser_ratios + ] + self.denoisers = [denoiser.upper() for denoiser in denoisers] + self.mean_span_lengths = mean_span_lengths + self.mask_ratios = mask_ratios + + # Vocab stuff. + tokenizer = get_tokenizer() + # Remove CLS token because we don't need it. + del self.cls_id + self.cls_ids = { + denoiser: tokenizer.vocab[token] + for (denoiser, token) in denoiser_tokens.items() + } + # cls_token = self.vocab_id_to_token_dict[tokenizer.cls] + # if cls_token not in self.cls_ids: + # self.cls_ids[cls_token] = tokenizer.cls + + # Filter out denoiser tokens. + self.sentinel_tokens = [ + token + for token in tokenizer.additional_special_tokens_ids + if token not in self.cls_ids.values() + ] + assert len(self.sentinel_tokens) > 0, \ + "Provide the argument --vocab-extra-ids 100 to the script" + + def __getitem__(self, idx): + + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. 
+ np_rng = np.random.RandomState(seed=(self.seed + idx)) + return build_training_sample(sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, + self.vocab_id_list, + self.vocab_id_to_token_dict, + self.cls_ids, self.sep_id, + self.mask_id, self.pad_id, + self.model_type, self.denoiser_ratios, + self.denoisers, self.mean_span_lengths, + self.mask_ratios, np_rng, self.bos_id, + self.eos_id, self.sentinel_tokens) + + +def build_training_sample(sample, target_seq_length, + max_seq_length, max_seq_length_dec, + vocab_id_list, vocab_id_to_token_dict, + cls_ids, sep_id, mask_id, pad_id, + model_type, denoiser_ratios, + denoisers, mean_span_lengths, + mask_ratios, np_rng, + bos_id=None, eos_id=None, + sentinel_tokens=None): + """Build training sample. + + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + target_seq_length: Desired sequence length. + max_seq_length: Maximum length of the sequence. All values are padded to + this length. + vocab_id_list: List of vocabulary ids. Used to pick a random id. + vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. + cls_ids: Start of example ids. + sep_id: Separator id. + mask_id: Mask token id. + pad_id: Padding token id. + model_type: What type of model is used. + denoiser_ratios: Probability of each denoising objective to be selected. + denoisers: What type of UL2 denoising objective the other UL2 + configurations refer to. + mean_span_lengths: Mean length for sampling span lengths. Numbers < 1 + indicate a mean length of the sequence length times that number. + mask_ratios: Ratio of masked token in the full sequence. + np_rng: Random number genenrator. Note that this rng state should be + numpy and not python since python randint is inclusive for + the opper bound whereas the numpy one is exclusive. + bos_id: start of decoder example id + eos_id: end of generation id + sentinel_tokens: unique value to be substituted for every replaced span + """ + + # Denoiser selection + denoiser_index = np_rng.choice(np.arange(len(denoisers)), p=denoiser_ratios) + denoiser = denoisers[denoiser_index] + masked_lm_prob = mask_ratios[denoiser_index] + + assert target_seq_length <= max_seq_length + + # flatten sentences into one list + tokens = [token for sentence in sample for token in sentence] + + max_num_tokens = target_seq_length + # if is_decoder_only(model_type): + # # Keep space for repeated `extra_id` tokens; not the most data + # # efficient since we calculate this based on the maximum number + # # of possible `extra_id` tokens. + # safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + # truncated = len(tokens) > safe_max_seq_len + # tokens = tokens[:safe_max_seq_len] + # else: + # Truncate to `target_sequence_length`. + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] + + # Prepend objective token. + cls_id = cls_ids.get(denoiser) + if cls_id is None: + raise ValueError('unknown denoiser') + tokens = [cls_id] + tokens + + # Masking. 
+ max_predictions_per_seq = masked_lm_prob * len(tokens) + mean_ngrams = mean_span_lengths[denoiser_index] + if mean_ngrams < 1: + mean_ngrams = round(len(tokens) * mean_ngrams) + max_ngrams = mean_ngrams * 2 - 1 + + if denoiser == 'R' or denoiser == 'X': + sampling_style = SamplingStyle.NORMAL + prefix_lm = False + elif denoiser == 'S': + sampling_style = SamplingStyle.UNIFORM + prefix_lm = True + else: + raise ValueError('unknown denoiser') + ( + tokens, masked_positions, masked_labels, _, masked_spans, + ) = create_masked_lm_predictions( + tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, + cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, + max_ngrams=max_ngrams, masking_style="t5", + sampling_style=sampling_style, prefix_lm=prefix_lm, + ) + + if is_decoder_only(model_type): + # Concatenate to one sequence. + tokens_enc, tokens_dec_in, labels = merge_subsequent_masks( + tokens, masked_spans, bos_id, eos_id, sentinel_tokens) + + # Move EOS tokens to end of sequence. + while tokens_enc[-1] == eos_id: + del tokens_enc[-1] + tokens_dec_in.append(eos_id) + labels.append(eos_id) + + num_labels = len(labels) + + # Move BOS token to start of sequence. + tokens_dec_in = tokens_dec_in[1:] + tokens = np.array(( + [bos_id] + + tokens_enc + + [sep_id] + + tokens_dec_in + ), dtype=np.int64) + labels = np.array(( + tokens_enc + + [sep_id] + + labels + ), dtype=np.int64) + + if max_seq_length - len(tokens) < 0: + raise LengthExceededError() + + loss_mask = np.zeros(len(tokens), dtype=np.int64) + loss_mask[-num_labels:] = 1 + + dec_mask = make_history_mask(tokens) + if is_prefix_lm(model_type): + dec_mask[:-num_labels, :-num_labels] = 1 + + train_sample = { + 'text': tokens, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'dec_mask': dec_mask, + } + else: + # Padding. 
+ tokens_enc, tokens_dec_in, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask \ + = pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, max_seq_length, + max_seq_length_dec, masked_spans, + bos_id, eos_id, sentinel_tokens) + + train_sample = { + 'text_enc': tokens_enc, + 'text_dec': tokens_dec_in, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'enc_mask': enc_mask, + 'dec_mask': dec_mask, + 'enc_dec_mask': enc_dec_mask, + } + return train_sample diff --git a/megatron/enums.py b/megatron/enums.py index 90d00a071..2961cbb66 100644 --- a/megatron/enums.py +++ b/megatron/enums.py @@ -33,3 +33,8 @@ class PositionEmbeddingType(enum.Enum): rotary = 1 absolute = 2 alibi = 3 + +class UL2ModelType(enum.Enum): + ENCODER_DECODER = 'ED' + NON_CAUSAL_DECODER = 'ND' + CAUSAL_DECODER = 'CD' diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 57c6c4769..ccdfbaaef 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -214,7 +214,7 @@ def forward_torch_softmax(self, input, mask): if self.scale is not None: input = input * self.scale - if self.attn_mask_type == AttnMaskType.causal: + if self.attn_mask_type == AttnMaskType.causal and mask is None: assert mask is None assert input.shape[2] == input.shape[3] mask = self.get_causal_mask(input.shape[2]) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 09304b1dd..c0356a12c 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -388,6 +388,18 @@ def eos(self): candidate = self.tokenizer.eos_token_id return self._check_token_candidate(candidate) + @property + def bos_token_id(self): + """Id of the beginning of sentence token in the vocabulary.""" + candidate = self.tokenizer.bos_token_id + return self._check_token_candidate(candidate) + + @property + def eos_token_id(self): + """Id of the end of sentence token in the vocabulary.""" + candidate = self.tokenizer.eos_token_id + return self._check_token_candidate(candidate) + @property def additional_special_tokens_ids(self): """ All the additional special tokens you may want to use (list of strings).""" diff --git a/pretrain_ul2.py b/pretrain_ul2.py new file mode 100644 index 000000000..cab24ced0 --- /dev/null +++ b/pretrain_ul2.py @@ -0,0 +1,185 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
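The decoder-only branch above packs encoder and decoder tokens into one sequence and then widens the causal mask over the prefix when a non-causal (prefix-LM) model type is selected. Below is a small sketch of that mask construction, assuming make_history_mask returns a lower-triangular 0/1 matrix as in megatron/data/t5_dataset.py; the sequence length and label count are made up:

import numpy as np

def make_history_mask(tokens):
    # Lower-triangular causal mask: position i may attend to positions <= i.
    length = len(tokens)
    arange = np.arange(length)
    return (arange[None, :] <= arange[:, None]).astype(np.int64)

tokens = np.arange(8)        # hypothetical packed sequence of length 8
num_labels = 3               # the last 3 positions carry the denoised targets
dec_mask = make_history_mask(tokens)
dec_mask[:-num_labels, :-num_labels] = 1   # prefix attends bidirectionally
print(dec_mask)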
+ +"""Pretrain UL2""" + +from functools import partial + +import torch + +from megatron import ( + get_args, + get_timers, + mpu, + print_rank_0 +) +from megatron.data.dataset_utils import build_train_valid_test_datasets +from megatron.data.ul2_dataset import ( + is_decoder_only as _is_decoder_only, + is_prefix_lm as _is_prefix_lm, +) +from megatron.model.gpt_model import GPTModel +from megatron.model.t5_model import T5Model, t5_position_ids +from megatron.training import pretrain +from megatron.utils import average_losses_across_data_parallel_group + + +def is_decoder_only(): + """Return whether we use a decoder-only model.""" + args = get_args() + return _is_decoder_only(args.ul2_model_type) + + +def is_prefix_lm(): + """Return whether we use a non-causal decoder-only model.""" + args = get_args() + return _is_prefix_lm(args.ul2_model_type) + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + assert pre_process and post_process, "UL2 doesn't yet support pipelining" + + print_rank_0('building UL2 model ...') + if is_decoder_only(): + print_rank_0('Using decoder-only UL2 model.') + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=is_prefix_lm(), + ) + else: + print_rank_0('Using encoder-decoder UL2 model.') + model = T5Model(num_tokentypes=0, parallel_output=True) + return model + + +def get_batch(data_iterator): + """Build the batch.""" + + if is_decoder_only(): + keys = ['text', 'labels', 'loss_mask', 'dec_mask'] + else: + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', + 'enc_mask', 'dec_mask', 'enc_dec_mask'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + if is_decoder_only(): + tokens = data_b['text'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + dec_mask = (data_b['dec_mask'] < 0.5) + return tokens, loss_mask, labels, dec_mask + else: + tokens_enc = data_b['text_enc'].long() + tokens_dec = data_b['text_dec'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + enc_mask = (data_b['enc_mask'] < 0.5) + dec_mask = (data_b['dec_mask'] < 0.5) + enc_dec_mask = (data_b['enc_dec_mask'] < 0.5) + + return tokens_enc, tokens_dec, loss_mask, labels, \ + enc_mask, dec_mask, enc_dec_mask + + +def loss_func(loss_mask, output_tensor): + if is_decoder_only(): + lm_loss_ = output_tensor + else: + lm_loss_, _ = output_tensor + + lm_loss_ = lm_loss_.float() + lm_loss = torch.sum( + lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum() + + loss = lm_loss + averaged_losses = average_losses_across_data_parallel_group([lm_loss]) + + return loss, {'lm loss': averaged_losses[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. 
+ timers('batch generator').start() + if is_decoder_only(): + (tokens, loss_mask, lm_labels, dec_mask) = get_batch(data_iterator) + else: + ( + tokens_enc, tokens_dec, loss_mask, lm_labels, + enc_mask, dec_mask, enc_dec_mask, + ) = get_batch(data_iterator) + timers('batch generator').stop() + + # Forward model lm_labels + if is_decoder_only(): + position_ids = t5_position_ids(tokens) + output_tensor = model(tokens, position_ids, dec_mask, + labels=lm_labels) + else: + output_tensor = model(tokens_enc, + tokens_dec, + enc_mask, + dec_mask, + enc_dec_mask, + tokentype_ids=None, + lm_labels=lm_labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for UL2 ...') + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + dataset_type='ul2') + print_rank_0("> finished creating UL2 datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) From 9328ad2cac3feb83d39dfc6d55b3ffeb9684c86b Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Thu, 29 Dec 2022 14:29:07 +0400 Subject: [PATCH 25/37] DeepSpeed compat --- megatron/data/ul2_dataset.py | 39 +++++++-- megatron/tokenizer/tokenizer.py | 60 ++++++++++++- pretrain_ul2.py | 148 ++++++++++++++++++++++++++------ 3 files changed, 213 insertions(+), 34 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 4f80c973b..512086be3 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -67,10 +67,39 @@ def __init__(self, name, indexed_dataset, data_prefix, 'denoising objectives' ) - super().__init__(name, indexed_dataset, data_prefix, - num_epochs, max_num_samples, None, - max_seq_length, max_seq_length_dec, - short_seq_prob, seed) + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = short_seq_prob + self.max_seq_length = max_seq_length + self.max_seq_length_dec = max_seq_length_dec + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + + # Vocab stuff. + tokenizer = get_tokenizer() + self.vocab_id_list = list(tokenizer.inv_vocab.keys()) + self.vocab_id_to_token_dict = tokenizer.inv_vocab + # self.cls_id = tokenizer.cls + self.sep_id = tokenizer.sep + self.mask_id = tokenizer.mask + self.pad_id = tokenizer.pad + self.bos_id = tokenizer.bos_token_id + self.eos_id = tokenizer.eos_token_id + self.sentinel_tokens = tokenizer.additional_special_tokens_ids + assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" # Params to store. 
self.model_type = model_type @@ -85,7 +114,7 @@ def __init__(self, name, indexed_dataset, data_prefix, # Vocab stuff. tokenizer = get_tokenizer() # Remove CLS token because we don't need it. - del self.cls_id + # del self.cls_id self.cls_ids = { denoiser: tokenizer.vocab[token] for (denoiser, token) in denoiser_tokens.items() diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index c0356a12c..7d0340ca2 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -40,7 +40,9 @@ def build_tokenizer(args): vocab_extra_ids=args.vocab_extra_ids) elif args.tokenizer_type == 'GPT2BPETokenizer': assert args.merge_file is not None - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) + tokenizer = _GPT2BPETokenizer( + args.vocab_file, args.merge_file, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == "PretrainedFromHF": assert args.tokenizer_name_or_path is not None @@ -286,13 +288,36 @@ def additional_special_tokens(self, value): class _GPT2BPETokenizer(AbstractTokenizer): """Original GPT2 BPE tokenizer.""" - def __init__(self, vocab_file, merge_file): + def __init__(self, vocab_file, merge_file, vocab_extra_ids=0): name = 'GPT2 BPE' super().__init__(name) self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', special_tokens=[], max_len=None) - self.eod_id = self.tokenizer.encoder['<|endoftext|>'] + self.eod_id = self.eos_token_id = self.tokenizer.encoder['<|endoftext|>'] + + self.bod_id = self.bos_token_id = self.tokenizer.encoder['[EOS]'] + self.sep_id = self.tokenizer.encoder['[SEP]'] + self.mask_id = self.tokenizer.encoder['[MASK]'] + self.pad_id = self.tokenizer.encoder['[PAD]'] + + additional_special_tokens = [] + self._additional_special_tokens = [] + additional_special_tokens.extend( + ["".format(i) for i in range(vocab_extra_ids)]) + self.add_additional_special_tokens(additional_special_tokens) + + def add_additional_special_tokens(self, tokens_list): + setattr(self, "additional_special_tokens", tokens_list) + for value in tokens_list: + self.add_token(value) + + def add_token(self, token): + if token not in self.vocab: + self.inv_vocab[self.vocab_size] = token + # self.vocab_size comes from len(vocab) + # and it will increase as we add elements + self.vocab[token] = self.vocab_size @property def vocab_size(self): @@ -316,6 +341,35 @@ def detokenize(self, token_ids): def eod(self): return self.eod_id + @property + def bod(self): + return self.bod_id + + @property + def sep(self): + return self.sep_id + + @property + def mask(self): + return self.mask_id + + @property + def pad(self): + return self.pad_id + + @property + def additional_special_tokens(self): + """ All the additional special tokens you may want to use (list of strings).""" + return self._additional_special_tokens + + @property + def additional_special_tokens_ids(self): + """ Ids of all the additional special tokens in the vocabulary (list of integers).""" + return [self.vocab.get(token) for token in self._additional_special_tokens] + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value class _AutoTokenizer(AbstractTokenizer): """AutoTokenizer for Hf Pretrained model loading.""" diff --git a/pretrain_ul2.py b/pretrain_ul2.py index cab24ced0..f942566f3 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -17,6 +17,7 @@ from functools import partial +import deepspeed import torch from megatron import ( @@ -26,11 +27,13 @@ print_rank_0 ) from 
megatron.data.dataset_utils import build_train_valid_test_datasets +from megatron.data.gpt_dataset import build_dataset_group from megatron.data.ul2_dataset import ( is_decoder_only as _is_decoder_only, is_prefix_lm as _is_prefix_lm, ) -from megatron.model.gpt_model import GPTModel +from megatron.enums import AttnMaskType +from megatron.model.gpt_model import GPTModel, GPTModelPipe from megatron.model.t5_model import T5Model, t5_position_ids from megatron.training import pretrain from megatron.utils import average_losses_across_data_parallel_group @@ -51,21 +54,77 @@ def is_prefix_lm(): def model_provider(pre_process=True, post_process=True): """Build the model.""" assert pre_process and post_process, "UL2 doesn't yet support pipelining" - + print_rank_0('building UL2 model ...') + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed and is_decoder_only(): + args.pretrain_causal_attention = True + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=AttnMaskType.causal + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + elif is_decoder_only(): + print_rank_0('Using decoder-only UL2 model.') + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=is_prefix_lm(), + ) + else: + print_rank_0('Using encoder-decoder UL2 model.') + model = T5Model(num_tokentypes=0, parallel_output=True) + return model + +from megatron.global_vars import get_tokenizer +def visualize_model_inputs(tokens, attention_mask, labels, loss_mask): + tok = get_tokenizer() + print("TOKENS:", ",".join([tok.detokenize(tokens[0, i]) for i in range(100)])) + print("ATTN:", attention_mask[0, :, :100, :100]) + print("LABS:", labels[0, :100]) + print("LOSSMSK:", loss_mask[:100]) + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" if is_decoder_only(): - print_rank_0('Using decoder-only UL2 model.') - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - prefix_lm=is_prefix_lm(), - ) + keys = ['text', 'labels', 'loss_mask', 'dec_mask'] else: - print_rank_0('Using encoder-decoder UL2 model.') - model = T5Model(num_tokentypes=0, parallel_output=True) - return model + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', + 'enc_mask', 'dec_mask', 'enc_dec_mask'] + datatype = torch.int64 + + data_b = mpu.broadcast_data(keys, data, datatype) + + + print( + visualize_model_inputs( + data_b['text'], + data_b['dec_mask'], + data_b['labels'], + data_b['loss_mask'], + ) + ) + + tokens = data_b['text'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + dec_mask = (data_b['dec_mask'] < 0.5) + + position_ids = t5_position_ids(tokens) + + return (tokens, position_ids, dec_mask), (labels, loss_mask) def get_batch(data_iterator): @@ -85,6 +144,15 @@ def get_batch(data_iterator): data = None data_b = mpu.broadcast_data(keys, data, datatype) + print( + visualize_model_inputs( + data_b['text'], + data_b['dec_mask'], + data_b['labels'], + data_b['loss_mask'], + ) + ) + # Unpack. 
if is_decoder_only(): tokens = data_b['text'].long() @@ -162,19 +230,47 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): print_rank_0('> building train, validation, and test datasets ' 'for UL2 ...') - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.encoder_seq_length, - max_seq_length_dec=args.decoder_seq_length, - masked_lm_prob=args.mask_prob, - short_seq_prob=args.short_seq_prob, - seed=args.seed, - skip_warmup=(not args.mmap_warmup), - dataset_type='ul2') - print_rank_0("> finished creating UL2 datasets ...") + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + dataset_type='ul2') + print_rank_0("> finished creating UL2 datasets ...") + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") return train_ds, valid_ds, test_ds From 351f4f24465c974ab7c22dbc661d460006f25b01 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Fri, 30 Dec 2022 20:18:24 +0400 Subject: [PATCH 26/37] DS Group compat --- megatron/data/gpt_dataset.py | 37 ++++++++++++++++++++++++++++++------ megatron/data/t5_dataset.py | 4 ++++ megatron/data/ul2_dataset.py | 24 +++++++++++++++-------- pretrain_gpt.py | 1 - pretrain_ul2.py | 29 ++++++++++++++++------------ 5 files changed, 68 insertions(+), 27 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 0db1aa2fe..534d3bc3c 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -21,12 +21,12 @@ import numpy as np import torch -from megatron import mpu, print_rank_0 +from megatron import get_args, mpu, print_rank_0 from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - +from megatron.data.ul2_dataset import UL2Dataset def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, @@ -154,10 +154,35 @@ def build_dataset(name): if splits[1] > 
splits[0]: documents = np.arange(start=splits[0], stop=splits[1], step=1, dtype=np.int32) - dataset = GPTDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) + + args = get_args() + if args.ul2_model_type: + dataset = UL2Dataset( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=train_valid_test_num_samples[index], + max_seq_length=seq_length, + seed=seed, + indexed_dataset=indexed_dataset, + model_type=args.ul2_model_type, + denoiser_ratios=args.ul2_denoiser_ratios, + denoisers=args.ul2_denoisers, + mean_span_lengths=args.ul2_mean_span_lengths, + mask_ratios=args.ul2_mask_ratios, + denoiser_tokens={ + 'R': args.ul2_r_denoiser_token, + 'S': args.ul2_s_denoiser_token, + 'X': args.ul2_x_denoiser_token, + }, + max_seq_length_dec=seq_length, + short_seq_prob=args.short_seq_prob, + ) + else: + dataset = GPTDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) return dataset dataset = build_dataset(dataset_group_name) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index af3374da6..be52206ec 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -186,16 +186,20 @@ def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, (start_index, end_index) = (0, None) for span in masked_spans: flag = sentinel_tokens.popleft() + # Append the same tokens in decoder input and output t5_decoder_in.append(flag) t5_decoder_in.extend(span.label) t5_decoder_out.append(flag) t5_decoder_out.extend(span.label) + end_index = span.index[0] t5_input.extend(tokens[start_index: end_index]) t5_input.append(flag) + # the next start index is the token after the last span token start_index = span.index[-1] + 1 + # Add token to the t5_decoder_out t5_decoder_out.append(eos_id) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 512086be3..6c196da2f 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -202,17 +202,17 @@ def build_training_sample(sample, target_seq_length, tokens = [token for sentence in sample for token in sentence] max_num_tokens = target_seq_length - # if is_decoder_only(model_type): + if is_decoder_only(model_type): # # Keep space for repeated `extra_id` tokens; not the most data # # efficient since we calculate this based on the maximum number # # of possible `extra_id` tokens. - # safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) - # truncated = len(tokens) > safe_max_seq_len - # tokens = tokens[:safe_max_seq_len] - # else: + safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + truncated = len(tokens) > safe_max_seq_len + tokens = tokens[:safe_max_seq_len] + else: # Truncate to `target_sequence_length`. - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] # Prepend objective token. 
cls_id = cls_ids.get(denoiser) @@ -277,9 +277,16 @@ def build_training_sample(sample, target_seq_length, loss_mask = np.zeros(len(tokens), dtype=np.int64) loss_mask[-num_labels:] = 1 + padding = [pad_id] * (max_seq_length - len(tokens)) + tokens = np.concatenate((tokens, padding), axis=0) + labels = np.concatenate((labels, padding), axis=0) + loss_mask = np.concatenate((loss_mask, np.zeros(len(padding), dtype=np.int64)), axis=0) + dec_mask = make_history_mask(tokens) if is_prefix_lm(model_type): - dec_mask[:-num_labels, :-num_labels] = 1 + dec_mask[ + :-num_labels-len(padding), :-num_labels-len(padding) + ] = 1 train_sample = { 'text': tokens, @@ -288,6 +295,7 @@ def build_training_sample(sample, target_seq_length, 'truncated': int(truncated), 'dec_mask': dec_mask, } + else: # Padding. tokens_enc, tokens_dec_in, labels, enc_mask, \ diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 2d0691f4e..5f8b104fc 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -228,7 +228,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): @record def main(): - torch.multiprocessing.set_start_method('spawn') pretrain(train_valid_test_datasets_provider, model_provider, forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) diff --git a/pretrain_ul2.py b/pretrain_ul2.py index f942566f3..1342581e6 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -89,10 +89,13 @@ def model_provider(pre_process=True, post_process=True): from megatron.global_vars import get_tokenizer def visualize_model_inputs(tokens, attention_mask, labels, loss_mask): + print("SHAPES", tokens.shape, attention_mask.shape, labels.shape, loss_mask.shape) tok = get_tokenizer() - print("TOKENS:", ",".join([tok.detokenize(tokens[0, i]) for i in range(100)])) - print("ATTN:", attention_mask[0, :, :100, :100]) - print("LABS:", labels[0, :100]) + + print("TOKENS:", tok.detokenize(tokens[0, :].cpu().numpy().tolist())) + print("LABELS:", tok.detokenize(labels[0, :].cpu().numpy().tolist())) + + print("ATTN:", attention_mask[:100]) print("LOSSMSK:", loss_mask[:100]) def get_batch_pipe(data): @@ -107,14 +110,14 @@ def get_batch_pipe(data): data_b = mpu.broadcast_data(keys, data, datatype) - print( - visualize_model_inputs( - data_b['text'], - data_b['dec_mask'], - data_b['labels'], - data_b['loss_mask'], - ) - ) + # print( + # visualize_model_inputs( + # data_b['text'], + # data_b['dec_mask'], + # data_b['labels'], + # data_b['loss_mask'], + # ) + # ) tokens = data_b['text'].long() labels = data_b['labels'].long() @@ -227,6 +230,7 @@ def forward_step(data_iterator, model): def train_valid_test_datasets_provider(train_val_test_num_samples): """Build train, valid, and test datasets.""" args = get_args() + train_ds, valid_ds, test_ds = None, None, None print_rank_0('> building train, validation, and test datasets ' 'for UL2 ...') @@ -243,7 +247,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): seed=args.seed, skip_warmup=(not args.mmap_warmup), dataset_type='ul2') - print_rank_0("> finished creating UL2 datasets ...") elif args.train_weighted_split_paths: assigned_train_valid_test = [] if args.train_weighted_split_paths is not None: @@ -272,6 +275,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): else: raise NotImplementedError("No dataloading argument passed") + print_rank_0("> finished creating UL2 datasets ...") + return train_ds, valid_ds, test_ds From abc19b836bd7a3e03db0650e34a3440a7b492227 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Fri, 30 Dec 2022 22:35:03 +0400 
Subject: [PATCH 27/37] Adapt eval for denoiser --- tasks/eval_harness/evaluate.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 7e0cdb60e..51d107e4f 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -42,6 +42,8 @@ def __init__(self, model, tokenizer): self.tokenizer = tokenizer self.VOCAB_SIZE = tokenizer.vocab_size self.EOT_TOKEN_ID = tokenizer.eod + self.add_denoiser = args.add_denoiser + self.DENOISER_TOKEN_ID = tokenizer.tokenize("[S]")[0] self._max_length = args.seq_length @@ -80,9 +82,15 @@ def loglikelihood(self, requests): for context, continuation in requests: if context == "": # end of text as context - context_enc = [self.EOT_TOKEN_ID] + if self.add_denoiser: + context_enc = [self.DENOISER_TOKEN_ID] + [self.EOT_TOKEN_ID] + else: + context_enc = [self.EOT_TOKEN_ID] else: - context_enc = self.tokenizer_encode(context) + if self.add_denoiser: + context_enc = [self.DENOISER_TOKEN_ID] + self.tokenizer_encode(context) + else: + context_enc = self.tokenizer_encode(context) continuation_enc = self.tokenizer_encode(continuation) @@ -390,6 +398,7 @@ def tasks_args(parser): group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--add_denoiser', default = False, action='store_true', help='Whether to add a denoiser to the model') return parser from megatron.global_vars import _parse_args From 816c32d1c20623214ef5f1e9b4d6d3aa7f44c5ff Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 3 Jan 2023 23:14:32 +0400 Subject: [PATCH 28/37] Simpler padding --- megatron/data/dataset_utils.py | 11 +++++- megatron/data/t5_dataset.py | 26 ++++++++++----- megatron/data/ul2_dataset.py | 61 +++++++++++++++++----------------- pretrain_ul2.py | 2 ++ 4 files changed, 60 insertions(+), 40 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 60d4e0d90..2617f5927 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -249,6 +249,12 @@ def create_masked_lm_predictions(tokens, num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) + if prefix_lm: + # Adjust probabilities so that the mean is centered at the + # correct position. + # If we do not do this, the mean is at + # `len(tokens) * masked_lm_prob / 2`. + masked_lm_prob *= 2 ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) if sampling_style is SamplingStyle.POISSON: @@ -271,7 +277,10 @@ def create_masked_lm_predictions(tokens, continue ngram_index.append(cand_indexes[idx:idx + n]) ngram_indexes.append(ngram_index) - + if prefix_lm: + # No need to go further – we would only produce + # duplicate entries by continuing for this `idx`. + break np_rng.shuffle(ngram_indexes) (masked_lms, masked_spans) = ([], []) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index be52206ec..f3ac34bf0 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -125,6 +125,8 @@ def build_training_sample(sample, target_seq_length, target_seq_length: Desired sequence length. 
max_seq_length: Maximum length of the sequence. All values are padded to this length. + max_seq_length_dec: Maximum length of the decoder input sequence. All + values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. cls_id: Start of example id. @@ -179,23 +181,29 @@ def build_training_sample(sample, target_seq_length, def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): - sentinel_tokens = collections.deque(sentinel_tokens) + eos_id=None, sentinel_tokens=None, prefix_lm=False): + if prefix_lm: + assert len(masked_spans) <= 1, \ + 'Received more than one masked span for PrefixLM masking' + else: + sentinel_tokens = collections.deque(sentinel_tokens) t5_input = [] (t5_decoder_in, t5_decoder_out) = ([bos_id], []) (start_index, end_index) = (0, None) for span in masked_spans: - flag = sentinel_tokens.popleft() + if not prefix_lm: + flag = sentinel_tokens.popleft() + # Append the same tokens in decoder input and output + t5_decoder_in.append(flag) + t5_decoder_out.append(flag) - # Append the same tokens in decoder input and output - t5_decoder_in.append(flag) t5_decoder_in.extend(span.label) - t5_decoder_out.append(flag) t5_decoder_out.extend(span.label) end_index = span.index[0] t5_input.extend(tokens[start_index: end_index]) - t5_input.append(flag) + if not prefix_lm: + t5_input.append(flag) # the next start index is the token after the last span token start_index = span.index[-1] + 1 @@ -212,11 +220,11 @@ def pad_and_convert_to_numpy(tokens, masked_positions, masked_labels, pad_id, max_seq_length, max_seq_length_dec, masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): + eos_id=None, sentinel_tokens=None, prefix_lm=False): """Pad sequences and convert them to numpy.""" t5_input, t5_decoder_in, t5_decoder_out = merge_subsequent_masks( - tokens, masked_spans, bos_id, eos_id, sentinel_tokens) + tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm) # assert (len(t5_input) - len(masked_spans)) + \ # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 6c196da2f..adfc3b17a 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,14 +15,11 @@ """UL2-style dataset.""" -import math - import numpy as np from megatron import get_tokenizer from megatron.data.dataset_utils import ( create_masked_lm_predictions, - get_samples_mapping, SamplingStyle ) from megatron.data.t5_dataset import ( @@ -164,12 +161,13 @@ def build_training_sample(sample, target_seq_length, bos_id=None, eos_id=None, sentinel_tokens=None): """Build training sample. - Arguments: sample: A list of sentences in which each sentence is a list token ids. target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to this length. + max_seq_length_dec: Maximum length of the decoder input sequence. All + values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. cls_ids: Start of example ids. 
@@ -202,17 +200,17 @@ def build_training_sample(sample, target_seq_length, tokens = [token for sentence in sample for token in sentence] max_num_tokens = target_seq_length - if is_decoder_only(model_type): + # if is_decoder_only(model_type): # # Keep space for repeated `extra_id` tokens; not the most data # # efficient since we calculate this based on the maximum number # # of possible `extra_id` tokens. - safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) - truncated = len(tokens) > safe_max_seq_len - tokens = tokens[:safe_max_seq_len] - else: + # safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + # truncated = len(tokens) > safe_max_seq_len + # tokens = tokens[:safe_max_seq_len] + # else: # Truncate to `target_sequence_length`. - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] # Prepend objective token. cls_id = cls_ids.get(denoiser) @@ -221,10 +219,12 @@ def build_training_sample(sample, target_seq_length, tokens = [cls_id] + tokens # Masking. - max_predictions_per_seq = masked_lm_prob * len(tokens) + # Ensure we always have at least one prediction. + max_predictions_per_seq = max(1.0, masked_lm_prob * len(tokens)) mean_ngrams = mean_span_lengths[denoiser_index] if mean_ngrams < 1: - mean_ngrams = round(len(tokens) * mean_ngrams) + # Ensure we always obtain at least one `max_ngrams`. + mean_ngrams = max(1, round(len(tokens) * mean_ngrams)) max_ngrams = mean_ngrams * 2 - 1 if denoiser == 'R' or denoiser == 'X': @@ -247,7 +247,7 @@ def build_training_sample(sample, target_seq_length, if is_decoder_only(model_type): # Concatenate to one sequence. tokens_enc, tokens_dec_in, labels = merge_subsequent_masks( - tokens, masked_spans, bos_id, eos_id, sentinel_tokens) + tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm) # Move EOS tokens to end of sequence. while tokens_enc[-1] == eos_id: @@ -259,34 +259,35 @@ def build_training_sample(sample, target_seq_length, # Move BOS token to start of sequence. tokens_dec_in = tokens_dec_in[1:] - tokens = np.array(( + tokens = ( [bos_id] + tokens_enc + [sep_id] + tokens_dec_in - ), dtype=np.int64) + ) + + # Pad and convert to NumPy. + padding_length = max_seq_length - len(tokens) + if padding_length < 0: + raise LengthExceededError() + filler = [pad_id] * padding_length + + tokens = np.array(tokens + filler, dtype=np.int64) labels = np.array(( tokens_enc + [sep_id] + labels + + filler ), dtype=np.int64) - if max_seq_length - len(tokens) < 0: - raise LengthExceededError() - loss_mask = np.zeros(len(tokens), dtype=np.int64) - loss_mask[-num_labels:] = 1 - - padding = [pad_id] * (max_seq_length - len(tokens)) - tokens = np.concatenate((tokens, padding), axis=0) - labels = np.concatenate((labels, padding), axis=0) - loss_mask = np.concatenate((loss_mask, np.zeros(len(padding), dtype=np.int64)), axis=0) + labels_start_neg_index = -(num_labels + padding_length) + labels_end_neg_index = -padding_length if padding_length > 0 else None + loss_mask[labels_start_neg_index:labels_end_neg_index] = 1 dec_mask = make_history_mask(tokens) if is_prefix_lm(model_type): - dec_mask[ - :-num_labels-len(padding), :-num_labels-len(padding) - ] = 1 + dec_mask[:labels_start_neg_index, :labels_start_neg_index] = 1 train_sample = { 'text': tokens, @@ -295,7 +296,6 @@ def build_training_sample(sample, target_seq_length, 'truncated': int(truncated), 'dec_mask': dec_mask, } - else: # Padding. 
tokens_enc, tokens_dec_in, labels, enc_mask, \ @@ -303,7 +303,8 @@ def build_training_sample(sample, target_seq_length, = pad_and_convert_to_numpy(tokens, masked_positions, masked_labels, pad_id, max_seq_length, max_seq_length_dec, masked_spans, - bos_id, eos_id, sentinel_tokens) + bos_id, eos_id, sentinel_tokens, + prefix_lm) train_sample = { 'text_enc': tokens_enc, diff --git a/pretrain_ul2.py b/pretrain_ul2.py index 1342581e6..9b64b6daf 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -124,6 +124,7 @@ def get_batch_pipe(data): loss_mask = data_b['loss_mask'].float() dec_mask = (data_b['dec_mask'] < 0.5) + dec_mask = dec_mask.unsqueeze(1) position_ids = t5_position_ids(tokens) @@ -163,6 +164,7 @@ def get_batch(data_iterator): loss_mask = data_b['loss_mask'].float() dec_mask = (data_b['dec_mask'] < 0.5) + dec_mask = dec_mask.unsqueeze(1) return tokens, loss_mask, labels, dec_mask else: tokens_enc = data_b['text_enc'].long() From bdbd54a0de63e778bac28119b835aa381a7dad85 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 3 Jan 2023 23:28:41 +0400 Subject: [PATCH 29/37] Fix sampling --- megatron/data/ul2_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index adfc3b17a..dd96c00d2 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -20,6 +20,7 @@ from megatron import get_tokenizer from megatron.data.dataset_utils import ( create_masked_lm_predictions, + get_samples_mapping, SamplingStyle ) from megatron.data.t5_dataset import ( From cacf267c87b3f694dc85e340bd93d5adc6be0c57 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 3 Jan 2023 23:43:30 +0400 Subject: [PATCH 30/37] Switch padding --- megatron/data/ul2_dataset.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index dd96c00d2..065125923 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,6 +15,8 @@ """UL2-style dataset.""" +import math + import numpy as np from megatron import get_tokenizer @@ -201,17 +203,17 @@ def build_training_sample(sample, target_seq_length, tokens = [token for sentence in sample for token in sentence] max_num_tokens = target_seq_length - # if is_decoder_only(model_type): + if is_decoder_only(model_type): # # Keep space for repeated `extra_id` tokens; not the most data # # efficient since we calculate this based on the maximum number # # of possible `extra_id` tokens. - # safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) - # truncated = len(tokens) > safe_max_seq_len - # tokens = tokens[:safe_max_seq_len] - # else: + safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + truncated = len(tokens) > safe_max_seq_len + tokens = tokens[:safe_max_seq_len] + else: # Truncate to `target_sequence_length`. - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] # Prepend objective token. 
cls_id = cls_ids.get(denoiser) From 557b09ceb70d327d8181d4a2fa010412155d6da7 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 24 Jan 2023 03:45:41 +0400 Subject: [PATCH 31/37] Upate sampling --- megatron/data/dataset_utils.py | 73 +++++++++++++++++++++++----------- megatron/data/ul2_dataset.py | 9 ++++- 2 files changed, 57 insertions(+), 25 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 2617f5927..e76a951ab 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -219,6 +219,7 @@ def create_masked_lm_predictions(tokens, # the starting piece of current token, where 1 means true, so that # on-the-fly whole word masking is possible. token_boundary = [0] * len(tokens) + num_filtered_tokens = 0 for (i, token) in enumerate(tokens): if token == cls_id or token == sep_id: @@ -237,6 +238,7 @@ def create_masked_lm_predictions(tokens, cand_indexes.append([i]) if is_start_piece(vocab_id_to_token_dict[token]): token_boundary[i] = 1 + num_filtered_tokens += 1 output_tokens = list(tokens) @@ -249,12 +251,16 @@ def create_masked_lm_predictions(tokens, num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) - if prefix_lm: - # Adjust probabilities so that the mean is centered at the - # correct position. - # If we do not do this, the mean is at - # `len(tokens) * masked_lm_prob / 2`. - masked_lm_prob *= 2 + + if sampling_style is SamplingStyle.NORMAL: + # First, we get the center of our normal distribution from + # `max_ngrams`. Keeping the meaning of `max_ngrams` this way + # plays nicely with the other probability distributions in terms + # of math. + normal_mean = (max_ngrams + 1) / 2 + # However, we do not want to bound the maximum number of + # n-grams. + max_ngrams = num_filtered_tokens - 1 ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) if sampling_style is SamplingStyle.POISSON: @@ -264,24 +270,45 @@ def create_masked_lm_predictions(tokens, pvals /= pvals.sum(keepdims=True) if favor_longer_ngram: pvals = pvals[::-1] - elif sampling_style is SamplingStyle.NORMAL: - normal_mean = (max_ngrams + 1) / 2 - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = [] - for n in ngrams: - if prefix_lm: - last_cand_index_index = min(idx + n - 1, len(cand_indexes) - 1) - if cand_indexes[last_cand_index_index][-1] < len(tokens) - 1: - continue - ngram_index.append(cand_indexes[idx:idx + n]) - ngram_indexes.append(ngram_index) - if prefix_lm: - # No need to go further – we would only produce - # duplicate entries by continuing for this `idx`. - break - np_rng.shuffle(ngram_indexes) + if prefix_lm: + # We only do one span searching loop anyway, so this does not + # matter in terms of random search. However, we do want to allow + # sequences greater than the mean ratio. + num_to_predict = max_predictions_per_seq + + # Find first index which is greater than the number of + # predictions. + first_gt_index = next( + ( + i + for (i, x) in enumerate(cand_indexes) + if x[0] > num_filtered_tokens - max_predictions_per_seq + ), + len(cand_indexes), + ) + # Then move one index before to get less than or equal to the + # number of predictions, handling not going below 0. + first_le_index = max(1, first_gt_index) - 1 + + tail_cand_indexes = cand_indexes[first_le_index:] + ngram_indexes = [] + for i in range(len(tail_cand_indexes)): + ngram_indexes.append(tail_cand_indexes[i:]) + ngram_indexes = [ngram_indexes] + # No need to shuffle outer list of length 1. 
+ else: + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + ngram_indexes.append(ngram_index) + + np_rng.shuffle(ngram_indexes) (masked_lms, masked_spans) = ([], []) covered_indexes = set() diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 065125923..2d6960e87 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -222,8 +222,6 @@ def build_training_sample(sample, target_seq_length, tokens = [cls_id] + tokens # Masking. - # Ensure we always have at least one prediction. - max_predictions_per_seq = max(1.0, masked_lm_prob * len(tokens)) mean_ngrams = mean_span_lengths[denoiser_index] if mean_ngrams < 1: # Ensure we always obtain at least one `max_ngrams`. @@ -233,11 +231,18 @@ def build_training_sample(sample, target_seq_length, if denoiser == 'R' or denoiser == 'X': sampling_style = SamplingStyle.NORMAL prefix_lm = False + max_predictions_per_seq = len(tokens) - 1 elif denoiser == 'S': sampling_style = SamplingStyle.UNIFORM prefix_lm = True + max_predictions_per_seq = min( + round(masked_lm_prob * len(tokens)) * 2 - 1, + len(tokens) - 1, + ) else: raise ValueError('unknown denoiser') + # Ensure we always have at least one prediction. + max_predictions_per_seq = max(1, max_predictions_per_seq) ( tokens, masked_positions, masked_labels, _, masked_spans, ) = create_masked_lm_predictions( From a6f69bf2751f4a456bd6a4b85d13211fcae340e2 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 24 Jan 2023 19:14:54 +0400 Subject: [PATCH 32/37] Update UL2 --- megatron/data/dataset_utils.py | 79 ++++++++++++++++++++-------------- megatron/data/ul2_dataset.py | 5 +-- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index e76a951ab..a3ce376b5 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -18,6 +18,7 @@ # https://github.com/google-research/albert/blob/master/create_pretraining_data.py # with some modifications. +import bisect from enum import Enum import math import os @@ -191,6 +192,36 @@ def is_start_piece(piece): # append it to the previous set of word indexes. return not piece.startswith("##") +def get_ngram_indices( + idx, + ngrams, + cand_indexes, + num_to_predict, + num_filtered_tokens, + prefix_lm, +): + if prefix_lm: + # Find first index which is greater than the number of + # predictions. + first_gt_index = bisect.bisect_right( + cand_indexes, + [num_filtered_tokens - num_to_predict], + ) + # Then move one index before to get less than or equal to the + # number of predictions, handling not going below 0. + first_le_index = max(1, first_gt_index) - 1 + + tail_cand_indexes = cand_indexes[first_le_index:] + ngram_index = [ + tail_cand_indexes[i:] + for i in range(len(tail_cand_indexes)) + ] + else: + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + return ngram_index + def create_masked_lm_predictions(tokens, vocab_id_list, vocab_id_to_token_dict, @@ -249,9 +280,6 @@ def create_masked_lm_predictions(tokens, return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) - if sampling_style is SamplingStyle.NORMAL: # First, we get the center of our normal distribution from # `max_ngrams`. 
Keeping the meaning of `max_ngrams` this way @@ -277,38 +305,24 @@ def create_masked_lm_predictions(tokens, # sequences greater than the mean ratio. num_to_predict = max_predictions_per_seq - # Find first index which is greater than the number of - # predictions. - first_gt_index = next( - ( - i - for (i, x) in enumerate(cand_indexes) - if x[0] > num_filtered_tokens - max_predictions_per_seq - ), - len(cand_indexes), - ) - # Then move one index before to get less than or equal to the - # number of predictions, handling not going below 0. - first_le_index = max(1, first_gt_index) - 1 - - tail_cand_indexes = cand_indexes[first_le_index:] - ngram_indexes = [] - for i in range(len(tail_cand_indexes)): - ngram_indexes.append(tail_cand_indexes[i:]) - ngram_indexes = [ngram_indexes] - # No need to shuffle outer list of length 1. + ngram_index_indexes = np.array([0]) else: num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = [] - for n in ngrams: - ngram_index.append(cand_indexes[idx:idx + n]) - ngram_indexes.append(ngram_index) - - np_rng.shuffle(ngram_indexes) + ngram_index_indexes = np.arange(len(cand_indexes)) + np_rng.shuffle(ngram_index_indexes) + + def get_ngram_indices_(idx): + return get_ngram_indices( + idx, + ngrams, + cand_indexes, + num_to_predict, + num_filtered_tokens, + prefix_lm, + ) + ngram_indexes = map(get_ngram_indices_, ngram_index_indexes) (masked_lms, masked_spans) = ([], []) covered_indexes = set() @@ -392,7 +406,8 @@ def create_masked_lm_predictions(tokens, label=[tokens[index] for index in index_set])) assert len(masked_lms) <= num_to_predict - np_rng.shuffle(ngram_indexes) + np_rng.shuffle(ngram_index_indexes) + ngram_indexes = map(get_ngram_indices_, ngram_index_indexes) select_indexes = set() if do_permutation: diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 2d6960e87..e9eb82ff8 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,14 +15,11 @@ """UL2-style dataset.""" -import math - import numpy as np from megatron import get_tokenizer from megatron.data.dataset_utils import ( - create_masked_lm_predictions, - get_samples_mapping, + create_masked_lm_predictions, SamplingStyle ) from megatron.data.t5_dataset import ( From d0d277feb6808cd83b9cd4bd2812257708d6d06f Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 24 Jan 2023 19:36:08 +0400 Subject: [PATCH 33/37] Add get_samples_mapping --- megatron/data/ul2_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index e9eb82ff8..6511ad345 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -19,7 +19,8 @@ from megatron import get_tokenizer from megatron.data.dataset_utils import ( - create_masked_lm_predictions, + create_masked_lm_predictions, + get_samples_mapping, SamplingStyle ) from megatron.data.t5_dataset import ( From 3f29df898b30bd8f1cc398e65fbcd8dc0c0b98b7 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 24 Jan 2023 19:44:51 +0400 Subject: [PATCH 34/37] Import math --- megatron/data/ul2_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 6511ad345..4da73d9e7 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,6 +15,7 @@ """UL2-style dataset.""" +import math import numpy as np from megatron import get_tokenizer 
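Note on the bisect-based span selection introduced in PATCH 32 above: the following is a minimal, self-contained sketch of the prefix-LM branch of get_ngram_indices. The token positions and counts are made up for illustration; only the bisect_right/tail logic mirrors the patched code.

    import bisect

    # Hypothetical word groupings: each entry holds the token positions of
    # one whole word, in the same shape as `cand_indexes` above.
    cand_indexes = [[1], [2, 3], [4], [5, 6, 7], [8], [9]]
    num_filtered_tokens = 10   # tokens left after filtering (illustrative)
    num_to_predict = 4         # size of the prefix-LM target suffix

    # Lists compare lexicographically, so wrapping the threshold in a
    # one-element list finds the first word that starts past the cut-off.
    first_gt_index = bisect.bisect_right(
        cand_indexes, [num_filtered_tokens - num_to_predict])
    # Step back one word so the span begins at or before the cut-off,
    # without going below index 0.
    first_le_index = max(1, first_gt_index) - 1

    tail_cand_indexes = cand_indexes[first_le_index:]
    ngram_index = [tail_cand_indexes[i:] for i in range(len(tail_cand_indexes))]
    # tail_cand_indexes == [[5, 6, 7], [8], [9]]: every candidate span ends at
    # the sequence tail, which is what the prefix-LM (S) denoiser needs.
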
From 5207386338ee47bdf9d4d781e8402e20c00fa96e Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Mon, 6 Feb 2023 12:59:55 +0400 Subject: [PATCH 35/37] Fix prefixlm --- finetune_t0_non_causal_decoder.py | 1 + pretrain_ul2.py | 29 ++++++++++++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 14650a6e5..13a758a9a 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -33,6 +33,7 @@ def model_provider(pre_process=True, post_process=True): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed: + args.pretrain_causal_attention = False model = GPTModelPipe( num_tokentypes=0, parallel_output=True, diff --git a/pretrain_ul2.py b/pretrain_ul2.py index 9b64b6daf..89c936666 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -64,15 +64,26 @@ def model_provider(pre_process=True, post_process=True): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed and is_decoder_only(): - args.pretrain_causal_attention = True - model = GPTModelPipe( - num_tokentypes=0, - parallel_output=True, - attn_mask_type=AttnMaskType.causal - ) - # This is a hack to give us a reference to get_batch_pipe from within training.py - # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe + if is_prefix_lm(): + args.pretrain_causal_attention = False + print_rank_0('Using prefix LM UL2 model.') + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=AttnMaskType.custom + ) + model._megatron_batch_fn = get_batch_pipe + else: + args.pretrain_causal_attention = True + print_rank_0('Using decoder-only causal UL2 model.') + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=AttnMaskType.causal + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe elif is_decoder_only(): print_rank_0('Using decoder-only UL2 model.') model = GPTModel( From 9490e50ed68253569c7bfef33eb5bc4b72f3a58e Mon Sep 17 00:00:00 2001 From: Niklas Muennighoff Date: Sat, 20 May 2023 00:00:59 +0300 Subject: [PATCH 36/37] tmp --- megatron/arguments.py | 3 ++- megatron/checkpointing.py | 6 +++--- tasks/eval_harness/evaluate.py | 14 ++++++++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index d2499d149..1b101cbad 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -649,7 +649,8 @@ def _add_learning_rate_args(parser): 'from checkpoint and ignore input arguments.') group.add_argument('--universal-checkpoint', action='store_true', help='Loading a universal format checkpoint.') - + group.add_argument('--reset-progress', action='store_true', default=None, + help='Reset iteration to 0 & do not load args.') return parser diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index dacbec7dc..ebf93a986 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -342,7 +342,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True set_checkpoint_version(state_dict.get('checkpoint_version', 0)) # Set iteration. - if args.finetune or release: + if args.finetune or release or args.reset_progress: iteration = 0 else: try: @@ -361,7 +361,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True # Check arguments. 
assert args.consumed_train_samples == 0 assert args.consumed_valid_samples == 0 - if 'args' in state_dict: + if 'args' in state_dict and not args.reset_progress: checkpoint_args = state_dict['args'] if not args.universal_checkpoint: check_checkpoint_args(checkpoint_args) @@ -480,4 +480,4 @@ def _checkpoint_info(): return { "padded_vocab_size": args.padded_vocab_size, "original_vocab_size": tokenizer.vocab_size, - } \ No newline at end of file + } diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 7e0cdb60e..67e95e1c9 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -390,6 +390,8 @@ def tasks_args(parser): group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--fewshots', type=int, default=0, help='Num fewshots') + group.add_argument('--limit', type=int, default=None, help='Limit samples') return parser from megatron.global_vars import _parse_args @@ -398,6 +400,10 @@ def main(): # parse the megatron args. But wait with initalizing megatron. # avoid printing the arguments, since they will later be overridden. args = _parse_args(tasks_args) + if os.path.exists(args.results_path): + print("Exists ", args.results_path) + exit() + load_path = args.load model = load_ds_checkpoint_and_setup_megatron(args) @@ -422,11 +428,11 @@ def main(): global_results = {"results": {}, "versions": {}} timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') iteration_id = load_path.split("/")[-1].replace("/", "") - results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + results_path = args.results_path#.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots.json") # Backup file in case of interruption during writing - results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots_backup.json") for task_name, task in task_dict.items(): - results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, None, bootstrap_iters=args.bootstrap_iters) + results = evaluator.evaluate(adaptor, {task_name: task}, False, args.fewshots, bootstrap_iters=args.bootstrap_iters, limit=args.limit) global_results["results"] = {**global_results["results"], **results["results"]} global_results["versions"] = {**global_results["versions"], **results["versions"]} if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: @@ -436,7 +442,7 @@ def main(): with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) else: - global_results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters) + global_results = evaluator.evaluate(adaptor, task_dict, False, args.fewshots, bootstrap_iters=args.bootstrap_iters, limit=args.limit) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(global_results, indent=2)) with open(args.results_path, 'w') as outfile: From 6936afba8e8f64e73792f41f9e44c39d2efb5c5c Mon Sep 17 00:00:00 2001 From: Niklas 
Muennighoff Date: Fri, 19 May 2023 23:20:40 +0200 Subject: [PATCH 37/37] Revert UL2 Tokenizer Changes --- megatron/tokenizer/tokenizer.py | 72 ++------------------------------- 1 file changed, 3 insertions(+), 69 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 7d0340ca2..09304b1dd 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -40,9 +40,7 @@ def build_tokenizer(args): vocab_extra_ids=args.vocab_extra_ids) elif args.tokenizer_type == 'GPT2BPETokenizer': assert args.merge_file is not None - tokenizer = _GPT2BPETokenizer( - args.vocab_file, args.merge_file, vocab_extra_ids=args.vocab_extra_ids - ) + tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) elif args.tokenizer_type == "PretrainedFromHF": assert args.tokenizer_name_or_path is not None @@ -288,36 +286,13 @@ def additional_special_tokens(self, value): class _GPT2BPETokenizer(AbstractTokenizer): """Original GPT2 BPE tokenizer.""" - def __init__(self, vocab_file, merge_file, vocab_extra_ids=0): + def __init__(self, vocab_file, merge_file): name = 'GPT2 BPE' super().__init__(name) self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', special_tokens=[], max_len=None) - self.eod_id = self.eos_token_id = self.tokenizer.encoder['<|endoftext|>'] - - self.bod_id = self.bos_token_id = self.tokenizer.encoder['[EOS]'] - self.sep_id = self.tokenizer.encoder['[SEP]'] - self.mask_id = self.tokenizer.encoder['[MASK]'] - self.pad_id = self.tokenizer.encoder['[PAD]'] - - additional_special_tokens = [] - self._additional_special_tokens = [] - additional_special_tokens.extend( - ["".format(i) for i in range(vocab_extra_ids)]) - self.add_additional_special_tokens(additional_special_tokens) - - def add_additional_special_tokens(self, tokens_list): - setattr(self, "additional_special_tokens", tokens_list) - for value in tokens_list: - self.add_token(value) - - def add_token(self, token): - if token not in self.vocab: - self.inv_vocab[self.vocab_size] = token - # self.vocab_size comes from len(vocab) - # and it will increase as we add elements - self.vocab[token] = self.vocab_size + self.eod_id = self.tokenizer.encoder['<|endoftext|>'] @property def vocab_size(self): @@ -341,35 +316,6 @@ def detokenize(self, token_ids): def eod(self): return self.eod_id - @property - def bod(self): - return self.bod_id - - @property - def sep(self): - return self.sep_id - - @property - def mask(self): - return self.mask_id - - @property - def pad(self): - return self.pad_id - - @property - def additional_special_tokens(self): - """ All the additional special tokens you may want to use (list of strings).""" - return self._additional_special_tokens - - @property - def additional_special_tokens_ids(self): - """ Ids of all the additional special tokens in the vocabulary (list of integers).""" - return [self.vocab.get(token) for token in self._additional_special_tokens] - - @additional_special_tokens.setter - def additional_special_tokens(self, value): - self._additional_special_tokens = value class _AutoTokenizer(AbstractTokenizer): """AutoTokenizer for Hf Pretrained model loading.""" @@ -442,18 +388,6 @@ def eos(self): candidate = self.tokenizer.eos_token_id return self._check_token_candidate(candidate) - @property - def bos_token_id(self): - """Id of the beginning of sentence token in the vocabulary.""" - candidate = self.tokenizer.bos_token_id - return self._check_token_candidate(candidate) - - @property - def eos_token_id(self): - """Id of the end of 
sentence token in the vocabulary.""" - candidate = self.tokenizer.eos_token_id - return self._check_token_candidate(candidate) - @property def additional_special_tokens_ids(self): """ All the additional special tokens you may want to use (list of strings)."""
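
Note on PATCH 35 above: the prefix-LM branch in pretrain_ul2.py switches from AttnMaskType.causal to AttnMaskType.custom. As a rough illustration of why (this is not the mask construction used in this repository), a prefix-LM mask allows bidirectional attention inside the prefix and causal attention afterwards, which a fixed causal mask cannot express:

    import numpy as np

    def toy_prefix_lm_mask(seq_len, prefix_len):
        # True means "may attend": causal everywhere, plus full visibility
        # of the prefix tokens. Purely illustrative.
        mask = np.tril(np.ones((seq_len, seq_len), dtype=bool))
        mask[:, :prefix_len] = True
        return mask

    print(toy_prefix_lm_mask(5, 2).astype(int))
    # [[1 1 0 0 0]
    #  [1 1 0 0 0]
    #  [1 1 1 0 0]
    #  [1 1 1 1 0]
    #  [1 1 1 1 1]]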