From ebb79c86a0749de9de5a78b2611f2a2735d9a2f9 Mon Sep 17 00:00:00 2001 From: Risto Luukkonen Date: Wed, 5 Oct 2022 09:24:52 +0300 Subject: [PATCH 01/37] Squash 3 commits to 1 Patch HIP-support to fused kernels from Microsoft/Megatron-DeepSpeed-fork Path HIP-support to fused kernels from Microsoft/Megatron-DeepSpeed-fork remove some local files --- .gitignore | 10 +++ megatron/fused_kernels/__init__.py | 80 +++++++++++-------- .../fused_kernels/layer_norm_cuda_kernel.cu | 21 ++++- .../scaled_masked_softmax_cuda.cu | 2 + .../scaled_upper_triang_masked_softmax.h | 3 +- ...scaled_upper_triang_masked_softmax_cuda.cu | 2 + 6 files changed, 84 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index 6b6db06fb..267995984 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,13 @@ +## HIP-compiled kernels etc. +*hip* +# +local_examples/ +logs/ +trash/ +kb-runs-gpt/ +ds_configs/ +gpt2-tokenizer/ +smi-output/ # tests # megatron autogenerated indices tests/data/*/*npy diff --git a/megatron/fused_kernels/__init__.py b/megatron/fused_kernels/__init__.py index e2ac2567b..bdc654c39 100644 --- a/megatron/fused_kernels/__init__.py +++ b/megatron/fused_kernels/__init__.py @@ -17,81 +17,97 @@ import pathlib import subprocess +import torch from torch.utils import cpp_extension +# Setting this param to a list has a problem of generating different +# compilation commands (with diferent order of architectures) and +# leading to recompilation of fused kernels. Set it to empty string +# to avoid recompilation and assign arch flags explicity in +# extra_cuda_cflags below +os.environ["TORCH_CUDA_ARCH_LIST"] = "" + def load(args): - # Setting this param to a list has a problem of generating different - # compilation commands (with diferent order of architectures) and - # leading to recompilation of fused kernels. Set it to empty string - # to avoid recompilation and assign arch flags explicity in - # extra_cuda_cflags below - # - # but if a user wants to set an explicit list of archs to compile to, then let that list - # through: - arch_list = os.environ.get('TORCH_CUDA_ARCH_LIST', None) - if arch_list is None: - os.environ["TORCH_CUDA_ARCH_LIST"] = "" - - # # Check if cuda 11 is installed for compute capability 8.0 - # cc_flag = [] - # _, bare_metal_major, _ = _get_cuda_bare_metal_version( - # cpp_extension.CUDA_HOME) - # if int(bare_metal_major) >= 11: - # cc_flag.append('-gencode') - # cc_flag.append('arch=compute_80,code=sm_80') + # Check if cuda 11 is installed for compute capability 8.0 + cc_flag = [] + if torch.version.hip is None: + _, bare_metal_major, _ = _get_cuda_bare_metal_version( + cpp_extension.CUDA_HOME) + if int(bare_metal_major) >= 11: + cc_flag.append('-gencode') + cc_flag.append('arch=compute_80,code=sm_80') # Build path srcpath = pathlib.Path(__file__).parent.absolute() buildpath = srcpath / 'build' - buildpath.mkdir(parents=True, exist_ok=True) + _create_build_dir(buildpath) # Helper function to build the kernels. 
- def _cpp_extention_load_helper(name, sources, extra_cuda_flags): + def _cpp_extention_load_helper(name, sources, extra_cuda_flags, extra_include_paths): + if torch.version.hip is not None: + extra_cuda_cflags=['-O3'] + extra_cuda_flags + cc_flag + else: + extra_cuda_cflags=['-O3', + '-gencode', 'arch=compute_70,code=sm_70', + '--use_fast_math'] + extra_cuda_flags + cc_flag + return cpp_extension.load( name=name, sources=sources, build_directory=buildpath, extra_cflags=['-O3',], - extra_cuda_cflags=['-O3', - '--use_fast_math'] + extra_cuda_flags, + extra_cuda_cflags=extra_cuda_cflags, + extra_include_paths=extra_include_paths, verbose=(args.rank == 0) ) - # '-gencode', 'arch=compute_70,code=sm_70', # ============== # Fused softmax. # ============== + if torch.version.hip is not None: + extra_include_paths=[os.path.abspath(srcpath)] + else: + extra_include_paths=[] + if args.masked_softmax_fusion: - extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', - '-U__CUDA_NO_HALF_CONVERSIONS__', - '--expt-relaxed-constexpr', - '--expt-extended-lambda'] + if torch.version.hip is not None: + extra_cuda_flags = ['-D__HIP_NO_HALF_OPERATORS__=1', + '-D__HIP_NO_HALF_CONVERSIONS__=1'] + else: + extra_cuda_flags = ['-U__CUDA_NO_HALF_OPERATORS__', + '-U__CUDA_NO_HALF_CONVERSIONS__', + '--expt-relaxed-constexpr', + '--expt-extended-lambda'] # Upper triangular softmax. sources=[srcpath / 'scaled_upper_triang_masked_softmax.cpp', srcpath / 'scaled_upper_triang_masked_softmax_cuda.cu'] scaled_upper_triang_masked_softmax_cuda = _cpp_extention_load_helper( "scaled_upper_triang_masked_softmax_cuda", - sources, extra_cuda_flags) + sources, extra_cuda_flags, extra_include_paths) # Masked softmax. sources=[srcpath / 'scaled_masked_softmax.cpp', srcpath / 'scaled_masked_softmax_cuda.cu'] scaled_masked_softmax_cuda = _cpp_extention_load_helper( - "scaled_masked_softmax_cuda", sources, extra_cuda_flags) + "scaled_masked_softmax_cuda", sources, extra_cuda_flags, extra_include_paths) # ================================= # Mixed precision fused layer norm. 
# ================================= - extra_cuda_flags = ['-maxrregcount=50'] + if torch.version.hip is not None: + extra_cuda_flags = [] + else: + extra_cuda_flags = ['-maxrregcount=50'] + sources=[srcpath / 'layer_norm_cuda.cpp', srcpath / 'layer_norm_cuda_kernel.cu'] fused_mix_prec_layer_norm_cuda = _cpp_extention_load_helper( - "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags) + "fused_mix_prec_layer_norm_cuda", sources, extra_cuda_flags, extra_include_paths) def _get_cuda_bare_metal_version(cuda_dir): diff --git a/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/layer_norm_cuda_kernel.cu index 28a579e1a..ea14f80f7 100644 --- a/megatron/fused_kernels/layer_norm_cuda_kernel.cu +++ b/megatron/fused_kernels/layer_norm_cuda_kernel.cu @@ -246,14 +246,25 @@ void cuWelfordMuSigma2( } } } - +#ifndef __HIP_PLATFORM_HCC__ template U rsqrt(U v) { +#else +template __device__ U rsqrt(U v) { +#endif return U(1) / sqrt(v); } +#ifndef __HIP_PLATFORM_HCC__ template<> float rsqrt(float v) { +#else +template<> __device__ float rsqrt(float v) { +#endif return rsqrtf(v); } +#ifndef __HIP_PLATFORM_HCC__ template<> double rsqrt(double v) { +#else +template<> __device__ double rsqrt(double v) { +#endif return rsqrt(v); } @@ -304,7 +315,11 @@ void cuApplyLayerNorm( // 1) blockDim.x == warpSize // 2) Tensors are contiguous // +#ifndef __HIP_PLATFORM_HCC__ for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#else + for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#endif SharedMemory shared; U* buf = shared.getPointer(); U mu,sigma2; @@ -543,7 +558,11 @@ void cuComputeGradInput( const V* gamma, T* grad_input) { +#ifndef __HIP_PLATFORM_HCC__ for (auto i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#else + for (int i1=blockIdx.y; i1 < n1; i1 += gridDim.y) { +#endif U sum_loss1 = U(0); U sum_loss2 = U(0); const U c_mean = mean[i1]; diff --git a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu index 2efee39a6..0c068c7cb 100644 --- a/megatron/fused_kernels/scaled_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_masked_softmax_cuda.cu @@ -18,7 +18,9 @@ #include #include #include +#ifndef __HIP_PLATFORM_HCC__ #include +#endif #include #include #include "scaled_masked_softmax.h" diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h index 6df83fc10..ee140c037 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h @@ -17,7 +17,8 @@ #pragma once #include -#include +#include +// #include #include #include #include diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu index 5efc3d412..59e452584 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu @@ -18,7 +18,9 @@ #include #include #include +#ifndef __HIP_PLATFORM_HCC__ #include +#endif #include #include #include "scaled_upper_triang_masked_softmax.h" From 21c90de18f702731c96fbdf6e70a7435e2f486f7 Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Fri, 21 Oct 2022 14:54:16 +0300 Subject: [PATCH 02/37] Add --no-layer-norm-fusion argument --- megatron/arguments.py | 3 +++ megatron/model/fused_layer_norm.py | 11 +++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/megatron/arguments.py 
b/megatron/arguments.py index c18235a78..78f4724d8 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -549,6 +549,9 @@ def _add_training_args(parser): group.add_argument('--no-bias-dropout-fusion', action='store_false', help='Disable bias and dropout fusion.', dest='bias_dropout_fusion') + group.add_argument('--no-layer-norm-fusion', action='store_true', + help='Disable fused layer norm.', + dest='layer_norm_fusion') group.add_argument('--optimizer', type=str, default='adam', choices=['adam', 'sgd'], help='Optimizer function') diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 55e9c9dd8..7b1d7eaa7 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -86,10 +86,13 @@ def __init__(self, normalized_shape, eps=1e-5): args = get_args() self.layernorm_tp_auto_sync = args.sync_tp_duplicated_parameters - self.use_meg_ds_fused_layer_norm = ( - args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm - or version.parse(torch.__version__) >= version.parse("1.11.0") # https://github.com/pytorch/pytorch/pull/66920 - ) + if not args.layer_norm_fusion: + self.use_meg_ds_fused_layer_norm = False + else: + self.use_meg_ds_fused_layer_norm = ( + args.bf16 # Current Meg-DS cuda kernel has better throughput than torch.nn.LayerNorm + or version.parse(torch.__version__) >= version.parse("1.11.0") # https://github.com/pytorch/pytorch/pull/66920 + ) def reset_parameters(self): From e0487132140df9d7fec727689864d2d03b022a3e Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Fri, 21 Oct 2022 15:19:15 +0300 Subject: [PATCH 03/37] Add --no-optimizer-fusion argument --- megatron/arguments.py | 3 +++ megatron/optimizer/__init__.py | 22 +++++++++++++++------- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 78f4724d8..ca416670d 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -552,6 +552,9 @@ def _add_training_args(parser): group.add_argument('--no-layer-norm-fusion', action='store_true', help='Disable fused layer norm.', dest='layer_norm_fusion') + group.add_argument('--no-optimizer-fusion', action='store_true', + help='Disable FusedAdam/FusedSGD norm.', + dest='optimizer_fusion') group.add_argument('--optimizer', type=str, default='adam', choices=['adam', 'sgd'], help='Optimizer function') diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py index 738717d55..b492abfb4 100644 --- a/megatron/optimizer/__init__.py +++ b/megatron/optimizer/__init__.py @@ -13,8 +13,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from apex.optimizers import FusedAdam as Adam -from apex.optimizers import FusedSGD as SGD +from torch.optim import AdamW +from torch.optim import SGD +from apex.optimizers import FusedAdam +from apex.optimizers import FusedSGD from megatron import get_args from megatron.model.fused_layer_norm import MixedFusedLayerNorm as LayerNorm @@ -72,18 +74,24 @@ def get_megatron_optimizer(model): if args.use_bnb_optimizer: import bitsandbytes as bnb adam_optimizer = bnb.optim.Adam8bit + elif args.optimizer_fusion: + adam_optimizer = FusedAdam else: - adam_optimizer = Adam + adam_optimizer = AdamW optimizer = adam_optimizer(param_groups, lr=args.lr, weight_decay=args.weight_decay, betas=(args.adam_beta1, args.adam_beta2), eps=args.adam_eps) elif args.optimizer == 'sgd': - optimizer = SGD(param_groups, - lr=args.lr, - weight_decay=args.weight_decay, - momentum=args.sgd_momentum) + if args.optimizer_fusion: + sgd_optimizer = FusedSGD + else: + sgd_optimizer = SGD + optimizer = sgd_optimizer(param_groups, + lr=args.lr, + weight_decay=args.weight_decay, + momentum=args.sgd_momentum) else: raise Exception('{} optimizer is not supported.'.format( args.optimizer)) From 18e2c65becf98e73744fd23b4e3b38fc5ccb1e0b Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Fri, 21 Oct 2022 15:57:59 +0300 Subject: [PATCH 04/37] Bugfix (thanks to Thomas Wang for catching this) --- megatron/arguments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index ca416670d..d2499d149 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -549,10 +549,10 @@ def _add_training_args(parser): group.add_argument('--no-bias-dropout-fusion', action='store_false', help='Disable bias and dropout fusion.', dest='bias_dropout_fusion') - group.add_argument('--no-layer-norm-fusion', action='store_true', + group.add_argument('--no-layer-norm-fusion', action='store_false', help='Disable fused layer norm.', dest='layer_norm_fusion') - group.add_argument('--no-optimizer-fusion', action='store_true', + group.add_argument('--no-optimizer-fusion', action='store_false', help='Disable FusedAdam/FusedSGD norm.', dest='optimizer_fusion') group.add_argument('--optimizer', type=str, default='adam', From 9b7cd052ed2bb93c96b40dc7f34c4556655fed6e Mon Sep 17 00:00:00 2001 From: Hubert Lu <55214931+hubertlu-tw@users.noreply.github.com> Date: Fri, 18 Nov 2022 07:20:46 -0800 Subject: [PATCH 05/37] Fix the bug of FusedLayerNorm on ROCm (#96) --- .../fused_kernels/layer_norm_cuda_kernel.cu | 57 ++++++++++++------- megatron/model/fused_layer_norm.py | 17 +++--- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/megatron/fused_kernels/layer_norm_cuda_kernel.cu b/megatron/fused_kernels/layer_norm_cuda_kernel.cu index ea14f80f7..aae0c993c 100644 --- a/megatron/fused_kernels/layer_norm_cuda_kernel.cu +++ b/megatron/fused_kernels/layer_norm_cuda_kernel.cu @@ -76,7 +76,8 @@ void cuWelfordMuSigma2( const int i1, U& mu, U& sigma2, - U* buf) + U* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -106,12 +107,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1<(muB,sigma2B,countB,mu,sigma2,count); + for (int stride = GPU_WARP_SIZE / 2; stride > 0; stride /= 2) { + U sigma2B = WARP_SHFL_DOWN(sigma2, stride); + U muB = WARP_SHFL_DOWN(mu, stride); + U countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, 
count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -160,7 +160,8 @@ void cuWelfordMuSigma2( const int i1, float& mu, float& sigma2, - float* buf) + float* buf, + const int GPU_WARP_SIZE) { // Assumptions: // 1) blockDim.x == warpSize @@ -201,12 +202,11 @@ void cuWelfordMuSigma2( cuWelfordOnlineSum(curr,mu,sigma2,count); } // intra-warp reductions - for (int l = 0; l <= 4; ++l) { - int srcLaneB = (threadIdx.x+(1< 0; stride /= 2) { + float sigma2B = WARP_SHFL_DOWN(sigma2, stride); + float muB = WARP_SHFL_DOWN(mu, stride); + float countB = WARP_SHFL_DOWN(count, stride); + cuChanOnlineSum(muB, sigma2B, countB, mu, sigma2, count); } // threadIdx.x == 0 has correct values for each warp // inter-warp reductions @@ -308,7 +308,8 @@ void cuApplyLayerNorm( const int n2, const U epsilon, const V* __restrict__ gamma, - const V* __restrict__ beta + const V* __restrict__ beta, + const int GPU_WARP_SIZE ) { // Assumptions: @@ -323,7 +324,7 @@ void cuApplyLayerNorm( SharedMemory shared; U* buf = shared.getPointer(); U mu,sigma2; - cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf); + cuWelfordMuSigma2(vals,n1,n2,i1,mu,sigma2,buf,GPU_WARP_SIZE); const T* lvals = vals + i1*n2; V* ovals = output_vals + i1*n2; U c_invvar = rsqrt(sigma2 + epsilon); @@ -686,7 +687,11 @@ void HostApplyLayerNorm( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); - const dim3 threads(32,4,1); + const int warp_size = at::cuda::warp_size(); + dim3 threads(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads.y = 1; +#endif const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks(1, std::min((uint64_t)n1, maxGridY), 1); @@ -701,7 +706,9 @@ void HostApplyLayerNorm( input, n1,n2, U(epsilon), - gamma,beta); + gamma, + beta, + warp_size); } @@ -754,11 +761,16 @@ void HostLayerNormGradient( ) { auto stream = at::cuda::getCurrentCUDAStream().stream(); + const int warp_size = at::cuda::warp_size(); if (gamma != NULL && beta != NULL) { // compute grad_gamma(j) and grad_beta(j) +#ifndef __HIP_PLATFORM_HCC__ + const int part_size = warp_size; +#else const int part_size = 16; - const dim3 threads2(32,4,1); +#endif + const dim3 threads2(warp_size,4,1); const dim3 blocks2((n2+threads2.x-1)/threads2.x,part_size,1); const int nshared2_a = 2 * sizeof(U) * threads2.y * threads2.y * (threads2.x + 1); @@ -777,7 +789,7 @@ void HostLayerNormGradient( part_grad_gamma.DATA_PTR(), part_grad_beta.DATA_PTR()); - const dim3 threads3(32,8,1); + const dim3 threads3(warp_size,8,1); const dim3 blocks3((n2+threads2.x-1)/threads2.x,1,1); const int nshared3 = threads3.x * threads3.y * sizeof(U); cuComputeGradGammaBeta<<>>( @@ -793,7 +805,10 @@ void HostLayerNormGradient( const uint64_t maxGridY = at::cuda::getCurrentDeviceProperties()->maxGridSize[1]; const dim3 blocks1(1, std::min((uint64_t)n1, maxGridY), 1); - const dim3 threads1(32,4,1); + dim3 threads1(warp_size,4,1); +#ifndef __HIP_PLATFORM_HCC__ + threads1.y = 2; +#endif int nshared = threads1.y > 1 ? 
threads1.y*threads1.x*sizeof(U) : diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index 7b1d7eaa7..cadf45561 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -28,7 +28,7 @@ from torch.nn.parameter import Parameter import importlib import torch -import torch.nn.functional as F +from torch.nn import functional as F global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = None @@ -102,13 +102,14 @@ def reset_parameters(self): def forward(self, input): - if self.layernorm_tp_auto_sync: torch.distributed.all_reduce(self.weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) - - if self.use_meg_ds_fused_layer_norm: - return FusedLayerNormAffineFunction.apply( - input, self.weight, self.bias, self.normalized_shape, self.eps) - else: - return F.layer_norm(input, self.normalized_shape, self.weight, self.bias) += + # CPU path is here for unittest sake. + if not input.is_cuda: + print("WARNING! The input of FusedLayerNorm should be on the GPU." + "This warning should only be triggered in the FusedLayerNorm unit tests.") + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) + return FusedLayerNormAffineFunction.apply( + input, self.weight, self.bias, self.normalized_shape,self.eps) From 277e1d38760375f1b0356cc84b25ead74da5a0cb Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Fri, 18 Nov 2022 21:20:06 +0200 Subject: [PATCH 06/37] Revert cherry-picked changes to .py --- megatron/model/fused_layer_norm.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/megatron/model/fused_layer_norm.py b/megatron/model/fused_layer_norm.py index cadf45561..7b1d7eaa7 100644 --- a/megatron/model/fused_layer_norm.py +++ b/megatron/model/fused_layer_norm.py @@ -28,7 +28,7 @@ from torch.nn.parameter import Parameter import importlib import torch -from torch.nn import functional as F +import torch.nn.functional as F global fused_mix_prec_layer_norm_cuda fused_mix_prec_layer_norm_cuda = None @@ -102,14 +102,13 @@ def reset_parameters(self): def forward(self, input): + if self.layernorm_tp_auto_sync: torch.distributed.all_reduce(self.weight, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) torch.distributed.all_reduce(self.bias, op=torch.distributed.ReduceOp.AVG, group=mpu.get_tensor_model_parallel_group()) -= - # CPU path is here for unittest sake. - if not input.is_cuda: - print("WARNING! The input of FusedLayerNorm should be on the GPU." 
- "This warning should only be triggered in the FusedLayerNorm unit tests.") - return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps) - return FusedLayerNormAffineFunction.apply( - input, self.weight, self.bias, self.normalized_shape,self.eps) + + if self.use_meg_ds_fused_layer_norm: + return FusedLayerNormAffineFunction.apply( + input, self.weight, self.bias, self.normalized_shape, self.eps) + else: + return F.layer_norm(input, self.normalized_shape, self.weight, self.bias) From 2963caeaf1c55cfc6aee2eae39d999fcc7ed1ecf Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 22 Nov 2022 14:18:39 +0400 Subject: [PATCH 07/37] Add LUMI eval compat --- examples/run_evalharness_deepspeed.md | 1 + examples/run_evalharness_lumi.sh | 113 ++++++++++++++++++ tasks/eval_harness/download.py | 1 + tasks/eval_harness/evaluate.py | 2 +- .../deepspeed_to_megatron.py | 2 +- 5 files changed, 117 insertions(+), 2 deletions(-) create mode 100644 examples/run_evalharness_lumi.sh diff --git a/examples/run_evalharness_deepspeed.md b/examples/run_evalharness_deepspeed.md index 695d9d0aa..60f380d9c 100644 --- a/examples/run_evalharness_deepspeed.md +++ b/examples/run_evalharness_deepspeed.md @@ -15,6 +15,7 @@ Get lm-eval harness (https://github.com/EleutherAI/lm-evaluation-harness) and `b start-prod pip install best-download==0.0.7 pip install git+https://github.com/EleutherAI/lm-evaluation-harness +pip install --upgrade scipy ``` 2. Pre-download needed datasets diff --git a/examples/run_evalharness_lumi.sh b/examples/run_evalharness_lumi.sh new file mode 100644 index 000000000..71fcd1e6d --- /dev/null +++ b/examples/run_evalharness_lumi.sh @@ -0,0 +1,113 @@ +#!/bin/bash +#SBATCH --exclude=nid005159 +#SBATCH --nodes=1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=32 +#SBATCH --mem=256G +#SBATCH -p eap +#SBATCH -t 2-0:00:00 +#SBATCH --gpus-per-node=mi250:1 +#SBATCH --exclusive=user +#SBATCH --hint=nomultithread +#SBATCH --account=project_462000119 +#SBATCH -o logs/%j.out +#SBATCH -e logs/%j.err + +# if run without sbatch, invoke here +if [ -z $SLURM_JOB_ID ]; then + mkdir -p logs + sbatch "$0" + exit +fi + +set -euo pipefail + +# symlink logs/latest_eval.out and logs/latest_eval.err +ln -f -s $SLURM_JOB_ID.out logs/latest_eval.out +ln -f -s $SLURM_JOB_ID.err logs/latest_eval.err + +# Data +CHECKPOINT_PATH=/scratch/project_462000119/muennighoff/nov-2022-optimization/checkpoints/global_step10 +VARIANT=global_step10 + +export HF_DATASETS_OFFLINE=1 +export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache + +VOCAB_FILE="gpt2/vocab.json" +MERGE_FILE="gpt2/merges.txt" + +PP_SIZE=1 +TP_SIZE=1 +# different from the training MICRO_BATCH_SIZE - no optim memory, so can do bigger BS +# make as big as it can fit into gpu w/o OOM, but not too close to 100% +EVAL_MICRO_BATCH_SIZE=1 +MICRO_BS_MULTIPLIER=1 + +# Model parameters +SEQ_LEN=2048 + +# Dummy arguments +MEGATRON_REQUIRED_ARGS=" \ + --num-layers -1 \ + --hidden-size -1 \ + --num-attention-heads -1 \ + --seq-length -1 \ + --max-position-embeddings -1 \ +" + +ZERO_STAGE=0 + +mkdir -p ds_configs +DS_CONFIG_PATH="ds_configs/$SLURM_JOB_ID.json" + +cat < $DS_CONFIG_PATH +{ + "train_micro_batch_size_per_gpu": 1, + "train_batch_size": 1, + "gradient_clipping": 1.0, + "zero_optimization": { + "stage": $ZERO_STAGE + }, + "bf16": { + "enabled": true + }, + "steps_per_print": 2000, + "wall_clock_breakdown": false +} +EOF + +DEEPSPEED_ARGS=" \ + --deepspeed \ + --deepspeed_config $DS_CONFIG_PATH \ + --zero-stage $ZERO_STAGE \ + " + 
+CMD="Megatron-DeepSpeed/tasks/eval_harness/evaluate.py \ + --load $CHECKPOINT_PATH \ + --results_path $VARIANT-results.json \ + --tensor-model-parallel-size $TP_SIZE \ + --pipeline-model-parallel-size $PP_SIZE \ + --vocab-file $VOCAB_FILE \ + --merge-file $MERGE_FILE \ + --micro-batch-size $EVAL_MICRO_BATCH_SIZE \ + --no-load-optim \ + --no-load-rng \ + --bf16 \ + --inference \ + --seq-length $SEQ_LEN \ + --task_list piqa \ + --intermed_results \ + --adaptive_seq_len \ + --micro_bs_multiplier $MICRO_BS_MULTIPLIER \ + $MEGATRON_REQUIRED_ARGS \ + $DEEPSPEED_ARGS \ + " + +echo $CMD + +echo "START $SLURM_JOBID: $(date)" + +srun --label launch.sh $CMD + +echo "END $SLURM_JOBID: $(date)" + diff --git a/tasks/eval_harness/download.py b/tasks/eval_harness/download.py index d2abcd83a..be5d5c303 100644 --- a/tasks/eval_harness/download.py +++ b/tasks/eval_harness/download.py @@ -1,6 +1,7 @@ # Downloads the specified taks in the evaluation harness # This is particularly useful when running in environments where the GPU nodes # do not have internet access. This way we can pre-download them and use the cached data-set during evaluation. +# May want to set a cache before, e.g. export HF_DATASETS_CACHE=/scratch/project_462000119/ds_cache from lm_eval import tasks from lm_eval.tasks import ALL_TASKS diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 68dd649fd..7e0cdb60e 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -260,7 +260,7 @@ def tokenizer_encode(self, text): from megatron.initialize import initialize_megatron import megatron -from tools.convert_checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint +from deepspeed.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint from tools.convert_checkpoint.deepspeed_to_megatron import _create_rank_checkpoint def override_args(args, override_args, skip_keys, skip_if_specified_keys): diff --git a/tools/convert_checkpoint/deepspeed_to_megatron.py b/tools/convert_checkpoint/deepspeed_to_megatron.py index 74e5ca7c9..08471d0bb 100755 --- a/tools/convert_checkpoint/deepspeed_to_megatron.py +++ b/tools/convert_checkpoint/deepspeed_to_megatron.py @@ -4,7 +4,7 @@ import os import torch from collections import OrderedDict -from .deepspeed_checkpoint import ARGS_KEY, DeepSpeedCheckpoint +from deepspeed.checkpoint.deepspeed_checkpoint import DeepSpeedCheckpoint MODEL_KEY = 'model' ARGS_KEY = 'args' From 32f039c2d08da415d49559cad907d50a6ed971a9 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 22 Nov 2022 15:37:24 +0400 Subject: [PATCH 08/37] Update tasks --- examples/run_evalharness_lumi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_evalharness_lumi.sh b/examples/run_evalharness_lumi.sh index 71fcd1e6d..1721d91d2 100644 --- a/examples/run_evalharness_lumi.sh +++ b/examples/run_evalharness_lumi.sh @@ -95,7 +95,7 @@ CMD="Megatron-DeepSpeed/tasks/eval_harness/evaluate.py \ --bf16 \ --inference \ --seq-length $SEQ_LEN \ - --task_list piqa \ + --task_list 
copa,piqa,rte,winogrande,hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions \ --intermed_results \ --adaptive_seq_len \ --micro_bs_multiplier $MICRO_BS_MULTIPLIER \ From 2ca2338c5633d0e5e529bbf7a613030564aa442b Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Tue, 22 Nov 2022 17:52:05 +0200 Subject: [PATCH 09/37] add inverse_sqrt lr decay style --- megatron/arguments.py | 2 +- megatron/learning_rates.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index d2499d149..f06ff6dbd 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -610,7 +610,7 @@ def _add_learning_rate_args(parser): 'and initial warmup, the learing rate at each ' 'iteration would be different.') group.add_argument('--lr-decay-style', type=str, default='linear', - choices=['constant', 'linear', 'cosine'], + choices=['constant', 'linear', 'cosine', 'inverse_sqrt'], help='Learning rate decay function.') group.add_argument('--lr-decay-iters', type=int, default=None, help='number of iterations to decay learning rate over,' diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index ae1fcdb2b..8fa6f40af 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -68,6 +68,9 @@ def get_lr(self): if self.num_steps == self.warmup_steps and \ self.decay_tokens is not None: self.warmup_tokens = self.num_tokens + # if self.decay_style == 'inverse_sqrt': + # # use constant warmup for inverse_sqrt + # return 1e-2 return self.max_lr * float(self.num_steps) / \ float(self.warmup_steps) @@ -102,6 +105,9 @@ def get_lr(self): coeff = (1.0 - decay_ratio) elif self.decay_style == 'cosine': coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) + elif self.decay_style == 
'inverse_sqrt': + return self.max_lr * math.sqrt(float(self.warmup_steps)) / \ + math.sqrt(float(num_steps_)) else: raise Exception('{} decay style is not supported.'.format( self.decay_style)) From ad60932feb79513fdfbbd5f4d82dfe25fc08eedd Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Tue, 22 Nov 2022 17:54:55 +0200 Subject: [PATCH 10/37] fix no warmup case --- megatron/learning_rates.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 8fa6f40af..e23758d95 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -106,8 +106,8 @@ def get_lr(self): elif self.decay_style == 'cosine': coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) elif self.decay_style == 'inverse_sqrt': - return self.max_lr * math.sqrt(float(self.warmup_steps)) / \ - math.sqrt(float(num_steps_)) + return self.max_lr * math.sqrt(float(max(self.warmup_steps, 1))) / \ + math.sqrt(float(max(num_steps_,1))) else: raise Exception('{} decay style is not supported.'.format( self.decay_style)) From 0823ad8c1dc8a1b444dfc679648658892980b0fa Mon Sep 17 00:00:00 2001 From: Nouamane Tazi Date: Wed, 23 Nov 2022 15:05:27 +0100 Subject: [PATCH 11/37] use t5x formula Co-authored-by: Thomas Wang <24695242+thomasw21@users.noreply.github.com> --- megatron/learning_rates.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index e23758d95..73e16d519 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -106,8 +106,7 @@ def get_lr(self): elif self.decay_style == 'cosine': coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) elif self.decay_style == 'inverse_sqrt': - return self.max_lr * math.sqrt(float(max(self.warmup_steps, 1))) / \ - math.sqrt(float(max(num_steps_,1))) + return self.max_lr / math.sqrt(float(max(num_steps_, 1))) else: raise Exception('{} decay style is not supported.'.format( self.decay_style)) From a093db6fe567eae48d379b992b8fb16c97b9ef9f Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Wed, 23 Nov 2022 15:11:54 +0100 Subject: [PATCH 12/37] avoid num_steps > decay_steps case --- megatron/learning_rates.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 73e16d519..9a46d5ccd 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -78,6 +78,9 @@ def get_lr(self): if self.decay_style == 'constant': return self.max_lr + if self.decay_style == 'inverse_sqrt': + return self.max_lr / math.sqrt(float(max(num_steps_, 1))) + if self.decay_tokens is None: # step-based decay @@ -105,8 +108,6 @@ def get_lr(self): coeff = (1.0 - decay_ratio) elif self.decay_style == 'cosine': coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) - elif self.decay_style == 'inverse_sqrt': - return self.max_lr / math.sqrt(float(max(num_steps_, 1))) else: raise Exception('{} decay style is not supported.'.format( self.decay_style)) From b4601b9ef327025f678d01ef0ebf0f6d3c9be1a5 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Wed, 23 Nov 2022 15:15:17 +0100 Subject: [PATCH 13/37] remove casting as math.sqrt does that --- megatron/learning_rates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 9a46d5ccd..777133ce8 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -79,7 +79,7 @@ def get_lr(self): return self.max_lr if self.decay_style == 
'inverse_sqrt': - return self.max_lr / math.sqrt(float(max(num_steps_, 1))) + return self.max_lr / math.sqrt(max(num_steps_, 1)) if self.decay_tokens is None: # step-based decay From 4dae1399e4b5a2fa2f1659ccf8b6b4668f9ded62 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Wed, 23 Nov 2022 15:28:42 +0100 Subject: [PATCH 14/37] add lr-warmup-style argument taking "constant" or "linear" values --- megatron/arguments.py | 2 ++ megatron/learning_rates.py | 20 ++++++++++++-------- megatron/training.py | 1 + 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index f06ff6dbd..946ccda9a 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -621,6 +621,8 @@ def _add_learning_rate_args(parser): group.add_argument('--lr-decay-tokens', type=int, default=None, help='number of tokens to decay learning rate over,' ' If not None will override iter/sample-based decay') + group.add_argument('--lr-warmup-style', type=str, default='linear', + choices=['constant', 'linear']), group.add_argument('--lr-warmup-fraction', type=float, default=None, help='fraction of lr-warmup-(iters/samples) to use ' 'for warmup (as a float)') diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 777133ce8..77a74c683 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -23,7 +23,7 @@ class AnnealingLR(object): """Anneals the learning rate.""" def __init__(self, optimizer, max_lr, min_lr, - warmup_steps, decay_steps, decay_style, + warmup_steps, decay_steps, decay_style, warmup_style, use_checkpoint_lr_scheduler=True, override_lr_scheduler=False): args = get_args() @@ -46,6 +46,7 @@ def __init__(self, optimizer, max_lr, min_lr, self.warmup_tokens = 0 self.decay_style = decay_style + self.warmup_style = warmup_style self.override_lr_scheduler = override_lr_scheduler self.use_checkpoint_lr_scheduler = use_checkpoint_lr_scheduler @@ -63,23 +64,26 @@ def get_lr(self): """Learning rate decay functions from: https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" - # Use linear warmup for the initial part. + # Use warmup for the initial part. if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: if self.num_steps == self.warmup_steps and \ self.decay_tokens is not None: self.warmup_tokens = self.num_tokens - # if self.decay_style == 'inverse_sqrt': - # # use constant warmup for inverse_sqrt - # return 1e-2 - return self.max_lr * float(self.num_steps) / \ - float(self.warmup_steps) + if self.warmup_style == 'linear': + return self.max_lr * float(self.num_steps) / \ + float(self.warmup_steps) + elif self.warmup_style == 'constant': + return self.max_lr + else: + raise ValueError('Unknown warmup style: {}'.format( + self.warmup_style)) # If the learning rate is constant, just return the initial value. 
if self.decay_style == 'constant': return self.max_lr if self.decay_style == 'inverse_sqrt': - return self.max_lr / math.sqrt(max(num_steps_, 1)) + return self.max_lr / math.sqrt(max(self.num_steps - self.warmup_steps, 1)) if self.decay_tokens is None: # step-based decay diff --git a/megatron/training.py b/megatron/training.py index bd00bc77e..d86c51440 100644 --- a/megatron/training.py +++ b/megatron/training.py @@ -361,6 +361,7 @@ def get_learning_rate_scheduler(optimizer): warmup_steps=warmup_steps, decay_steps=decay_steps, decay_style=args.lr_decay_style, + warmup_style=args.lr_warmup_style, use_checkpoint_lr_scheduler=args.use_checkpoint_lr_scheduler, override_lr_scheduler=args.override_lr_scheduler) From 5fbb1dd54c39d98059dc8ee688225981cb4466c1 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Wed, 23 Nov 2022 19:28:13 +0100 Subject: [PATCH 15/37] refactor num_steps_ --- megatron/learning_rates.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 77a74c683..b21890dab 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -83,7 +83,8 @@ def get_lr(self): return self.max_lr if self.decay_style == 'inverse_sqrt': - return self.max_lr / math.sqrt(max(self.num_steps - self.warmup_steps, 1)) + num_steps_ = self.num_steps - self.warmup_steps + return self.max_lr / math.sqrt(max(num_steps_, 1)) if self.decay_tokens is None: # step-based decay From 6299fb24bb169b58581d103403cd10b489ec69de Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Wed, 23 Nov 2022 19:30:10 +0100 Subject: [PATCH 16/37] docs --- megatron/arguments.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index 946ccda9a..ed5cf2db6 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -622,7 +622,8 @@ def _add_learning_rate_args(parser): help='number of tokens to decay learning rate over,' ' If not None will override iter/sample-based decay') group.add_argument('--lr-warmup-style', type=str, default='linear', - choices=['constant', 'linear']), + choices=['constant', 'linear'], help='Learning rate ' + 'warmup function.') group.add_argument('--lr-warmup-fraction', type=float, default=None, help='fraction of lr-warmup-(iters/samples) to use ' 'for warmup (as a float)') From 4e8665093377eaf1713edfb8abbcf062a8034ca1 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Thu, 24 Nov 2022 16:17:28 +0100 Subject: [PATCH 17/37] fix formulas --- megatron/learning_rates.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index b21890dab..5bc0c02ea 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -64,6 +64,19 @@ def get_lr(self): """Learning rate decay functions from: https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" + + if self.decay_style == 'inverse_sqrt': + if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: + if self.warmup_style == 'linear': + return self.max_lr * self.num_steps / (self.warmup_steps * self.warmup_steps**0.5) + elif self.warmup_style == 'constant': + return self.max_lr / self.warmup_steps**0.5 + else: + raise ValueError('Unknown warmup style: {}'.format( + self.warmup_style)) + + return self.max_lr / (max(self.num_steps, 1))**0.5 + # Use warmup for the initial part. 
if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: if self.num_steps == self.warmup_steps and \ @@ -82,9 +95,6 @@ def get_lr(self): if self.decay_style == 'constant': return self.max_lr - if self.decay_style == 'inverse_sqrt': - num_steps_ = self.num_steps - self.warmup_steps - return self.max_lr / math.sqrt(max(num_steps_, 1)) if self.decay_tokens is None: # step-based decay From 50c693599725bbab9d5416a880c822652d5f8d2c Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Thu, 24 Nov 2022 20:04:32 +0100 Subject: [PATCH 18/37] fix formula Co-authored-by: thomasw21 <24695242+thomasw21@users.noreply.github.com> --- megatron/learning_rates.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 5bc0c02ea..4ae63d3ca 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -65,18 +65,6 @@ def get_lr(self): https://openreview.net/pdf?id=BJYwwY9ll pg. 4""" - if self.decay_style == 'inverse_sqrt': - if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: - if self.warmup_style == 'linear': - return self.max_lr * self.num_steps / (self.warmup_steps * self.warmup_steps**0.5) - elif self.warmup_style == 'constant': - return self.max_lr / self.warmup_steps**0.5 - else: - raise ValueError('Unknown warmup style: {}'.format( - self.warmup_style)) - - return self.max_lr / (max(self.num_steps, 1))**0.5 - # Use warmup for the initial part. if self.warmup_steps > 0 and self.num_steps <= self.warmup_steps: if self.num_steps == self.warmup_steps and \ @@ -96,6 +84,12 @@ def get_lr(self): return self.max_lr + # If linear + # In warmup phase: lr = max_lr + # In decay phase: lr = max_lr * sqrt(warmup_steps) / sqrt(num_steps) + if self.decay_style == 'inverse_sqrt': + return self.max_lr * (max(self.warmup_steps, 1) / max(self.num_steps, 1))**0.5 + if self.decay_tokens is None: # step-based decay From 5c642dd30eceab840e186f44f06ac221ffc277a7 Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Mon, 28 Nov 2022 13:38:00 +0100 Subject: [PATCH 19/37] correct comment --- megatron/learning_rates.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index 4ae63d3ca..e74534775 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -84,7 +84,7 @@ def get_lr(self): return self.max_lr - # If linear + # If constant decay style # In warmup phase: lr = max_lr # In decay phase: lr = max_lr * sqrt(warmup_steps) / sqrt(num_steps) if self.decay_style == 'inverse_sqrt': From 1b14a28cfc95dcb05be0bf79b4ba72394a490f1d Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Mon, 28 Nov 2022 15:26:41 +0100 Subject: [PATCH 20/37] note about replicating t5x --- megatron/learning_rates.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/learning_rates.py b/megatron/learning_rates.py index e74534775..c886b9814 100644 --- a/megatron/learning_rates.py +++ b/megatron/learning_rates.py @@ -87,6 +87,7 @@ def get_lr(self): # If constant decay style # In warmup phase: lr = max_lr # In decay phase: lr = max_lr * sqrt(warmup_steps) / sqrt(num_steps) + # Note: To replicate t5x check https://github.com/TurkuNLP/Megatron-DeepSpeed/pull/2 if self.decay_style == 'inverse_sqrt': return self.max_lr * (max(self.warmup_steps, 1) / max(self.num_steps, 1))**0.5 From 5365f41f37b8f8d8df9be8a52d2595c760de08ef Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Tue, 6 Dec 2022 20:26:44 +0100 Subject: [PATCH 21/37] quick fix for upper triang 
masked softmax cuda kernel for seq_len < 8192 --- .../scaled_upper_triang_masked_softmax.h | 20 +++++++++++++++++-- ...scaled_upper_triang_masked_softmax_cuda.cu | 2 +- .../fused_kernels/tests/test_fused_kernels.py | 4 ++++ megatron/initialize.py | 2 +- megatron/model/fused_softmax.py | 4 ++-- 5 files changed, 26 insertions(+), 6 deletions(-) diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h index ee140c037..4e6a467ce 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h @@ -341,7 +341,7 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT(softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -416,6 +416,14 @@ void dispatch_scaled_upper_triang_masked_softmax_forward( scaled_upper_triang_masked_softmax_warp_forward <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_forward + <<>>(dst, src, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } @@ -432,7 +440,7 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( int softmax_elements_stride, int attn_batches) { - TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 2048 ); + TORCH_INTERNAL_ASSERT( softmax_elements >= 0 && softmax_elements <= 8192 ); if (softmax_elements == 0) { return; } else { @@ -507,6 +515,14 @@ void dispatch_scaled_upper_triang_masked_softmax_backward( scaled_upper_triang_masked_softmax_warp_backward <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); break; + case 12: // 4096 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; + case 13: // 8192 + scaled_upper_triang_masked_softmax_warp_backward + <<>>(grad_input, grad, output, scale, batch_count, softmax_elements_stride, softmax_elements); + break; default: break; } diff --git a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu index 59e452584..4aa9a702a 100644 --- a/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu +++ b/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu @@ -37,7 +37,7 @@ torch::Tensor fwd_cuda( // input is a 3d tensor with dimensions [attn_batches, seq_len, seq_len] const int attn_batches = input.size(0); const int seq_len = input.size(1); - TORCH_INTERNAL_ASSERT(seq_len <= 2048); + TORCH_INTERNAL_ASSERT(seq_len <= 8192); // Output auto act_options = input.options().requires_grad(false); diff --git a/megatron/fused_kernels/tests/test_fused_kernels.py b/megatron/fused_kernels/tests/test_fused_kernels.py index f8d5027a1..3e7deb913 100644 --- a/megatron/fused_kernels/tests/test_fused_kernels.py +++ b/megatron/fused_kernels/tests/test_fused_kernels.py @@ -3,6 +3,10 @@ import torch from torch.nn import LayerNorm +import sys +# add to path +sys.path.append("/home/nouamane/projects/Megatron-DeepSpeed/") +import megatron 
from megatron.model.enums import AttnMaskType from megatron.model.fused_layer_norm import MixedFusedLayerNorm from megatron.model.fused_softmax import FusedScaleMaskSoftmax diff --git a/megatron/initialize.py b/megatron/initialize.py index b967a228a..2e92b15af 100644 --- a/megatron/initialize.py +++ b/megatron/initialize.py @@ -194,7 +194,7 @@ def _compile_dependencies(): args.micro_batch_size # Constraints on sequence length and attn_batch_size to enable warp based # optimization and upper triangular optimization (for causal mask) - custom_kernel_constraint = seq_len > 16 and seq_len <=2048 and \ + custom_kernel_constraint = seq_len > 16 and seq_len <=8192 and \ seq_len % 4 == 0 and attn_batch_size % 4 == 0 # Print a warning. if not ((args.fp16 or args.bf16) and diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 07192e2bf..57c6c4769 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -167,11 +167,11 @@ def is_kernel_available(self, mask, b, np, sq, sk): if ( self.scaled_masked_softmax_fusion # user want to fuse and self.input_in_float16 # input must be fp16 - and 16 < sk <= 4096 # sk must be 16 ~ 4096 + and 16 < sk <= 8192 # sk must be 16 ~ 8192 and sq % 4 == 0 # sq must be divisor of 4 and attn_batches % 4 == 0 # np * b must be divisor of 4 ): - if 0 <= sk <= 4096: + if 0 <= sk <= 8192: batch_per_block = self.get_batch_per_block(sq, sk, b, np) if self.attn_mask_type == AttnMaskType.causal: From c41cc5e0e2cbc4046dd9646c925f162a0d16ee41 Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Wed, 7 Dec 2022 14:04:33 +0200 Subject: [PATCH 22/37] Use torch.multiprocessing.set_start_method('spawn') Resolves DataLoader segfaults seen in some multinode runs with PP > 1. --- megatron/data/indexed_dataset.py | 2 +- pretrain_gpt.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index d0d312544..0322cba86 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -513,7 +513,7 @@ def __getstate__(self): return self._path def __setstate__(self, state): - self._do_init(state) + self._do_init(state, False) def _do_init(self, path, skip_warmup): self._path = path diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 5f8b104fc..2d0691f4e 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -228,6 +228,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): @record def main(): + torch.multiprocessing.set_start_method('spawn') pretrain(train_valid_test_datasets_provider, model_provider, forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) From 6732bc9b568e568498fd291895579a359afd080b Mon Sep 17 00:00:00 2001 From: Sampo Pyysalo Date: Fri, 9 Dec 2022 08:44:46 +0200 Subject: [PATCH 23/37] skip_warmup on __setstate__ --- megatron/data/indexed_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/megatron/data/indexed_dataset.py b/megatron/data/indexed_dataset.py index 0322cba86..403e4b5d2 100644 --- a/megatron/data/indexed_dataset.py +++ b/megatron/data/indexed_dataset.py @@ -513,7 +513,7 @@ def __getstate__(self): return self._path def __setstate__(self, state): - self._do_init(state, False) + self._do_init(state, True) def _do_init(self, path, skip_warmup): self._path = path From ab29faf5e6106ece378019e1712820095e75e5a7 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Wed, 28 Dec 2022 18:09:06 +0400 Subject: [PATCH 24/37] Copy preliminary UL2 --- megatron/arguments.py | 50 
+++++- megatron/data/dataset_utils.py | 61 ++++++- megatron/data/t5_dataset.py | 52 ++++-- megatron/data/ul2_dataset.py | 281 ++++++++++++++++++++++++++++++++ megatron/enums.py | 5 + megatron/model/fused_softmax.py | 2 +- megatron/tokenizer/tokenizer.py | 12 ++ pretrain_ul2.py | 185 +++++++++++++++++++++ 8 files changed, 628 insertions(+), 20 deletions(-) create mode 100644 megatron/data/ul2_dataset.py create mode 100644 pretrain_ul2.py diff --git a/megatron/arguments.py b/megatron/arguments.py index ed5cf2db6..cca7bd57d 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -24,7 +24,7 @@ import torch import deepspeed -from megatron.enums import PositionEmbeddingType +from megatron.enums import PositionEmbeddingType, UL2ModelType import megatron from megatron.logging import log_levels @@ -49,6 +49,7 @@ def parse_args(extra_args_provider=None, defaults={}, parser = _add_autoresume_args(parser) parser = _add_biencoder_args(parser) parser = _add_vit_args(parser) + parser = _add_ul2_args(parser) parser = _add_logging_args(parser) parser = _add_zero_args(parser) parser = _add_memoryopt_args(parser) @@ -309,6 +310,17 @@ def parse_args(extra_args_provider=None, defaults={}, "skip train iterations should be specified as two numbers, i.e. start-end" ) args.skip_train_iteration_range = skip_train_iteration_range + + args.ul2_model_type = UL2ModelType(args.ul2_model_type) + if ( + args.ul2_model_type is not UL2ModelType.ENCODER_DECODER + and args.decoder_seq_length is not None + ): + print( + f'WARNING: `--decoder_seq_length` is ignored when ' + f'`--ul2-model-type` is not ' + f'"{UL2ModelType.ENCODER_DECODER.value}"!' + ) if args.use_bnb_optimizer: try: @@ -1032,6 +1044,42 @@ def _add_vit_args(parser): return parser +def _add_ul2_args(parser): + group = parser.add_argument_group(title="UL2") + + group.add_argument('--ul2-model-type', type=str, default='ED', + choices=['ED', 'ND', 'CD'], + help='What type of model to use for UL2 pretraining. ' + 'ED = encoder-decoder; ND = non-causal decoder-only; ' + 'CD = causal decoder-only') + group.add_argument('--ul2-denoiser-ratios', nargs='+', type=float, + default=None, + help='Probability of each denoising objective to be ' + 'selected. Uniform distribution by default.') + group.add_argument('--ul2-denoisers', nargs='+', type=str, + default=['R', 'R', 'S', 'X', 'X', 'X', 'X'], + choices=['R', 'S', 'X'], + help='What type of UL2 denoising objective the other ' + 'UL2 configurations refer to.') + group.add_argument('--ul2-mean-span-lengths', nargs='+', type=float, + default=[3, 8, 0.25, 3, 8, 64, 64], + help='Mean length for sampling span lengths. 
' + 'Numbers < 1 indicate a mean length of the sequence ' + 'length times that number.') + group.add_argument('--ul2-mask-ratios', nargs='+', type=float, + default=[0.15, 0.15, 0.25, 0.5, 0.5, 0.15, 0.5], + help='Ratio of masked token in the full sequence.') + group.add_argument('--ul2-r-denoiser-token', type=str, default='[R]', + help='What token to prepend for the UL2 R-denoising ' + 'objective.') + group.add_argument('--ul2-s-denoiser-token', type=str, default='[S]', + help='What token to prepend for the UL2 S-denoising ' + 'objective.') + group.add_argument('--ul2-x-denoiser-token', type=str, default='[X]', + help='What token to prepend for the UL2 X-denoising ' + 'objective.') + + return parser def _add_zero_args(parser): """Text generate arguments.""" diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 3841e263e..60d4e0d90 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -18,6 +18,7 @@ # https://github.com/google-research/albert/blob/master/create_pretraining_data.py # with some modifications. +from enum import Enum import math import os import time @@ -37,8 +38,16 @@ DSET_TYPE_BERT = 'standard_bert' DSET_TYPE_ICT = 'ict' DSET_TYPE_T5 = 't5' +DSET_TYPE_UL2 = 'ul2' -DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5] +DSET_TYPES = [DSET_TYPE_BERT, DSET_TYPE_ICT, DSET_TYPE_T5, DSET_TYPE_UL2] + + +class SamplingStyle(Enum): + POISSON = 'poisson' + GEOMETRIC = 'geometric' + UNIFORM = 'uniform' + NORMAL = 'normal' def analyze_data_prefix(data_prefix): @@ -194,9 +203,16 @@ def create_masked_lm_predictions(tokens, favor_longer_ngram=False, do_permutation=False, geometric_dist=False, - masking_style="bert"): + masking_style="bert", + sampling_style=SamplingStyle.POISSON, + prefix_lm=False): """Creates the predictions for the masked LM objective. Note: Tokens here are vocab ids and not text tokens.""" + if not isinstance(sampling_style, SamplingStyle): + sampling_style = SamplingStyle(sampling_style) + # Backward-compatibility + if geometric_dist: + sampling_style = SamplingStyle.GEOMETRIC cand_indexes = [] # Note(mingdachen): We create a list for recording if the piece is @@ -235,18 +251,24 @@ def create_masked_lm_predictions(tokens, max(1, int(round(len(tokens) * masked_lm_prob)))) ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) - if not geometric_dist: + if sampling_style is SamplingStyle.POISSON: # Note(mingdachen): # By default, we set the probilities to favor shorter ngram sequences. pvals = 1. / np.arange(1, max_ngrams + 1) pvals /= pvals.sum(keepdims=True) if favor_longer_ngram: pvals = pvals[::-1] + elif sampling_style is SamplingStyle.NORMAL: + normal_mean = (max_ngrams + 1) / 2 ngram_indexes = [] for idx in range(len(cand_indexes)): ngram_index = [] for n in ngrams: + if prefix_lm: + last_cand_index_index = min(idx + n - 1, len(cand_indexes) - 1) + if cand_indexes[last_cand_index_index][-1] < len(tokens) - 1: + continue ngram_index.append(cand_indexes[idx:idx + n]) ngram_indexes.append(ngram_index) @@ -266,15 +288,25 @@ def create_masked_lm_predictions(tokens, if index in covered_indexes: continue - if not geometric_dist: + if sampling_style is SamplingStyle.POISSON: n = np_rng.choice(ngrams[:len(cand_index_set)], p=pvals[:len(cand_index_set)] / pvals[:len(cand_index_set)].sum(keepdims=True)) - else: + elif sampling_style is SamplingStyle.GEOMETRIC: # Sampling "n" from the geometric distribution and clipping it to # the max_ngrams. 
Using p=0.2 default from the SpanBERT paper # https://arxiv.org/pdf/1907.10529.pdf (Sec 3.1) n = min(np_rng.geometric(0.2), max_ngrams) + elif sampling_style is SamplingStyle.UNIFORM: + n = np_rng.choice(ngrams[:len(cand_index_set)]) + elif sampling_style is SamplingStyle.NORMAL: + n = round(np.clip( + np_rng.normal(loc=normal_mean), + 1, + len(cand_index_set), + )) + else: + raise ValueError('unknown sampling style') index_set = sum(cand_index_set[n - 1], []) n -= 1 @@ -522,6 +554,7 @@ def build_dataset(index, name): from megatron.data.bert_dataset import BertDataset from megatron.data.ict_dataset import ICTDataset from megatron.data.t5_dataset import T5Dataset + from megatron.data.ul2_dataset import UL2Dataset dataset = None if splits[index + 1] > splits[index]: # Get the pointer to the original doc-idx so we can set it later. @@ -560,6 +593,24 @@ def build_dataset(index, name): short_seq_prob=short_seq_prob, **kwargs ) + elif dataset_type == DSET_TYPE_UL2: + args = get_args() + dataset = UL2Dataset( + indexed_dataset=indexed_dataset, + model_type=args.ul2_model_type, + denoiser_ratios=args.ul2_denoiser_ratios, + denoisers=args.ul2_denoisers, + mean_span_lengths=args.ul2_mean_span_lengths, + mask_ratios=args.ul2_mask_ratios, + denoiser_tokens={ + 'R': args.ul2_r_denoiser_token, + 'S': args.ul2_s_denoiser_token, + 'X': args.ul2_x_denoiser_token, + }, + max_seq_length_dec=max_seq_length_dec, + short_seq_prob=short_seq_prob, + **kwargs, + ) elif dataset_type == DSET_TYPE_BERT: dataset = BertDataset( indexed_dataset=indexed_dataset, diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index 42110b923..af3374da6 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -26,6 +26,27 @@ get_samples_mapping ) + +class LengthExceededError(ValueError): + def __init__(self, msg=None): + if msg is None: + msg = ( + 'The sequence input became too long. ' + 'Try to increase `--seq-length` or `--encoder-seq-length`.' + ) + super().__init__(msg) + + +class DecoderLengthExceededError(ValueError): + def __init__(self, msg=None): + if msg is None: + msg = ( + 'The sequence input for the decoder became too long. ' + 'Try to increase `--decoder-seq-length`.' 
+ ) + super().__init__(msg) + + class T5Dataset(torch.utils.data.Dataset): def __init__(self, name, indexed_dataset, data_prefix, @@ -157,38 +178,41 @@ def build_training_sample(sample, target_seq_length, return train_sample -def pad_and_convert_to_numpy(tokens, masked_positions, - masked_labels, pad_id, - max_seq_length, max_seq_length_dec, - masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): - """Pad sequences and convert them to numpy.""" - +def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None): sentinel_tokens = collections.deque(sentinel_tokens) t5_input = [] (t5_decoder_in, t5_decoder_out) = ([bos_id], []) (start_index, end_index) = (0, None) for span in masked_spans: flag = sentinel_tokens.popleft() - # Append the same tokens in decoder input and output t5_decoder_in.append(flag) t5_decoder_in.extend(span.label) t5_decoder_out.append(flag) t5_decoder_out.extend(span.label) - end_index = span.index[0] t5_input.extend(tokens[start_index: end_index]) t5_input.append(flag) - # the next start index is the token after the last span token start_index = span.index[-1] + 1 - # Add token to the t5_decoder_out t5_decoder_out.append(eos_id) # Add the remaining tokens to the t5 input t5_input.extend(tokens[start_index:]) + return t5_input, t5_decoder_in, t5_decoder_out + + +def pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, + max_seq_length, max_seq_length_dec, + masked_spans=None, bos_id=None, + eos_id=None, sentinel_tokens=None): + """Pad sequences and convert them to numpy.""" + + t5_input, t5_decoder_in, t5_decoder_out = merge_subsequent_masks( + tokens, masked_spans, bos_id, eos_id, sentinel_tokens) # assert (len(t5_input) - len(masked_spans)) + \ # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) @@ -198,7 +222,8 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Encoder-side padding mask. num_tokens = len(t5_input) padding_length = max_seq_length - num_tokens - assert padding_length >= 0 + if padding_length < 0: + raise LengthExceededError() assert len(masked_positions) == len(masked_labels) # Tokens.. @@ -208,7 +233,8 @@ def pad_and_convert_to_numpy(tokens, masked_positions, # Decoder-side padding mask. num_tokens_dec = len(t5_decoder_in) padding_length_dec = max_seq_length_dec - num_tokens_dec - assert padding_length_dec >= 0 + if padding_length_dec < 0: + raise DecoderLengthExceededError() filler_dec = [pad_id] * padding_length_dec tokens_dec_in = np.array(t5_decoder_in + filler_dec, dtype=np.int64) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py new file mode 100644 index 000000000..4f80c973b --- /dev/null +++ b/megatron/data/ul2_dataset.py @@ -0,0 +1,281 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
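To make the span-merging step in the t5_dataset.py hunk above concrete, the following is a minimal, self-contained sketch of the same logic with made-up token ids and sentinel values (the real merge_subsequent_masks consumes the span namedtuples produced by create_masked_lm_predictions):

import collections

def merge_spans(tokens, spans, bos_id, eos_id, sentinels):
    # spans: list of (indices, labels) pairs in left-to-right order.
    sentinels = collections.deque(sentinels)
    enc_in, dec_in, dec_out = [], [bos_id], []
    start = 0
    for indices, labels in spans:
        flag = sentinels.popleft()
        dec_in += [flag] + labels                    # decoder input repeats the span
        dec_out += [flag] + labels                   # decoder target repeats it as well
        enc_in += tokens[start:indices[0]] + [flag]  # span replaced by one sentinel
        start = indices[-1] + 1
    dec_out.append(eos_id)
    enc_in += tokens[start:]                         # keep the unmasked tail
    return enc_in, dec_in, dec_out

tokens = [11, 12, 13, 14, 15, 16]
spans = [([1, 2], [12, 13]), ([4], [15])]            # mask positions 1-2 and 4
print(merge_spans(tokens, spans, bos_id=0, eos_id=1, sentinels=[900, 901]))
# ([11, 900, 14, 901, 16], [0, 900, 12, 13, 901, 15], [900, 12, 13, 901, 15, 1])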
+ +"""UL2-style dataset.""" + +import math + +import numpy as np + +from megatron import get_tokenizer +from megatron.data.dataset_utils import ( + create_masked_lm_predictions, + get_samples_mapping, + SamplingStyle +) +from megatron.data.t5_dataset import ( + LengthExceededError, + make_history_mask, + merge_subsequent_masks, + pad_and_convert_to_numpy, + T5Dataset, +) +from megatron.enums import UL2ModelType + + +def is_decoder_only(ul2_model_type): + """Return whether we use a decoder-only model.""" + assert isinstance(ul2_model_type, UL2ModelType) + return ul2_model_type is not UL2ModelType.ENCODER_DECODER + + +def is_prefix_lm(ul2_model_type): + """Return whether we use a non-causal decoder-only model.""" + assert isinstance(ul2_model_type, UL2ModelType) + return ul2_model_type is UL2ModelType.NON_CAUSAL_DECODER + + +class UL2Dataset(T5Dataset): + + def __init__(self, name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, model_type, + denoiser_ratios, denoisers, mean_span_lengths, + mask_ratios, denoiser_tokens, max_seq_length, + max_seq_length_dec, short_seq_prob, seed): + + if denoiser_ratios is None: + # Uniform distribution by default. + denoiser_ratios = [1 / len(denoisers)] * len(denoisers) + + assert ( + len(denoiser_ratios) == len(denoisers) + == len(mean_span_lengths) == len(mask_ratios) + ), ( + 'some UL2 configurations do not correspond to the amount of ' + 'denoising objectives' + ) + + super().__init__(name, indexed_dataset, data_prefix, + num_epochs, max_num_samples, None, + max_seq_length, max_seq_length_dec, + short_seq_prob, seed) + + # Params to store. + self.model_type = model_type + self.denoiser_ratios = [ + denoiser_ratio / sum(denoiser_ratios) + for denoiser_ratio in denoiser_ratios + ] + self.denoisers = [denoiser.upper() for denoiser in denoisers] + self.mean_span_lengths = mean_span_lengths + self.mask_ratios = mask_ratios + + # Vocab stuff. + tokenizer = get_tokenizer() + # Remove CLS token because we don't need it. + del self.cls_id + self.cls_ids = { + denoiser: tokenizer.vocab[token] + for (denoiser, token) in denoiser_tokens.items() + } + # cls_token = self.vocab_id_to_token_dict[tokenizer.cls] + # if cls_token not in self.cls_ids: + # self.cls_ids[cls_token] = tokenizer.cls + + # Filter out denoiser tokens. + self.sentinel_tokens = [ + token + for token in tokenizer.additional_special_tokens_ids + if token not in self.cls_ids.values() + ] + assert len(self.sentinel_tokens) > 0, \ + "Provide the argument --vocab-extra-ids 100 to the script" + + def __getitem__(self, idx): + + start_index, end_index, seq_length = self.samples_mapping[idx] + sample = [] + for index in range(start_index, end_index): + sample.append(self.indexed_dataset[index]) + # Note that this rng state should be numpy and not python since + # python randint is inclusive whereas the numpy one is exclusive. 
+ np_rng = np.random.RandomState(seed=(self.seed + idx)) + return build_training_sample(sample, seq_length, + self.max_seq_length, # needed for padding + self.max_seq_length_dec, + self.vocab_id_list, + self.vocab_id_to_token_dict, + self.cls_ids, self.sep_id, + self.mask_id, self.pad_id, + self.model_type, self.denoiser_ratios, + self.denoisers, self.mean_span_lengths, + self.mask_ratios, np_rng, self.bos_id, + self.eos_id, self.sentinel_tokens) + + +def build_training_sample(sample, target_seq_length, + max_seq_length, max_seq_length_dec, + vocab_id_list, vocab_id_to_token_dict, + cls_ids, sep_id, mask_id, pad_id, + model_type, denoiser_ratios, + denoisers, mean_span_lengths, + mask_ratios, np_rng, + bos_id=None, eos_id=None, + sentinel_tokens=None): + """Build training sample. + + Arguments: + sample: A list of sentences in which each sentence is a list token ids. + target_seq_length: Desired sequence length. + max_seq_length: Maximum length of the sequence. All values are padded to + this length. + vocab_id_list: List of vocabulary ids. Used to pick a random id. + vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. + cls_ids: Start of example ids. + sep_id: Separator id. + mask_id: Mask token id. + pad_id: Padding token id. + model_type: What type of model is used. + denoiser_ratios: Probability of each denoising objective to be selected. + denoisers: What type of UL2 denoising objective the other UL2 + configurations refer to. + mean_span_lengths: Mean length for sampling span lengths. Numbers < 1 + indicate a mean length of the sequence length times that number. + mask_ratios: Ratio of masked token in the full sequence. + np_rng: Random number genenrator. Note that this rng state should be + numpy and not python since python randint is inclusive for + the opper bound whereas the numpy one is exclusive. + bos_id: start of decoder example id + eos_id: end of generation id + sentinel_tokens: unique value to be substituted for every replaced span + """ + + # Denoiser selection + denoiser_index = np_rng.choice(np.arange(len(denoisers)), p=denoiser_ratios) + denoiser = denoisers[denoiser_index] + masked_lm_prob = mask_ratios[denoiser_index] + + assert target_seq_length <= max_seq_length + + # flatten sentences into one list + tokens = [token for sentence in sample for token in sentence] + + max_num_tokens = target_seq_length + # if is_decoder_only(model_type): + # # Keep space for repeated `extra_id` tokens; not the most data + # # efficient since we calculate this based on the maximum number + # # of possible `extra_id` tokens. + # safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + # truncated = len(tokens) > safe_max_seq_len + # tokens = tokens[:safe_max_seq_len] + # else: + # Truncate to `target_sequence_length`. + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] + + # Prepend objective token. + cls_id = cls_ids.get(denoiser) + if cls_id is None: + raise ValueError('unknown denoiser') + tokens = [cls_id] + tokens + + # Masking. 
+ max_predictions_per_seq = masked_lm_prob * len(tokens) + mean_ngrams = mean_span_lengths[denoiser_index] + if mean_ngrams < 1: + mean_ngrams = round(len(tokens) * mean_ngrams) + max_ngrams = mean_ngrams * 2 - 1 + + if denoiser == 'R' or denoiser == 'X': + sampling_style = SamplingStyle.NORMAL + prefix_lm = False + elif denoiser == 'S': + sampling_style = SamplingStyle.UNIFORM + prefix_lm = True + else: + raise ValueError('unknown denoiser') + ( + tokens, masked_positions, masked_labels, _, masked_spans, + ) = create_masked_lm_predictions( + tokens, vocab_id_list, vocab_id_to_token_dict, masked_lm_prob, + cls_id, sep_id, mask_id, max_predictions_per_seq, np_rng, + max_ngrams=max_ngrams, masking_style="t5", + sampling_style=sampling_style, prefix_lm=prefix_lm, + ) + + if is_decoder_only(model_type): + # Concatenate to one sequence. + tokens_enc, tokens_dec_in, labels = merge_subsequent_masks( + tokens, masked_spans, bos_id, eos_id, sentinel_tokens) + + # Move EOS tokens to end of sequence. + while tokens_enc[-1] == eos_id: + del tokens_enc[-1] + tokens_dec_in.append(eos_id) + labels.append(eos_id) + + num_labels = len(labels) + + # Move BOS token to start of sequence. + tokens_dec_in = tokens_dec_in[1:] + tokens = np.array(( + [bos_id] + + tokens_enc + + [sep_id] + + tokens_dec_in + ), dtype=np.int64) + labels = np.array(( + tokens_enc + + [sep_id] + + labels + ), dtype=np.int64) + + if max_seq_length - len(tokens) < 0: + raise LengthExceededError() + + loss_mask = np.zeros(len(tokens), dtype=np.int64) + loss_mask[-num_labels:] = 1 + + dec_mask = make_history_mask(tokens) + if is_prefix_lm(model_type): + dec_mask[:-num_labels, :-num_labels] = 1 + + train_sample = { + 'text': tokens, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'dec_mask': dec_mask, + } + else: + # Padding. 
+ tokens_enc, tokens_dec_in, labels, enc_mask, \ + dec_mask, enc_dec_mask, loss_mask \ + = pad_and_convert_to_numpy(tokens, masked_positions, + masked_labels, pad_id, max_seq_length, + max_seq_length_dec, masked_spans, + bos_id, eos_id, sentinel_tokens) + + train_sample = { + 'text_enc': tokens_enc, + 'text_dec': tokens_dec_in, + 'labels': labels, + 'loss_mask': loss_mask, + 'truncated': int(truncated), + 'enc_mask': enc_mask, + 'dec_mask': dec_mask, + 'enc_dec_mask': enc_dec_mask, + } + return train_sample diff --git a/megatron/enums.py b/megatron/enums.py index 90d00a071..2961cbb66 100644 --- a/megatron/enums.py +++ b/megatron/enums.py @@ -33,3 +33,8 @@ class PositionEmbeddingType(enum.Enum): rotary = 1 absolute = 2 alibi = 3 + +class UL2ModelType(enum.Enum): + ENCODER_DECODER = 'ED' + NON_CAUSAL_DECODER = 'ND' + CAUSAL_DECODER = 'CD' diff --git a/megatron/model/fused_softmax.py b/megatron/model/fused_softmax.py index 57c6c4769..ccdfbaaef 100644 --- a/megatron/model/fused_softmax.py +++ b/megatron/model/fused_softmax.py @@ -214,7 +214,7 @@ def forward_torch_softmax(self, input, mask): if self.scale is not None: input = input * self.scale - if self.attn_mask_type == AttnMaskType.causal: + if self.attn_mask_type == AttnMaskType.causal and mask is None: assert mask is None assert input.shape[2] == input.shape[3] mask = self.get_causal_mask(input.shape[2]) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 09304b1dd..c0356a12c 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -388,6 +388,18 @@ def eos(self): candidate = self.tokenizer.eos_token_id return self._check_token_candidate(candidate) + @property + def bos_token_id(self): + """Id of the beginning of sentence token in the vocabulary.""" + candidate = self.tokenizer.bos_token_id + return self._check_token_candidate(candidate) + + @property + def eos_token_id(self): + """Id of the end of sentence token in the vocabulary.""" + candidate = self.tokenizer.eos_token_id + return self._check_token_candidate(candidate) + @property def additional_special_tokens_ids(self): """ All the additional special tokens you may want to use (list of strings).""" diff --git a/pretrain_ul2.py b/pretrain_ul2.py new file mode 100644 index 000000000..cab24ced0 --- /dev/null +++ b/pretrain_ul2.py @@ -0,0 +1,185 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
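The decoder-only branch above packs encoder and decoder tokens into one sequence and then widens the causal mask over the prefix when a non-causal (prefix-LM) model type is selected. Below is a small sketch of that mask construction, assuming make_history_mask returns a lower-triangular 0/1 matrix as in megatron/data/t5_dataset.py; the sequence length and label count are made up:

import numpy as np

def make_history_mask(tokens):
    # Lower-triangular causal mask: position i may attend to positions <= i.
    length = len(tokens)
    arange = np.arange(length)
    return (arange[None, :] <= arange[:, None]).astype(np.int64)

tokens = np.arange(8)        # hypothetical packed sequence of length 8
num_labels = 3               # the last 3 positions carry the denoised targets
dec_mask = make_history_mask(tokens)
dec_mask[:-num_labels, :-num_labels] = 1   # prefix attends bidirectionally
print(dec_mask)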
+ +"""Pretrain UL2""" + +from functools import partial + +import torch + +from megatron import ( + get_args, + get_timers, + mpu, + print_rank_0 +) +from megatron.data.dataset_utils import build_train_valid_test_datasets +from megatron.data.ul2_dataset import ( + is_decoder_only as _is_decoder_only, + is_prefix_lm as _is_prefix_lm, +) +from megatron.model.gpt_model import GPTModel +from megatron.model.t5_model import T5Model, t5_position_ids +from megatron.training import pretrain +from megatron.utils import average_losses_across_data_parallel_group + + +def is_decoder_only(): + """Return whether we use a decoder-only model.""" + args = get_args() + return _is_decoder_only(args.ul2_model_type) + + +def is_prefix_lm(): + """Return whether we use a non-causal decoder-only model.""" + args = get_args() + return _is_prefix_lm(args.ul2_model_type) + + +def model_provider(pre_process=True, post_process=True): + """Build the model.""" + assert pre_process and post_process, "UL2 doesn't yet support pipelining" + + print_rank_0('building UL2 model ...') + if is_decoder_only(): + print_rank_0('Using decoder-only UL2 model.') + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=is_prefix_lm(), + ) + else: + print_rank_0('Using encoder-decoder UL2 model.') + model = T5Model(num_tokentypes=0, parallel_output=True) + return model + + +def get_batch(data_iterator): + """Build the batch.""" + + if is_decoder_only(): + keys = ['text', 'labels', 'loss_mask', 'dec_mask'] + else: + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', + 'enc_mask', 'dec_mask', 'enc_dec_mask'] + datatype = torch.int64 + + # Broadcast data. + if data_iterator is not None: + data = next(data_iterator) + else: + data = None + data_b = mpu.broadcast_data(keys, data, datatype) + + # Unpack. + if is_decoder_only(): + tokens = data_b['text'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + dec_mask = (data_b['dec_mask'] < 0.5) + return tokens, loss_mask, labels, dec_mask + else: + tokens_enc = data_b['text_enc'].long() + tokens_dec = data_b['text_dec'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + enc_mask = (data_b['enc_mask'] < 0.5) + dec_mask = (data_b['dec_mask'] < 0.5) + enc_dec_mask = (data_b['enc_dec_mask'] < 0.5) + + return tokens_enc, tokens_dec, loss_mask, labels, \ + enc_mask, dec_mask, enc_dec_mask + + +def loss_func(loss_mask, output_tensor): + if is_decoder_only(): + lm_loss_ = output_tensor + else: + lm_loss_, _ = output_tensor + + lm_loss_ = lm_loss_.float() + lm_loss = torch.sum( + lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum() + + loss = lm_loss + averaged_losses = average_losses_across_data_parallel_group([lm_loss]) + + return loss, {'lm loss': averaged_losses[0]} + + +def forward_step(data_iterator, model): + """Forward step.""" + args = get_args() + timers = get_timers() + + # Get the batch. 
+ timers('batch generator').start() + if is_decoder_only(): + (tokens, loss_mask, lm_labels, dec_mask) = get_batch(data_iterator) + else: + ( + tokens_enc, tokens_dec, loss_mask, lm_labels, + enc_mask, dec_mask, enc_dec_mask, + ) = get_batch(data_iterator) + timers('batch generator').stop() + + # Forward model lm_labels + if is_decoder_only(): + position_ids = t5_position_ids(tokens) + output_tensor = model(tokens, position_ids, dec_mask, + labels=lm_labels) + else: + output_tensor = model(tokens_enc, + tokens_dec, + enc_mask, + dec_mask, + enc_dec_mask, + tokentype_ids=None, + lm_labels=lm_labels) + + return output_tensor, partial(loss_func, loss_mask) + + +def train_valid_test_datasets_provider(train_val_test_num_samples): + """Build train, valid, and test datasets.""" + args = get_args() + + print_rank_0('> building train, validation, and test datasets ' + 'for UL2 ...') + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + dataset_type='ul2') + print_rank_0("> finished creating UL2 datasets ...") + + return train_ds, valid_ds, test_ds + + +if __name__ == "__main__": + + pretrain(train_valid_test_datasets_provider, model_provider, forward_step, + args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'}) From 9328ad2cac3feb83d39dfc6d55b3ffeb9684c86b Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Thu, 29 Dec 2022 14:29:07 +0400 Subject: [PATCH 25/37] DeepSpeed compat --- megatron/data/ul2_dataset.py | 39 +++++++-- megatron/tokenizer/tokenizer.py | 60 ++++++++++++- pretrain_ul2.py | 148 ++++++++++++++++++++++++++------ 3 files changed, 213 insertions(+), 34 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 4f80c973b..512086be3 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -67,10 +67,39 @@ def __init__(self, name, indexed_dataset, data_prefix, 'denoising objectives' ) - super().__init__(name, indexed_dataset, data_prefix, - num_epochs, max_num_samples, None, - max_seq_length, max_seq_length_dec, - short_seq_prob, seed) + # Params to store. + self.name = name + self.seed = seed + self.masked_lm_prob = short_seq_prob + self.max_seq_length = max_seq_length + self.max_seq_length_dec = max_seq_length_dec + + # Dataset. + self.indexed_dataset = indexed_dataset + + # Build the samples mapping. + self.samples_mapping = get_samples_mapping(self.indexed_dataset, + data_prefix, + num_epochs, + max_num_samples, + self.max_seq_length - 2, # account for added tokens + short_seq_prob, + self.seed, + self.name, + False) + + # Vocab stuff. + tokenizer = get_tokenizer() + self.vocab_id_list = list(tokenizer.inv_vocab.keys()) + self.vocab_id_to_token_dict = tokenizer.inv_vocab + # self.cls_id = tokenizer.cls + self.sep_id = tokenizer.sep + self.mask_id = tokenizer.mask + self.pad_id = tokenizer.pad + self.bos_id = tokenizer.bos_token_id + self.eos_id = tokenizer.eos_token_id + self.sentinel_tokens = tokenizer.additional_special_tokens_ids + assert len(self.sentinel_tokens) > 0, "Provide the argument --vocab-extra-ids 100 to the script" # Params to store. 
self.model_type = model_type @@ -85,7 +114,7 @@ def __init__(self, name, indexed_dataset, data_prefix, # Vocab stuff. tokenizer = get_tokenizer() # Remove CLS token because we don't need it. - del self.cls_id + # del self.cls_id self.cls_ids = { denoiser: tokenizer.vocab[token] for (denoiser, token) in denoiser_tokens.items() diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index c0356a12c..7d0340ca2 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -40,7 +40,9 @@ def build_tokenizer(args): vocab_extra_ids=args.vocab_extra_ids) elif args.tokenizer_type == 'GPT2BPETokenizer': assert args.merge_file is not None - tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) + tokenizer = _GPT2BPETokenizer( + args.vocab_file, args.merge_file, vocab_extra_ids=args.vocab_extra_ids + ) elif args.tokenizer_type == "PretrainedFromHF": assert args.tokenizer_name_or_path is not None @@ -286,13 +288,36 @@ def additional_special_tokens(self, value): class _GPT2BPETokenizer(AbstractTokenizer): """Original GPT2 BPE tokenizer.""" - def __init__(self, vocab_file, merge_file): + def __init__(self, vocab_file, merge_file, vocab_extra_ids=0): name = 'GPT2 BPE' super().__init__(name) self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', special_tokens=[], max_len=None) - self.eod_id = self.tokenizer.encoder['<|endoftext|>'] + self.eod_id = self.eos_token_id = self.tokenizer.encoder['<|endoftext|>'] + + self.bod_id = self.bos_token_id = self.tokenizer.encoder['[EOS]'] + self.sep_id = self.tokenizer.encoder['[SEP]'] + self.mask_id = self.tokenizer.encoder['[MASK]'] + self.pad_id = self.tokenizer.encoder['[PAD]'] + + additional_special_tokens = [] + self._additional_special_tokens = [] + additional_special_tokens.extend( + ["".format(i) for i in range(vocab_extra_ids)]) + self.add_additional_special_tokens(additional_special_tokens) + + def add_additional_special_tokens(self, tokens_list): + setattr(self, "additional_special_tokens", tokens_list) + for value in tokens_list: + self.add_token(value) + + def add_token(self, token): + if token not in self.vocab: + self.inv_vocab[self.vocab_size] = token + # self.vocab_size comes from len(vocab) + # and it will increase as we add elements + self.vocab[token] = self.vocab_size @property def vocab_size(self): @@ -316,6 +341,35 @@ def detokenize(self, token_ids): def eod(self): return self.eod_id + @property + def bod(self): + return self.bod_id + + @property + def sep(self): + return self.sep_id + + @property + def mask(self): + return self.mask_id + + @property + def pad(self): + return self.pad_id + + @property + def additional_special_tokens(self): + """ All the additional special tokens you may want to use (list of strings).""" + return self._additional_special_tokens + + @property + def additional_special_tokens_ids(self): + """ Ids of all the additional special tokens in the vocabulary (list of integers).""" + return [self.vocab.get(token) for token in self._additional_special_tokens] + + @additional_special_tokens.setter + def additional_special_tokens(self, value): + self._additional_special_tokens = value class _AutoTokenizer(AbstractTokenizer): """AutoTokenizer for Hf Pretrained model loading.""" diff --git a/pretrain_ul2.py b/pretrain_ul2.py index cab24ced0..f942566f3 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -17,6 +17,7 @@ from functools import partial +import deepspeed import torch from megatron import ( @@ -26,11 +27,13 @@ print_rank_0 ) from 
megatron.data.dataset_utils import build_train_valid_test_datasets +from megatron.data.gpt_dataset import build_dataset_group from megatron.data.ul2_dataset import ( is_decoder_only as _is_decoder_only, is_prefix_lm as _is_prefix_lm, ) -from megatron.model.gpt_model import GPTModel +from megatron.enums import AttnMaskType +from megatron.model.gpt_model import GPTModel, GPTModelPipe from megatron.model.t5_model import T5Model, t5_position_ids from megatron.training import pretrain from megatron.utils import average_losses_across_data_parallel_group @@ -51,21 +54,77 @@ def is_prefix_lm(): def model_provider(pre_process=True, post_process=True): """Build the model.""" assert pre_process and post_process, "UL2 doesn't yet support pipelining" - + print_rank_0('building UL2 model ...') + args = get_args() + + with deepspeed.zero.Init(data_parallel_group=mpu.get_data_parallel_group(), + remote_device=None if args.remote_device == 'none' else args.remote_device, + config_dict_or_path=args.deepspeed_config, + enabled=args.zero_stage == 3, + mpu=mpu): + if args.deepspeed and is_decoder_only(): + args.pretrain_causal_attention = True + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=AttnMaskType.causal + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe + elif is_decoder_only(): + print_rank_0('Using decoder-only UL2 model.') + model = GPTModel( + num_tokentypes=0, + parallel_output=True, + pre_process=pre_process, + post_process=post_process, + prefix_lm=is_prefix_lm(), + ) + else: + print_rank_0('Using encoder-decoder UL2 model.') + model = T5Model(num_tokentypes=0, parallel_output=True) + return model + +from megatron.global_vars import get_tokenizer +def visualize_model_inputs(tokens, attention_mask, labels, loss_mask): + tok = get_tokenizer() + print("TOKENS:", ",".join([tok.detokenize(tokens[0, i]) for i in range(100)])) + print("ATTN:", attention_mask[0, :, :100, :100]) + print("LABS:", labels[0, :100]) + print("LOSSMSK:", loss_mask[:100]) + +def get_batch_pipe(data): + """Modification of `get_batch` to work on `next(data_iterator)` instead of `data_iterator`""" if is_decoder_only(): - print_rank_0('Using decoder-only UL2 model.') - model = GPTModel( - num_tokentypes=0, - parallel_output=True, - pre_process=pre_process, - post_process=post_process, - prefix_lm=is_prefix_lm(), - ) + keys = ['text', 'labels', 'loss_mask', 'dec_mask'] else: - print_rank_0('Using encoder-decoder UL2 model.') - model = T5Model(num_tokentypes=0, parallel_output=True) - return model + keys = ['text_enc', 'text_dec', 'labels', 'loss_mask', + 'enc_mask', 'dec_mask', 'enc_dec_mask'] + datatype = torch.int64 + + data_b = mpu.broadcast_data(keys, data, datatype) + + + print( + visualize_model_inputs( + data_b['text'], + data_b['dec_mask'], + data_b['labels'], + data_b['loss_mask'], + ) + ) + + tokens = data_b['text'].long() + labels = data_b['labels'].long() + loss_mask = data_b['loss_mask'].float() + + dec_mask = (data_b['dec_mask'] < 0.5) + + position_ids = t5_position_ids(tokens) + + return (tokens, position_ids, dec_mask), (labels, loss_mask) def get_batch(data_iterator): @@ -85,6 +144,15 @@ def get_batch(data_iterator): data = None data_b = mpu.broadcast_data(keys, data, datatype) + print( + visualize_model_inputs( + data_b['text'], + data_b['dec_mask'], + data_b['labels'], + data_b['loss_mask'], + ) + ) + # Unpack. 
if is_decoder_only(): tokens = data_b['text'].long() @@ -162,19 +230,47 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): print_rank_0('> building train, validation, and test datasets ' 'for UL2 ...') - train_ds, valid_ds, test_ds = build_train_valid_test_datasets( - data_prefix=args.data_path, - data_impl=args.data_impl, - splits_string=args.split, - train_valid_test_num_samples=train_val_test_num_samples, - max_seq_length=args.encoder_seq_length, - max_seq_length_dec=args.decoder_seq_length, - masked_lm_prob=args.mask_prob, - short_seq_prob=args.short_seq_prob, - seed=args.seed, - skip_warmup=(not args.mmap_warmup), - dataset_type='ul2') - print_rank_0("> finished creating UL2 datasets ...") + if args.data_path: + train_ds, valid_ds, test_ds = build_train_valid_test_datasets( + data_prefix=args.data_path, + data_impl=args.data_impl, + splits_string=args.split, + train_valid_test_num_samples=train_val_test_num_samples, + max_seq_length=args.encoder_seq_length, + max_seq_length_dec=args.decoder_seq_length, + masked_lm_prob=args.mask_prob, + short_seq_prob=args.short_seq_prob, + seed=args.seed, + skip_warmup=(not args.mmap_warmup), + dataset_type='ul2') + print_rank_0("> finished creating UL2 datasets ...") + elif args.train_weighted_split_paths: + assigned_train_valid_test = [] + if args.train_weighted_split_paths is not None: + train_ds = [] + assigned_train_valid_test.append("train") + if args.valid_weighted_split_paths is not None: + valid_ds = [] + assigned_train_valid_test.append("valid") + if args.test_weighted_split_paths is not None: + test_ds = [] + assigned_train_valid_test.append("test") + + for s in assigned_train_valid_test: + data_groups = zip(eval(f"args.{s}_weighted_split_paths"), + eval(f"args.{s}_weighted_split_weights"), + eval(f"args.{s}_weighted_split_splits"), + eval(f"args.{s}_weighted_split_names")) + for paths, weights, splits, name in data_groups: + d = build_dataset_group(name, paths, weights, splits, + args.data_impl, + train_val_test_num_samples, + args.seq_length, args.seed, + (not args.mmap_warmup), + train_valid_test=s) + eval(f"{s}_ds").append(d) + else: + raise NotImplementedError("No dataloading argument passed") return train_ds, valid_ds, test_ds From 351f4f24465c974ab7c22dbc661d460006f25b01 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Fri, 30 Dec 2022 20:18:24 +0400 Subject: [PATCH 26/37] DS Group compat --- megatron/data/gpt_dataset.py | 37 ++++++++++++++++++++++++++++++------ megatron/data/t5_dataset.py | 4 ++++ megatron/data/ul2_dataset.py | 24 +++++++++++++++-------- pretrain_gpt.py | 1 - pretrain_ul2.py | 29 ++++++++++++++++------------ 5 files changed, 68 insertions(+), 27 deletions(-) diff --git a/megatron/data/gpt_dataset.py b/megatron/data/gpt_dataset.py index 0db1aa2fe..534d3bc3c 100644 --- a/megatron/data/gpt_dataset.py +++ b/megatron/data/gpt_dataset.py @@ -21,12 +21,12 @@ import numpy as np import torch -from megatron import mpu, print_rank_0 +from megatron import get_args, mpu, print_rank_0 from megatron.data.blendable_dataset import BlendableDataset from megatron.data.dataset_utils import get_datasets_weights_and_num_samples from megatron.data.dataset_utils import get_train_valid_test_split_, get_split_by_range_ from megatron.data.indexed_dataset import make_dataset as make_indexed_dataset - +from megatron.data.ul2_dataset import UL2Dataset def build_train_valid_test_datasets(data_prefix, data_impl, splits_string, train_valid_test_num_samples, @@ -154,10 +154,35 @@ def build_dataset(name): if splits[1] > 
splits[0]: documents = np.arange(start=splits[0], stop=splits[1], step=1, dtype=np.int32) - dataset = GPTDataset(name, data_prefix, - documents, indexed_dataset, - train_valid_test_num_samples[index], - seq_length, seed) + + args = get_args() + if args.ul2_model_type: + dataset = UL2Dataset( + name=name, + data_prefix=data_prefix, + num_epochs=None, + max_num_samples=train_valid_test_num_samples[index], + max_seq_length=seq_length, + seed=seed, + indexed_dataset=indexed_dataset, + model_type=args.ul2_model_type, + denoiser_ratios=args.ul2_denoiser_ratios, + denoisers=args.ul2_denoisers, + mean_span_lengths=args.ul2_mean_span_lengths, + mask_ratios=args.ul2_mask_ratios, + denoiser_tokens={ + 'R': args.ul2_r_denoiser_token, + 'S': args.ul2_s_denoiser_token, + 'X': args.ul2_x_denoiser_token, + }, + max_seq_length_dec=seq_length, + short_seq_prob=args.short_seq_prob, + ) + else: + dataset = GPTDataset(name, data_prefix, + documents, indexed_dataset, + train_valid_test_num_samples[index], + seq_length, seed) return dataset dataset = build_dataset(dataset_group_name) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index af3374da6..be52206ec 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -186,16 +186,20 @@ def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, (start_index, end_index) = (0, None) for span in masked_spans: flag = sentinel_tokens.popleft() + # Append the same tokens in decoder input and output t5_decoder_in.append(flag) t5_decoder_in.extend(span.label) t5_decoder_out.append(flag) t5_decoder_out.extend(span.label) + end_index = span.index[0] t5_input.extend(tokens[start_index: end_index]) t5_input.append(flag) + # the next start index is the token after the last span token start_index = span.index[-1] + 1 + # Add token to the t5_decoder_out t5_decoder_out.append(eos_id) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 512086be3..6c196da2f 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -202,17 +202,17 @@ def build_training_sample(sample, target_seq_length, tokens = [token for sentence in sample for token in sentence] max_num_tokens = target_seq_length - # if is_decoder_only(model_type): + if is_decoder_only(model_type): # # Keep space for repeated `extra_id` tokens; not the most data # # efficient since we calculate this based on the maximum number # # of possible `extra_id` tokens. - # safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) - # truncated = len(tokens) > safe_max_seq_len - # tokens = tokens[:safe_max_seq_len] - # else: + safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + truncated = len(tokens) > safe_max_seq_len + tokens = tokens[:safe_max_seq_len] + else: # Truncate to `target_sequence_length`. - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] # Prepend objective token. 
cls_id = cls_ids.get(denoiser) @@ -277,9 +277,16 @@ def build_training_sample(sample, target_seq_length, loss_mask = np.zeros(len(tokens), dtype=np.int64) loss_mask[-num_labels:] = 1 + padding = [pad_id] * (max_seq_length - len(tokens)) + tokens = np.concatenate((tokens, padding), axis=0) + labels = np.concatenate((labels, padding), axis=0) + loss_mask = np.concatenate((loss_mask, np.zeros(len(padding), dtype=np.int64)), axis=0) + dec_mask = make_history_mask(tokens) if is_prefix_lm(model_type): - dec_mask[:-num_labels, :-num_labels] = 1 + dec_mask[ + :-num_labels-len(padding), :-num_labels-len(padding) + ] = 1 train_sample = { 'text': tokens, @@ -288,6 +295,7 @@ def build_training_sample(sample, target_seq_length, 'truncated': int(truncated), 'dec_mask': dec_mask, } + else: # Padding. tokens_enc, tokens_dec_in, labels, enc_mask, \ diff --git a/pretrain_gpt.py b/pretrain_gpt.py index 2d0691f4e..5f8b104fc 100644 --- a/pretrain_gpt.py +++ b/pretrain_gpt.py @@ -228,7 +228,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): @record def main(): - torch.multiprocessing.set_start_method('spawn') pretrain(train_valid_test_datasets_provider, model_provider, forward_step, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}) diff --git a/pretrain_ul2.py b/pretrain_ul2.py index f942566f3..1342581e6 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -89,10 +89,13 @@ def model_provider(pre_process=True, post_process=True): from megatron.global_vars import get_tokenizer def visualize_model_inputs(tokens, attention_mask, labels, loss_mask): + print("SHAPES", tokens.shape, attention_mask.shape, labels.shape, loss_mask.shape) tok = get_tokenizer() - print("TOKENS:", ",".join([tok.detokenize(tokens[0, i]) for i in range(100)])) - print("ATTN:", attention_mask[0, :, :100, :100]) - print("LABS:", labels[0, :100]) + + print("TOKENS:", tok.detokenize(tokens[0, :].cpu().numpy().tolist())) + print("LABELS:", tok.detokenize(labels[0, :].cpu().numpy().tolist())) + + print("ATTN:", attention_mask[:100]) print("LOSSMSK:", loss_mask[:100]) def get_batch_pipe(data): @@ -107,14 +110,14 @@ def get_batch_pipe(data): data_b = mpu.broadcast_data(keys, data, datatype) - print( - visualize_model_inputs( - data_b['text'], - data_b['dec_mask'], - data_b['labels'], - data_b['loss_mask'], - ) - ) + # print( + # visualize_model_inputs( + # data_b['text'], + # data_b['dec_mask'], + # data_b['labels'], + # data_b['loss_mask'], + # ) + # ) tokens = data_b['text'].long() labels = data_b['labels'].long() @@ -227,6 +230,7 @@ def forward_step(data_iterator, model): def train_valid_test_datasets_provider(train_val_test_num_samples): """Build train, valid, and test datasets.""" args = get_args() + train_ds, valid_ds, test_ds = None, None, None print_rank_0('> building train, validation, and test datasets ' 'for UL2 ...') @@ -243,7 +247,6 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): seed=args.seed, skip_warmup=(not args.mmap_warmup), dataset_type='ul2') - print_rank_0("> finished creating UL2 datasets ...") elif args.train_weighted_split_paths: assigned_train_valid_test = [] if args.train_weighted_split_paths is not None: @@ -272,6 +275,8 @@ def train_valid_test_datasets_provider(train_val_test_num_samples): else: raise NotImplementedError("No dataloading argument passed") + print_rank_0("> finished creating UL2 datasets ...") + return train_ds, valid_ds, test_ds From abc19b836bd7a3e03db0650e34a3440a7b492227 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Fri, 30 Dec 2022 22:35:03 +0400 
Subject: [PATCH 27/37] Adapt eval for denoiser --- tasks/eval_harness/evaluate.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 7e0cdb60e..51d107e4f 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -42,6 +42,8 @@ def __init__(self, model, tokenizer): self.tokenizer = tokenizer self.VOCAB_SIZE = tokenizer.vocab_size self.EOT_TOKEN_ID = tokenizer.eod + self.add_denoiser = args.add_denoiser + self.DENOISER_TOKEN_ID = tokenizer.tokenize("[S]")[0] self._max_length = args.seq_length @@ -80,9 +82,15 @@ def loglikelihood(self, requests): for context, continuation in requests: if context == "": # end of text as context - context_enc = [self.EOT_TOKEN_ID] + if self.add_denoiser: + context_enc = [self.DENOISER_TOKEN_ID] + [self.EOT_TOKEN_ID] + else: + context_enc = [self.EOT_TOKEN_ID] else: - context_enc = self.tokenizer_encode(context) + if self.add_denoiser: + context_enc = [self.DENOISER_TOKEN_ID] + self.tokenizer_encode(context) + else: + context_enc = self.tokenizer_encode(context) continuation_enc = self.tokenizer_encode(continuation) @@ -390,6 +398,7 @@ def tasks_args(parser): group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--add_denoiser', default = False, action='store_true', help='Whether to add a denoiser to the model') return parser from megatron.global_vars import _parse_args From 816c32d1c20623214ef5f1e9b4d6d3aa7f44c5ff Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 3 Jan 2023 23:14:32 +0400 Subject: [PATCH 28/37] Simpler padding --- megatron/data/dataset_utils.py | 11 +++++- megatron/data/t5_dataset.py | 26 ++++++++++----- megatron/data/ul2_dataset.py | 61 +++++++++++++++++----------------- pretrain_ul2.py | 2 ++ 4 files changed, 60 insertions(+), 40 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 60d4e0d90..2617f5927 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -249,6 +249,12 @@ def create_masked_lm_predictions(tokens, num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) + if prefix_lm: + # Adjust probabilities so that the mean is centered at the + # correct position. + # If we do not do this, the mean is at + # `len(tokens) * masked_lm_prob / 2`. + masked_lm_prob *= 2 ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) if sampling_style is SamplingStyle.POISSON: @@ -271,7 +277,10 @@ def create_masked_lm_predictions(tokens, continue ngram_index.append(cand_indexes[idx:idx + n]) ngram_indexes.append(ngram_index) - + if prefix_lm: + # No need to go further – we would only produce + # duplicate entries by continuing for this `idx`. + break np_rng.shuffle(ngram_indexes) (masked_lms, masked_spans) = ([], []) diff --git a/megatron/data/t5_dataset.py b/megatron/data/t5_dataset.py index be52206ec..f3ac34bf0 100644 --- a/megatron/data/t5_dataset.py +++ b/megatron/data/t5_dataset.py @@ -125,6 +125,8 @@ def build_training_sample(sample, target_seq_length, target_seq_length: Desired sequence length. 
max_seq_length: Maximum length of the sequence. All values are padded to this length. + max_seq_length_dec: Maximum length of the decoder input sequence. All + values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. cls_id: Start of example id. @@ -179,23 +181,29 @@ def build_training_sample(sample, target_seq_length, def merge_subsequent_masks(tokens, masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): - sentinel_tokens = collections.deque(sentinel_tokens) + eos_id=None, sentinel_tokens=None, prefix_lm=False): + if prefix_lm: + assert len(masked_spans) <= 1, \ + 'Received more than one masked span for PrefixLM masking' + else: + sentinel_tokens = collections.deque(sentinel_tokens) t5_input = [] (t5_decoder_in, t5_decoder_out) = ([bos_id], []) (start_index, end_index) = (0, None) for span in masked_spans: - flag = sentinel_tokens.popleft() + if not prefix_lm: + flag = sentinel_tokens.popleft() + # Append the same tokens in decoder input and output + t5_decoder_in.append(flag) + t5_decoder_out.append(flag) - # Append the same tokens in decoder input and output - t5_decoder_in.append(flag) t5_decoder_in.extend(span.label) - t5_decoder_out.append(flag) t5_decoder_out.extend(span.label) end_index = span.index[0] t5_input.extend(tokens[start_index: end_index]) - t5_input.append(flag) + if not prefix_lm: + t5_input.append(flag) # the next start index is the token after the last span token start_index = span.index[-1] + 1 @@ -212,11 +220,11 @@ def pad_and_convert_to_numpy(tokens, masked_positions, masked_labels, pad_id, max_seq_length, max_seq_length_dec, masked_spans=None, bos_id=None, - eos_id=None, sentinel_tokens=None): + eos_id=None, sentinel_tokens=None, prefix_lm=False): """Pad sequences and convert them to numpy.""" t5_input, t5_decoder_in, t5_decoder_out = merge_subsequent_masks( - tokens, masked_spans, bos_id, eos_id, sentinel_tokens) + tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm) # assert (len(t5_input) - len(masked_spans)) + \ # (len(t5_decoder_in) - (len(masked_spans) + 1)) == len(tokens) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 6c196da2f..adfc3b17a 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,14 +15,11 @@ """UL2-style dataset.""" -import math - import numpy as np from megatron import get_tokenizer from megatron.data.dataset_utils import ( create_masked_lm_predictions, - get_samples_mapping, SamplingStyle ) from megatron.data.t5_dataset import ( @@ -164,12 +161,13 @@ def build_training_sample(sample, target_seq_length, bos_id=None, eos_id=None, sentinel_tokens=None): """Build training sample. - Arguments: sample: A list of sentences in which each sentence is a list token ids. target_seq_length: Desired sequence length. max_seq_length: Maximum length of the sequence. All values are padded to this length. + max_seq_length_dec: Maximum length of the decoder input sequence. All + values are padded to this length. vocab_id_list: List of vocabulary ids. Used to pick a random id. vocab_id_to_token_dict: A dictionary from vocab ids to text tokens. cls_ids: Start of example ids. 
@@ -202,17 +200,17 @@ def build_training_sample(sample, target_seq_length, tokens = [token for sentence in sample for token in sentence] max_num_tokens = target_seq_length - if is_decoder_only(model_type): + # if is_decoder_only(model_type): # # Keep space for repeated `extra_id` tokens; not the most data # # efficient since we calculate this based on the maximum number # # of possible `extra_id` tokens. - safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) - truncated = len(tokens) > safe_max_seq_len - tokens = tokens[:safe_max_seq_len] - else: + # safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + # truncated = len(tokens) > safe_max_seq_len + # tokens = tokens[:safe_max_seq_len] + # else: # Truncate to `target_sequence_length`. - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] # Prepend objective token. cls_id = cls_ids.get(denoiser) @@ -221,10 +219,12 @@ def build_training_sample(sample, target_seq_length, tokens = [cls_id] + tokens # Masking. - max_predictions_per_seq = masked_lm_prob * len(tokens) + # Ensure we always have at least one prediction. + max_predictions_per_seq = max(1.0, masked_lm_prob * len(tokens)) mean_ngrams = mean_span_lengths[denoiser_index] if mean_ngrams < 1: - mean_ngrams = round(len(tokens) * mean_ngrams) + # Ensure we always obtain at least one `max_ngrams`. + mean_ngrams = max(1, round(len(tokens) * mean_ngrams)) max_ngrams = mean_ngrams * 2 - 1 if denoiser == 'R' or denoiser == 'X': @@ -247,7 +247,7 @@ def build_training_sample(sample, target_seq_length, if is_decoder_only(model_type): # Concatenate to one sequence. tokens_enc, tokens_dec_in, labels = merge_subsequent_masks( - tokens, masked_spans, bos_id, eos_id, sentinel_tokens) + tokens, masked_spans, bos_id, eos_id, sentinel_tokens, prefix_lm) # Move EOS tokens to end of sequence. while tokens_enc[-1] == eos_id: @@ -259,34 +259,35 @@ def build_training_sample(sample, target_seq_length, # Move BOS token to start of sequence. tokens_dec_in = tokens_dec_in[1:] - tokens = np.array(( + tokens = ( [bos_id] + tokens_enc + [sep_id] + tokens_dec_in - ), dtype=np.int64) + ) + + # Pad and convert to NumPy. + padding_length = max_seq_length - len(tokens) + if padding_length < 0: + raise LengthExceededError() + filler = [pad_id] * padding_length + + tokens = np.array(tokens + filler, dtype=np.int64) labels = np.array(( tokens_enc + [sep_id] + labels + + filler ), dtype=np.int64) - if max_seq_length - len(tokens) < 0: - raise LengthExceededError() - loss_mask = np.zeros(len(tokens), dtype=np.int64) - loss_mask[-num_labels:] = 1 - - padding = [pad_id] * (max_seq_length - len(tokens)) - tokens = np.concatenate((tokens, padding), axis=0) - labels = np.concatenate((labels, padding), axis=0) - loss_mask = np.concatenate((loss_mask, np.zeros(len(padding), dtype=np.int64)), axis=0) + labels_start_neg_index = -(num_labels + padding_length) + labels_end_neg_index = -padding_length if padding_length > 0 else None + loss_mask[labels_start_neg_index:labels_end_neg_index] = 1 dec_mask = make_history_mask(tokens) if is_prefix_lm(model_type): - dec_mask[ - :-num_labels-len(padding), :-num_labels-len(padding) - ] = 1 + dec_mask[:labels_start_neg_index, :labels_start_neg_index] = 1 train_sample = { 'text': tokens, @@ -295,7 +296,6 @@ def build_training_sample(sample, target_seq_length, 'truncated': int(truncated), 'dec_mask': dec_mask, } - else: # Padding. 
tokens_enc, tokens_dec_in, labels, enc_mask, \ @@ -303,7 +303,8 @@ def build_training_sample(sample, target_seq_length, = pad_and_convert_to_numpy(tokens, masked_positions, masked_labels, pad_id, max_seq_length, max_seq_length_dec, masked_spans, - bos_id, eos_id, sentinel_tokens) + bos_id, eos_id, sentinel_tokens, + prefix_lm) train_sample = { 'text_enc': tokens_enc, diff --git a/pretrain_ul2.py b/pretrain_ul2.py index 1342581e6..9b64b6daf 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -124,6 +124,7 @@ def get_batch_pipe(data): loss_mask = data_b['loss_mask'].float() dec_mask = (data_b['dec_mask'] < 0.5) + dec_mask = dec_mask.unsqueeze(1) position_ids = t5_position_ids(tokens) @@ -163,6 +164,7 @@ def get_batch(data_iterator): loss_mask = data_b['loss_mask'].float() dec_mask = (data_b['dec_mask'] < 0.5) + dec_mask = dec_mask.unsqueeze(1) return tokens, loss_mask, labels, dec_mask else: tokens_enc = data_b['text_enc'].long() From bdbd54a0de63e778bac28119b835aa381a7dad85 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 3 Jan 2023 23:28:41 +0400 Subject: [PATCH 29/37] Fix sampling --- megatron/data/ul2_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index adfc3b17a..dd96c00d2 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -20,6 +20,7 @@ from megatron import get_tokenizer from megatron.data.dataset_utils import ( create_masked_lm_predictions, + get_samples_mapping, SamplingStyle ) from megatron.data.t5_dataset import ( From cacf267c87b3f694dc85e340bd93d5adc6be0c57 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 3 Jan 2023 23:43:30 +0400 Subject: [PATCH 30/37] Switch padding --- megatron/data/ul2_dataset.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index dd96c00d2..065125923 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,6 +15,8 @@ """UL2-style dataset.""" +import math + import numpy as np from megatron import get_tokenizer @@ -201,17 +203,17 @@ def build_training_sample(sample, target_seq_length, tokens = [token for sentence in sample for token in sentence] max_num_tokens = target_seq_length - # if is_decoder_only(model_type): + if is_decoder_only(model_type): # # Keep space for repeated `extra_id` tokens; not the most data # # efficient since we calculate this based on the maximum number # # of possible `extra_id` tokens. - # safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) - # truncated = len(tokens) > safe_max_seq_len - # tokens = tokens[:safe_max_seq_len] - # else: + safe_max_seq_len = math.floor(max_num_tokens / (1 + masked_lm_prob)) + truncated = len(tokens) > safe_max_seq_len + tokens = tokens[:safe_max_seq_len] + else: # Truncate to `target_sequence_length`. - truncated = len(tokens) > max_num_tokens - tokens = tokens[:max_num_tokens] + truncated = len(tokens) > max_num_tokens + tokens = tokens[:max_num_tokens] # Prepend objective token. 
cls_id = cls_ids.get(denoiser) From 557b09ceb70d327d8181d4a2fa010412155d6da7 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 24 Jan 2023 03:45:41 +0400 Subject: [PATCH 31/37] Upate sampling --- megatron/data/dataset_utils.py | 73 +++++++++++++++++++++++----------- megatron/data/ul2_dataset.py | 9 ++++- 2 files changed, 57 insertions(+), 25 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index 2617f5927..e76a951ab 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -219,6 +219,7 @@ def create_masked_lm_predictions(tokens, # the starting piece of current token, where 1 means true, so that # on-the-fly whole word masking is possible. token_boundary = [0] * len(tokens) + num_filtered_tokens = 0 for (i, token) in enumerate(tokens): if token == cls_id or token == sep_id: @@ -237,6 +238,7 @@ def create_masked_lm_predictions(tokens, cand_indexes.append([i]) if is_start_piece(vocab_id_to_token_dict[token]): token_boundary[i] = 1 + num_filtered_tokens += 1 output_tokens = list(tokens) @@ -249,12 +251,16 @@ def create_masked_lm_predictions(tokens, num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) - if prefix_lm: - # Adjust probabilities so that the mean is centered at the - # correct position. - # If we do not do this, the mean is at - # `len(tokens) * masked_lm_prob / 2`. - masked_lm_prob *= 2 + + if sampling_style is SamplingStyle.NORMAL: + # First, we get the center of our normal distribution from + # `max_ngrams`. Keeping the meaning of `max_ngrams` this way + # plays nicely with the other probability distributions in terms + # of math. + normal_mean = (max_ngrams + 1) / 2 + # However, we do not want to bound the maximum number of + # n-grams. + max_ngrams = num_filtered_tokens - 1 ngrams = np.arange(1, max_ngrams + 1, dtype=np.int64) if sampling_style is SamplingStyle.POISSON: @@ -264,24 +270,45 @@ def create_masked_lm_predictions(tokens, pvals /= pvals.sum(keepdims=True) if favor_longer_ngram: pvals = pvals[::-1] - elif sampling_style is SamplingStyle.NORMAL: - normal_mean = (max_ngrams + 1) / 2 - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = [] - for n in ngrams: - if prefix_lm: - last_cand_index_index = min(idx + n - 1, len(cand_indexes) - 1) - if cand_indexes[last_cand_index_index][-1] < len(tokens) - 1: - continue - ngram_index.append(cand_indexes[idx:idx + n]) - ngram_indexes.append(ngram_index) - if prefix_lm: - # No need to go further – we would only produce - # duplicate entries by continuing for this `idx`. - break - np_rng.shuffle(ngram_indexes) + if prefix_lm: + # We only do one span searching loop anyway, so this does not + # matter in terms of random search. However, we do want to allow + # sequences greater than the mean ratio. + num_to_predict = max_predictions_per_seq + + # Find first index which is greater than the number of + # predictions. + first_gt_index = next( + ( + i + for (i, x) in enumerate(cand_indexes) + if x[0] > num_filtered_tokens - max_predictions_per_seq + ), + len(cand_indexes), + ) + # Then move one index before to get less than or equal to the + # number of predictions, handling not going below 0. + first_le_index = max(1, first_gt_index) - 1 + + tail_cand_indexes = cand_indexes[first_le_index:] + ngram_indexes = [] + for i in range(len(tail_cand_indexes)): + ngram_indexes.append(tail_cand_indexes[i:]) + ngram_indexes = [ngram_indexes] + # No need to shuffle outer list of length 1. 
+ else: + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + ngram_indexes = [] + for idx in range(len(cand_indexes)): + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + ngram_indexes.append(ngram_index) + + np_rng.shuffle(ngram_indexes) (masked_lms, masked_spans) = ([], []) covered_indexes = set() diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 065125923..2d6960e87 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -222,8 +222,6 @@ def build_training_sample(sample, target_seq_length, tokens = [cls_id] + tokens # Masking. - # Ensure we always have at least one prediction. - max_predictions_per_seq = max(1.0, masked_lm_prob * len(tokens)) mean_ngrams = mean_span_lengths[denoiser_index] if mean_ngrams < 1: # Ensure we always obtain at least one `max_ngrams`. @@ -233,11 +231,18 @@ def build_training_sample(sample, target_seq_length, if denoiser == 'R' or denoiser == 'X': sampling_style = SamplingStyle.NORMAL prefix_lm = False + max_predictions_per_seq = len(tokens) - 1 elif denoiser == 'S': sampling_style = SamplingStyle.UNIFORM prefix_lm = True + max_predictions_per_seq = min( + round(masked_lm_prob * len(tokens)) * 2 - 1, + len(tokens) - 1, + ) else: raise ValueError('unknown denoiser') + # Ensure we always have at least one prediction. + max_predictions_per_seq = max(1, max_predictions_per_seq) ( tokens, masked_positions, masked_labels, _, masked_spans, ) = create_masked_lm_predictions( From a6f69bf2751f4a456bd6a4b85d13211fcae340e2 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 24 Jan 2023 19:14:54 +0400 Subject: [PATCH 32/37] Update UL2 --- megatron/data/dataset_utils.py | 79 ++++++++++++++++++++-------------- megatron/data/ul2_dataset.py | 5 +-- 2 files changed, 48 insertions(+), 36 deletions(-) diff --git a/megatron/data/dataset_utils.py b/megatron/data/dataset_utils.py index e76a951ab..a3ce376b5 100644 --- a/megatron/data/dataset_utils.py +++ b/megatron/data/dataset_utils.py @@ -18,6 +18,7 @@ # https://github.com/google-research/albert/blob/master/create_pretraining_data.py # with some modifications. +import bisect from enum import Enum import math import os @@ -191,6 +192,36 @@ def is_start_piece(piece): # append it to the previous set of word indexes. return not piece.startswith("##") +def get_ngram_indices( + idx, + ngrams, + cand_indexes, + num_to_predict, + num_filtered_tokens, + prefix_lm, +): + if prefix_lm: + # Find first index which is greater than the number of + # predictions. + first_gt_index = bisect.bisect_right( + cand_indexes, + [num_filtered_tokens - num_to_predict], + ) + # Then move one index before to get less than or equal to the + # number of predictions, handling not going below 0. + first_le_index = max(1, first_gt_index) - 1 + + tail_cand_indexes = cand_indexes[first_le_index:] + ngram_index = [ + tail_cand_indexes[i:] + for i in range(len(tail_cand_indexes)) + ] + else: + ngram_index = [] + for n in ngrams: + ngram_index.append(cand_indexes[idx:idx + n]) + return ngram_index + def create_masked_lm_predictions(tokens, vocab_id_list, vocab_id_to_token_dict, @@ -249,9 +280,6 @@ def create_masked_lm_predictions(tokens, return (output_tokens, masked_lm_positions, masked_lm_labels, token_boundary) - num_to_predict = min(max_predictions_per_seq, - max(1, int(round(len(tokens) * masked_lm_prob)))) - if sampling_style is SamplingStyle.NORMAL: # First, we get the center of our normal distribution from # `max_ngrams`. 
Keeping the meaning of `max_ngrams` this way @@ -277,38 +305,24 @@ def create_masked_lm_predictions(tokens, # sequences greater than the mean ratio. num_to_predict = max_predictions_per_seq - # Find first index which is greater than the number of - # predictions. - first_gt_index = next( - ( - i - for (i, x) in enumerate(cand_indexes) - if x[0] > num_filtered_tokens - max_predictions_per_seq - ), - len(cand_indexes), - ) - # Then move one index before to get less than or equal to the - # number of predictions, handling not going below 0. - first_le_index = max(1, first_gt_index) - 1 - - tail_cand_indexes = cand_indexes[first_le_index:] - ngram_indexes = [] - for i in range(len(tail_cand_indexes)): - ngram_indexes.append(tail_cand_indexes[i:]) - ngram_indexes = [ngram_indexes] - # No need to shuffle outer list of length 1. + ngram_index_indexes = np.array([0]) else: num_to_predict = min(max_predictions_per_seq, max(1, int(round(len(tokens) * masked_lm_prob)))) - ngram_indexes = [] - for idx in range(len(cand_indexes)): - ngram_index = [] - for n in ngrams: - ngram_index.append(cand_indexes[idx:idx + n]) - ngram_indexes.append(ngram_index) - - np_rng.shuffle(ngram_indexes) + ngram_index_indexes = np.arange(len(cand_indexes)) + np_rng.shuffle(ngram_index_indexes) + + def get_ngram_indices_(idx): + return get_ngram_indices( + idx, + ngrams, + cand_indexes, + num_to_predict, + num_filtered_tokens, + prefix_lm, + ) + ngram_indexes = map(get_ngram_indices_, ngram_index_indexes) (masked_lms, masked_spans) = ([], []) covered_indexes = set() @@ -392,7 +406,8 @@ def create_masked_lm_predictions(tokens, label=[tokens[index] for index in index_set])) assert len(masked_lms) <= num_to_predict - np_rng.shuffle(ngram_indexes) + np_rng.shuffle(ngram_index_indexes) + ngram_indexes = map(get_ngram_indices_, ngram_index_indexes) select_indexes = set() if do_permutation: diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 2d6960e87..e9eb82ff8 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,14 +15,11 @@ """UL2-style dataset.""" -import math - import numpy as np from megatron import get_tokenizer from megatron.data.dataset_utils import ( - create_masked_lm_predictions, - get_samples_mapping, + create_masked_lm_predictions, SamplingStyle ) from megatron.data.t5_dataset import ( From d0d277feb6808cd83b9cd4bd2812257708d6d06f Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 24 Jan 2023 19:36:08 +0400 Subject: [PATCH 33/37] Add get_samples_mapping --- megatron/data/ul2_dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index e9eb82ff8..6511ad345 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -19,7 +19,8 @@ from megatron import get_tokenizer from megatron.data.dataset_utils import ( - create_masked_lm_predictions, + create_masked_lm_predictions, + get_samples_mapping, SamplingStyle ) from megatron.data.t5_dataset import ( From 3f29df898b30bd8f1cc398e65fbcd8dc0c0b98b7 Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Tue, 24 Jan 2023 19:44:51 +0400 Subject: [PATCH 34/37] Import math --- megatron/data/ul2_dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/megatron/data/ul2_dataset.py b/megatron/data/ul2_dataset.py index 6511ad345..4da73d9e7 100644 --- a/megatron/data/ul2_dataset.py +++ b/megatron/data/ul2_dataset.py @@ -15,6 +15,7 @@ """UL2-style dataset.""" +import math import numpy as np from megatron import get_tokenizer 
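Note on the bisect-based span selection introduced in PATCH 32 above: the following is a minimal, self-contained sketch of the prefix-LM branch of get_ngram_indices. The token positions and counts are made up for illustration; only the bisect_right/tail logic mirrors the patched code.

    import bisect

    # Hypothetical word groupings: each entry holds the token positions of
    # one whole word, in the same shape as `cand_indexes` above.
    cand_indexes = [[1], [2, 3], [4], [5, 6, 7], [8], [9]]
    num_filtered_tokens = 10   # tokens left after filtering (illustrative)
    num_to_predict = 4         # size of the prefix-LM target suffix

    # Lists compare lexicographically, so wrapping the threshold in a
    # one-element list finds the first word that starts past the cut-off.
    first_gt_index = bisect.bisect_right(
        cand_indexes, [num_filtered_tokens - num_to_predict])
    # Step back one word so the span begins at or before the cut-off,
    # without going below index 0.
    first_le_index = max(1, first_gt_index) - 1

    tail_cand_indexes = cand_indexes[first_le_index:]
    ngram_index = [tail_cand_indexes[i:] for i in range(len(tail_cand_indexes))]
    # tail_cand_indexes == [[5, 6, 7], [8], [9]]: every candidate span ends at
    # the sequence tail, which is what the prefix-LM (S) denoiser needs.
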
From 5207386338ee47bdf9d4d781e8402e20c00fa96e Mon Sep 17 00:00:00 2001 From: Muennighoff Date: Mon, 6 Feb 2023 12:59:55 +0400 Subject: [PATCH 35/37] Fix prefixlm --- finetune_t0_non_causal_decoder.py | 1 + pretrain_ul2.py | 29 ++++++++++++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/finetune_t0_non_causal_decoder.py b/finetune_t0_non_causal_decoder.py index 14650a6e5..13a758a9a 100644 --- a/finetune_t0_non_causal_decoder.py +++ b/finetune_t0_non_causal_decoder.py @@ -33,6 +33,7 @@ def model_provider(pre_process=True, post_process=True): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed: + args.pretrain_causal_attention = False model = GPTModelPipe( num_tokentypes=0, parallel_output=True, diff --git a/pretrain_ul2.py b/pretrain_ul2.py index 9b64b6daf..89c936666 100644 --- a/pretrain_ul2.py +++ b/pretrain_ul2.py @@ -64,15 +64,26 @@ def model_provider(pre_process=True, post_process=True): enabled=args.zero_stage == 3, mpu=mpu): if args.deepspeed and is_decoder_only(): - args.pretrain_causal_attention = True - model = GPTModelPipe( - num_tokentypes=0, - parallel_output=True, - attn_mask_type=AttnMaskType.causal - ) - # This is a hack to give us a reference to get_batch_pipe from within training.py - # We need to call model.set_batch_fn after deepspeed.initialize - model._megatron_batch_fn = get_batch_pipe + if is_prefix_lm(): + args.pretrain_causal_attention = False + print_rank_0('Using prefix LM UL2 model.') + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=AttnMaskType.custom + ) + model._megatron_batch_fn = get_batch_pipe + else: + args.pretrain_causal_attention = True + print_rank_0('Using decoder-only causal UL2 model.') + model = GPTModelPipe( + num_tokentypes=0, + parallel_output=True, + attn_mask_type=AttnMaskType.causal + ) + # This is a hack to give us a reference to get_batch_pipe from within training.py + # We need to call model.set_batch_fn after deepspeed.initialize + model._megatron_batch_fn = get_batch_pipe elif is_decoder_only(): print_rank_0('Using decoder-only UL2 model.') model = GPTModel( From 9490e50ed68253569c7bfef33eb5bc4b72f3a58e Mon Sep 17 00:00:00 2001 From: Niklas Muennighoff Date: Sat, 20 May 2023 00:00:59 +0300 Subject: [PATCH 36/37] tmp --- megatron/arguments.py | 3 ++- megatron/checkpointing.py | 6 +++--- tasks/eval_harness/evaluate.py | 14 ++++++++++---- 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/megatron/arguments.py b/megatron/arguments.py index d2499d149..1b101cbad 100644 --- a/megatron/arguments.py +++ b/megatron/arguments.py @@ -649,7 +649,8 @@ def _add_learning_rate_args(parser): 'from checkpoint and ignore input arguments.') group.add_argument('--universal-checkpoint', action='store_true', help='Loading a universal format checkpoint.') - + group.add_argument('--reset-progress', action='store_true', default=None, + help='Reset iteration to 0 & do not load args.') return parser diff --git a/megatron/checkpointing.py b/megatron/checkpointing.py index dacbec7dc..ebf93a986 100644 --- a/megatron/checkpointing.py +++ b/megatron/checkpointing.py @@ -342,7 +342,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True set_checkpoint_version(state_dict.get('checkpoint_version', 0)) # Set iteration. - if args.finetune or release: + if args.finetune or release or args.reset_progress: iteration = 0 else: try: @@ -361,7 +361,7 @@ def load_checkpoint(model, optimizer, lr_scheduler, load_arg='load', strict=True # Check arguments. 
assert args.consumed_train_samples == 0 assert args.consumed_valid_samples == 0 - if 'args' in state_dict: + if 'args' in state_dict and not args.reset_progress: checkpoint_args = state_dict['args'] if not args.universal_checkpoint: check_checkpoint_args(checkpoint_args) @@ -480,4 +480,4 @@ def _checkpoint_info(): return { "padded_vocab_size": args.padded_vocab_size, "original_vocab_size": tokenizer.vocab_size, - } \ No newline at end of file + } diff --git a/tasks/eval_harness/evaluate.py b/tasks/eval_harness/evaluate.py index 7e0cdb60e..67e95e1c9 100644 --- a/tasks/eval_harness/evaluate.py +++ b/tasks/eval_harness/evaluate.py @@ -390,6 +390,8 @@ def tasks_args(parser): group.add_argument('--intermed_results', default = False, action='store_true', help='Whether to print & write intermediate results for each task') group.add_argument('--bootstrap_iters', type=int, default=100000, help='How many iterations to use for stderr estimation') group.add_argument('--micro_bs_multiplier', type=int, default=1, help='Increase the global batch size to remove bubble when pipeline parallel') + group.add_argument('--fewshots', type=int, default=0, help='Num fewshots') + group.add_argument('--limit', type=int, default=None, help='Limit samples') return parser from megatron.global_vars import _parse_args @@ -398,6 +400,10 @@ def main(): # parse the megatron args. But wait with initalizing megatron. # avoid printing the arguments, since they will later be overridden. args = _parse_args(tasks_args) + if os.path.exists(args.results_path): + print("Exists ", args.results_path) + exit() + load_path = args.load model = load_ds_checkpoint_and_setup_megatron(args) @@ -422,11 +428,11 @@ def main(): global_results = {"results": {}, "versions": {}} timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') iteration_id = load_path.split("/")[-1].replace("/", "") - results_path = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}.json") + results_path = args.results_path#.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots.json") # Backup file in case of interruption during writing - results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_backup.json") + results_path_backup = args.results_path.replace(".json", f"_lm-eval_{iteration_id}_{timestamp}_{args.fewshots}shots_backup.json") for task_name, task in task_dict.items(): - results = evaluator.evaluate(adaptor, {task_name: task}, False, 0, None, bootstrap_iters=args.bootstrap_iters) + results = evaluator.evaluate(adaptor, {task_name: task}, False, args.fewshots, bootstrap_iters=args.bootstrap_iters, limit=args.limit) global_results["results"] = {**global_results["results"], **results["results"]} global_results["versions"] = {**global_results["versions"], **results["versions"]} if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: @@ -436,7 +442,7 @@ def main(): with open(results_path_backup, 'w') as outfile: json.dump(global_results, outfile, indent=4) else: - global_results = evaluator.evaluate(adaptor, task_dict, False, 0, None, bootstrap_iters=args.bootstrap_iters) + global_results = evaluator.evaluate(adaptor, task_dict, False, args.fewshots, bootstrap_iters=args.bootstrap_iters, limit=args.limit) if mpu.is_pipeline_last_stage() and mpu.get_tensor_model_parallel_rank() == 0: print(json.dumps(global_results, indent=2)) with open(args.results_path, 'w') as outfile: From 6936afba8e8f64e73792f41f9e44c39d2efb5c5c Mon Sep 17 00:00:00 2001 From: Niklas 
Muennighoff Date: Fri, 19 May 2023 23:20:40 +0200 Subject: [PATCH 37/37] Revert UL2 Tokenizer Changes --- megatron/tokenizer/tokenizer.py | 72 ++------------------------------- 1 file changed, 3 insertions(+), 69 deletions(-) diff --git a/megatron/tokenizer/tokenizer.py b/megatron/tokenizer/tokenizer.py index 7d0340ca2..09304b1dd 100644 --- a/megatron/tokenizer/tokenizer.py +++ b/megatron/tokenizer/tokenizer.py @@ -40,9 +40,7 @@ def build_tokenizer(args): vocab_extra_ids=args.vocab_extra_ids) elif args.tokenizer_type == 'GPT2BPETokenizer': assert args.merge_file is not None - tokenizer = _GPT2BPETokenizer( - args.vocab_file, args.merge_file, vocab_extra_ids=args.vocab_extra_ids - ) + tokenizer = _GPT2BPETokenizer(args.vocab_file, args.merge_file) elif args.tokenizer_type == "PretrainedFromHF": assert args.tokenizer_name_or_path is not None @@ -288,36 +286,13 @@ def additional_special_tokens(self, value): class _GPT2BPETokenizer(AbstractTokenizer): """Original GPT2 BPE tokenizer.""" - def __init__(self, vocab_file, merge_file, vocab_extra_ids=0): + def __init__(self, vocab_file, merge_file): name = 'GPT2 BPE' super().__init__(name) self.tokenizer = GPT2Tokenizer(vocab_file, merge_file, errors='replace', special_tokens=[], max_len=None) - self.eod_id = self.eos_token_id = self.tokenizer.encoder['<|endoftext|>'] - - self.bod_id = self.bos_token_id = self.tokenizer.encoder['[EOS]'] - self.sep_id = self.tokenizer.encoder['[SEP]'] - self.mask_id = self.tokenizer.encoder['[MASK]'] - self.pad_id = self.tokenizer.encoder['[PAD]'] - - additional_special_tokens = [] - self._additional_special_tokens = [] - additional_special_tokens.extend( - ["".format(i) for i in range(vocab_extra_ids)]) - self.add_additional_special_tokens(additional_special_tokens) - - def add_additional_special_tokens(self, tokens_list): - setattr(self, "additional_special_tokens", tokens_list) - for value in tokens_list: - self.add_token(value) - - def add_token(self, token): - if token not in self.vocab: - self.inv_vocab[self.vocab_size] = token - # self.vocab_size comes from len(vocab) - # and it will increase as we add elements - self.vocab[token] = self.vocab_size + self.eod_id = self.tokenizer.encoder['<|endoftext|>'] @property def vocab_size(self): @@ -341,35 +316,6 @@ def detokenize(self, token_ids): def eod(self): return self.eod_id - @property - def bod(self): - return self.bod_id - - @property - def sep(self): - return self.sep_id - - @property - def mask(self): - return self.mask_id - - @property - def pad(self): - return self.pad_id - - @property - def additional_special_tokens(self): - """ All the additional special tokens you may want to use (list of strings).""" - return self._additional_special_tokens - - @property - def additional_special_tokens_ids(self): - """ Ids of all the additional special tokens in the vocabulary (list of integers).""" - return [self.vocab.get(token) for token in self._additional_special_tokens] - - @additional_special_tokens.setter - def additional_special_tokens(self, value): - self._additional_special_tokens = value class _AutoTokenizer(AbstractTokenizer): """AutoTokenizer for Hf Pretrained model loading.""" @@ -442,18 +388,6 @@ def eos(self): candidate = self.tokenizer.eos_token_id return self._check_token_candidate(candidate) - @property - def bos_token_id(self): - """Id of the beginning of sentence token in the vocabulary.""" - candidate = self.tokenizer.bos_token_id - return self._check_token_candidate(candidate) - - @property - def eos_token_id(self): - """Id of the end of 
sentence token in the vocabulary.""" - candidate = self.tokenizer.eos_token_id - return self._check_token_candidate(candidate) - @property def additional_special_tokens_ids(self): """ All the additional special tokens you may want to use (list of strings)."""
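
Note on PATCH 35 above: the prefix-LM branch in pretrain_ul2.py switches from AttnMaskType.causal to AttnMaskType.custom. As a rough illustration of why (this is not the mask construction used in this repository), a prefix-LM mask allows bidirectional attention inside the prefix and causal attention afterwards, which a fixed causal mask cannot express:

    import numpy as np

    def toy_prefix_lm_mask(seq_len, prefix_len):
        # True means "may attend": causal everywhere, plus full visibility
        # of the prefix tokens. Purely illustrative.
        mask = np.tril(np.ones((seq_len, seq_len), dtype=bool))
        mask[:, :prefix_len] = True
        return mask

    print(toy_prefix_lm_mask(5, 2).astype(int))
    # [[1 1 0 0 0]
    #  [1 1 0 0 0]
    #  [1 1 1 0 0]
    #  [1 1 1 1 0]
    #  [1 1 1 1 1]]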