diff --git a/configs/common/train.py b/configs/common/train.py
index 2201b615c..2c1911a9c 100644
--- a/configs/common/train.py
+++ b/configs/common/train.py
@@ -128,6 +128,8 @@
         # NOTE: if it is None, LiBai will automatically set pipeline_stage_id
         # `auto_pipeline_stage_id` and `actual_pipeline_stage_id` will be saved in `config.yaml`
         custom_pipeline_stage_id=None,
+        # set device type
+        device_type="cuda",
     ),
 
     # the device type of input tensors for model, defaults to "cuda".
diff --git a/libai/inference/generator/generation_utils.py b/libai/inference/generator/generation_utils.py
index 73df306b6..8ef9a08b9 100644
--- a/libai/inference/generator/generation_utils.py
+++ b/libai/inference/generator/generation_utils.py
@@ -55,6 +55,9 @@
 
 
 class Generator:
+    dist_utils = dist.get_dist_util()
+    device_type = dist_utils.device_type
+
     def _prepare_model_inputs(
         self,
         inputs: Optional[flow.Tensor] = None,
@@ -101,7 +104,7 @@ def _prepare_input_ids_for_generation(
                     shape,
                     dtype=flow.long,
                     sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                    placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+                    placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
                 )
                 * -100
             )
@@ -113,7 +116,7 @@ def _prepare_input_ids_for_generation(
                 (1, 1),
                 dtype=flow.long,
                 sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+                placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
             )
             * bos_token_id
         )
@@ -137,7 +140,7 @@ def _prepare_attention_mask_for_generation(
             inputs.shape[:2],
             dtype=flow.bool,
             sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-            placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+            placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
         )
 
     def _prepare_encoder_decoder_kwargs_for_generation(
@@ -171,7 +174,7 @@ def _prepare_decoder_input_ids_for_generation(
                 (batch_size, 1),
                 dtype=flow.long,
                 sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+                placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
             )
             * decoder_start_token_id
         )
@@ -195,6 +198,7 @@ def _expand_inputs_for_generation(
         is_encoder_decoder: bool = False,
         attention_mask: Optional[flow.Tensor] = None,
         encoder_outputs: Optional[flow.Tensor] = None,
+        device_type="cuda",
         **model_kwargs,
     ):
         expanded_return_idx = (
@@ -202,7 +206,7 @@ def _expand_inputs_for_generation(
         )
         expanded_return_idx = expanded_return_idx.to_global(
             sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-            placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+            placement=flow.placement(device_type, list(range(dist.get_world_size()))),
         )
 
         input_ids = input_ids.index_select(0, expanded_return_idx)
@@ -589,12 +593,12 @@ def multinomial_sample(
             probs = nn.functional.softmax(next_token_scores, dim=-1)
             probs = probs.to_global(
                 sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+                placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
             ).to_local()
             next_tokens = flow.multinomial(probs, num_samples=1).squeeze(1)
             next_tokens = next_tokens.to_global(
                 sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+                placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
             )
             unfinished_sequences = unfinished_sequences.to_global(
                 sbp=next_tokens.sbp, placement=next_tokens.placement
@@ -687,7 +691,7 @@ def beam_search(
             (batch_size, num_beams),
             dtype=flow.float,
             sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-            placement=flow.placement("cuda", list(range(dist.get_world_size()))),
+            placement=flow.placement(self.device_type, list(range(dist.get_world_size()))),
         )
         beam_scores[:, 1:] = -1e9
         beam_scores = beam_scores.view((batch_size * num_beams,))
@@ -1019,6 +1023,7 @@ def generate(
                 input_ids,
                 expand_size=num_return_sequences,
                 is_encoder_decoder=self.cfg.is_encoder_decoder,
+                device_type=self.device_type,
                 **model_kwargs,
             )
 
@@ -1057,6 +1062,7 @@ def generate(
                 input_ids,
                 expand_size=num_beams,
                 is_encoder_decoder=self.cfg.is_encoder_decoder,
+                device_type=self.device_type,
                 **model_kwargs,
             )
 
diff --git a/libai/tokenizer/tokenization_base.py b/libai/tokenizer/tokenization_base.py
index d5bd5a9ea..ee2018ff7 100644
--- a/libai/tokenizer/tokenization_base.py
+++ b/libai/tokenizer/tokenization_base.py
@@ -138,6 +138,8 @@ class PreTrainedTokenizer(object):
     pretrained_vocab_files_map = {}
     pretrained_init_configuration = {}
     max_model_input_sizes = {}
+    dist_utils = dist.get_dist_util()
+    device_type = dist_utils.device_type
 
     SPECIAL_TOKENS_ATTRIBUTES = [
         "bos_token",
@@ -783,7 +785,8 @@ def convert_to_tensors(self, token_ids, return_tensors=None, is_global=False, **
         elif is_global:
             sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]))
             placement = kwargs.get(
-                "placement", flow.placement("cuda", list(range(dist.get_world_size())))
+                "placement",
+                flow.placement(self.device_type, list(range(dist.get_world_size()))),
             )
             return_token_ids = flow.tensor(
                 token_ids, sbp=sbp, placement=placement, dtype=flow.long
diff --git a/libai/utils/distributed.py b/libai/utils/distributed.py
index e7914a0ad..1c03dc21d 100644
--- a/libai/utils/distributed.py
+++ b/libai/utils/distributed.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import logging
+import os
 
 import dill
 import numpy as np
@@ -438,7 +439,8 @@ def convert_to_distributed_default_setting(t):
 def ttol(tensor, pure_local=False, ranks=None):
     """Global tensor to local tensor."""
     if tensor.is_global:
-        placement = tensor.placement if not ranks else flow.placement("cuda", ranks)
+        device_type = os.getenv("DEVICE_TYPE", "cuda")
+        placement = tensor.placement if not ranks else flow.placement(device_type, ranks)
         if pure_local:
             tensor = tensor.to_global(placement=placement).to_local()
         else:
diff --git a/projects/MagicPrompt/layers/attention_layer.py b/projects/MagicPrompt/layers/attention_layer.py
index 114172071..7f8292b85 100644
--- a/projects/MagicPrompt/layers/attention_layer.py
+++ b/projects/MagicPrompt/layers/attention_layer.py
@@ -208,12 +208,20 @@ def forward(
             causal_mask = causal_mask.repeat(attention_scores.size(0), 1, 1, 1)
             causal_mask = causal_mask.to_global(placement=attention_scores.placement)
             fill_value = flow.finfo(attention_scores.dtype).min
-            mask_value = flow.ones(
-                causal_mask.size(),
-                dtype=attention_scores.dtype,
-                sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
-                placement=attention_scores.placement,
-            ).fill_(fill_value)
+            if causal_mask.shape[0] == 1:
+                mask_value = flow.ones(
+                    causal_mask.size(),
+                    dtype=attention_scores.dtype,
+                    sbp=dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast]),
+                    placement=attention_scores.placement,
+                ).fill_(fill_value)
+            else:
+                mask_value = flow.ones(
+                    causal_mask.size(),
+                    dtype=attention_scores.dtype,
+                    sbp=dist.get_nd_sbp([flow.sbp.split(0), flow.sbp.broadcast]),
+                    placement=attention_scores.placement,
+                ).fill_(fill_value)
             attention_scores = flow.where(causal_mask, attention_scores, mask_value)
 
             if attention_mask is not None: