Commit 0623de2

Authored by hannanjgaws, jluntamazon, aws-bowencc, yazhom-aws and devesr-amzn
Sync internal repo to external Apr 15 2024 (#85)
* [module] Added better/faster checkpoint support with both sharded/whole checkpoints GitOrigin-RevId: 474757c3e65895084384e2e67d811f0423880fcf
* [generation-demo] Add --profile GitOrigin-RevId: 7ba35ca7839df4fd300686db3632eb23feffbd6e
* [module] Added the ability to download weights from huggingface hub repositories GitOrigin-RevId: 88661e675a31f2f1784056982717b3b15e04c922
* [automodel] Added NeuronAutoModelForCausalLM class which automatically loads architecture-specific classes GitOrigin-RevId: 18095eec9b978327e74cac2f24f872ca93c876f9
* [util] Avoid truncating tensors when padded size is less than current size. GitOrigin-RevId: 948fa370b543ac66976693e956bc024167060c8f
* [window-context] Add window context encoding GitOrigin-RevId: 2def53a09fb5cbd58e1842941be8abd8fca79988
* Add support for post-processing logits for on-device log-softmax GitOrigin-RevId: 416ad2a32cc89606872c62c366b89e51e506f861
* [generation-demo] Add torch-profile; use random input with --prompt_len GitOrigin-RevId: 62f39cfc1b857882034b307a7169bc3b2a46e292
* Fix conflict for BSH gated MLP GitOrigin-RevId: 34815f7a7261b9484e20aaf7e42f55a5de09fc87
* Fix conflict for BSH gated MLP (fix 2) GitOrigin-RevId: 8bee8b2595d88837341faad75630349f33ea55dd
* [speculation] Updated speculative generator to correctly insert last draft token into KV cache GitOrigin-RevId: 8a3eedd6ee516c641442cc1fa9c5639a491d097d
* [llama] Added support for tied embedding/head weights GitOrigin-RevId: 65dea75643e5bc041bca7bdd8677a6951e3ffccc
* [decoder] Added a warmup to all kernels to avoid unexpected initialization latency spikes GitOrigin-RevId: b62d7e7a2df4675e66354a6f56b527d0e332891f
* [hlo] Add support for bool literals in Python 3.11 GitOrigin-RevId: 07ad81981b19ccaf5c775f18b839f51690f5b2ae
* Add support for Mistral-7B-v0.2 with no sliding window GitOrigin-RevId: 205dcf4a5c8ce6e3c7c6c98e604e5ee3df509054
* [pp] Call self directly instead of self.forward to enable the forward hook GitOrigin-RevId: 908b7af05e1320a2ddded5734545b147cd7a20ba
* [module] Added support for model checkpoints using base model prefix GitOrigin-RevId: 8e8bd0d72de318d1f4cbc9859b44dc4e0d9b0514
* Fused KV cache update for deduplicating index calculation GitOrigin-RevId: 888778dfed05b873b7020b4208ed403edb87158d
* Add tags to models; this passes tags down to the Parallel kernel and prefixes context kernels with 'context' GitOrigin-RevId: cb9b6f19d2c0e33c441a2c770c8eb8a3c0a60a23
* Add warmup logic when profile gets called on each kernel GitOrigin-RevId: 8225ec846d5bcfb35741e4982b72921ce248a55b
* [decoder] Handle corner cases where the KV cache is None. GitOrigin-RevId: ac418340447a5b8b4fc8df0c46eb7e97d50befb3
* [decoder] Added prefix tags to more decoder types. Added less ambiguous tag parameter prefixes GitOrigin-RevId: 15a25fab827f0dc18c4b37d11517f9eb4c5cd875
* Set self.tag in base class. This is used by PipelineParallelProgram GitOrigin-RevId: 96ec44be9af30c88b607d8ddddb2ff5ff907ec5f
* Extend TP support of Mixtral-8x7B model from 8 to 16 and 32 and fix accuracy issue GitOrigin-RevId: 2f0bbfb4934c8396327d9579afc8d5284887fe94
* Support BSH attention layout for continuous batching GitOrigin-RevId: 9664ff667ce1baa7c7eaacb57ecbe81d75a82629
* [generation_demo] Additional flags, minor fixes GitOrigin-RevId: 6ab804d5012a5286ab2d0f612d68e6e24854ea36
* [generation_demo] Model-from-config support, minor fixes GitOrigin-RevId: 38c82a99d0f628fe4b791dcc4d51bf7d0c835303
* Require transformers>=4.36 GitOrigin-RevId: 6f8b1ef2e099d268188ae7ed3b055ea94f7cbf81
* Support on-device embedding for GPT2. Fix multi-layer model support for LLAMA and BLOOM and clean up forward function signatures. GitOrigin-RevId: e7ea681c09e9f712c41668f4cc1aa78104f467e3
* Fix HSB GroupNorm implementation GitOrigin-RevId: 372b2cca5fae0418e8a4cf346cf87133ac33ddf6
* [compile-cache] Add NEURONX_DUMP_TO_NOTEMP to dump artifacts from neuron cache GitOrigin-RevId: 140a46779a5e42806b901b3896c95251c9260010
* Fix forward call for Mixtral GitOrigin-RevId: abefd80fc726015ab133dd40425d3ba97d1ff2f3
* [Speculation] Use target model to generate leftover tokens GitOrigin-RevId: a654d7c01e43fffe9c3253850a75ea17d04aac7d
* Add block-diagonal causal mask for concatenated multi-prompt encoding GitOrigin-RevId: f806d511d0eac7bf766972bb43eed9308566b5e4
* Revert "[Speculation] Use target model to generate leftover tokens" GitOrigin-RevId: 76feacb3aa501239359e0c4939dacbdf311ca7e2
* [hlo] Support transposed attention output weight GitOrigin-RevId: 5d1d00c1773d57570248388943c7c567a5dac870
* [Speculation] Use target model to generate leftover tokens GitOrigin-RevId: 2f677787bbcea31e5738982f243183908e612a45
* [compiler] Fixed snapshot-steps functionality in executor. Fixed warmup to never snapshot GitOrigin-RevId: 343b2ce9549a9762f698fe2029ed28ad1245eb0f
* KV cache placement with slot_mapping GitOrigin-RevId: 18c217fd664e60bd7834702f64001eecfaa0d688
* Update quantization to support contraction dim GitOrigin-RevId: 397ded48a0850d98144d7a517f605d6ed3ac3691
* [mistral/mixtral/opt] Fix on-device embedding support for remaining models GitOrigin-RevId: a52299027aabeffd47f13c9a9c4b4df600e8a4c2
* Adjust profiling to pass through the number of NTFF files generated and remove tar file creation GitOrigin-RevId: 689616f8951e0e778aeae0dbdc12d04c267b6cbf
* Fuse QKV support for GQA GitOrigin-RevId: 938e3d267b77e4b5d225d214be47c190472472c7
* [Hlo] Change mmadd to dot_add and update it to support N-D tensors GitOrigin-RevId: 5a06e680dbda0ab33262127df0b4458f88d38eed
* Replace problem characters in tagged HLO; these translate directly into filenames (NEFF, NTFF) GitOrigin-RevId: fe658484f8c6cebe691ab623c45ee69e3427b5c9
* [Release 2.18] Change version to 1.0 GitOrigin-RevId: 2c948b4669ab83591925595fcbba87319971369d
* Added ntff_count_limit argument to generation_demo GitOrigin-RevId: 9649a8710ce4fe13cd22891f1e8ed983377c1cf6
* Merge branch 'mainline' of ssh://git.amazon.com:2222/pkg/KaenaTransformers into mainline GitOrigin-RevId: aa1051241c38be805cca604855f4531c35eda83c
* Merge branch 'mainline' of ssh://git.amazon.com:2222/pkg/KaenaTransformers into mainline GitOrigin-RevId: 31e2511deaf9b8aa2e4d983b1263377a74ab4cd8
* Merge branch 'mainline' of ssh://git.amazon.com:2222/pkg/KaenaTransformers into mainline GitOrigin-RevId: c5b1d876e387ab51e55c0b1c7a8320ab97a88777
* Fix position_offset param for OPTForSamplingNoEmbeddingHlo GitOrigin-RevId: 2aebac56ac5780fc7f88bc5f24bc7611562ef5bd
* Initial support for BSH cache layout GitOrigin-RevId: 68440f419057fbbb9befe02a5f95bbead9d4a24a
* Support BSH cache layout with BSH attention layout GitOrigin-RevId: 75e2807a57e509ef3dc9edae7c44421f911d8961
* [generation_demo] Remove dump flag GitOrigin-RevId: 23a774b75e2f8e224e511531703391ccf5ee4b10
* Re-enable prompt broadcasting in GPT2 (input batch 1, output batch N) GitOrigin-RevId: ec8772e01b121a2acd5526b9bbe3fa0bb9d334a9
* Fix return ranks for executors while using on-device sampling GitOrigin-RevId: 412113c5b10b0285fe7f3aafe82bae578b463024
* [Release 2.18] Change version to 0.10.x GitOrigin-RevId: fde8f715eca71e3ee1dac392b9e9b3527e6ee0cb
* [module] Allow safetensors checkpoint downloads to be explicitly disabled GitOrigin-RevId: fb34b5e8ace114f086728382c0f60358fb687f24
* Override attn_implementation as eager to skip the sdpa attention implementation GitOrigin-RevId: 2112b39c4a43cadc77ec551b6ed49ce33a7a72f1
* [hlo] Added primitive broadcasting. Added new operators. Added on-device speculative token selection GitOrigin-RevId: cf64e6130141d5d237cdc0277c8b6d3adc39630e
* LHS alignment for static batching (vectorize last_token_id) GitOrigin-RevId: 41dc716837dc1416eea4b60c198a39710cc153a7
* Fix cache_ids padding for Llama continuous batching and batch=1 speculative decoding GitOrigin-RevId: 7a28dcbe147e2a4400fe7ae7c8ac5cbad145e185
* Cherry-picks to 2.18 for multi-bucketing/multi-prompt for continuous batching GitOrigin-RevId: 74a3c4cdc4dc8ae7ee6a5a12a9734a084499224e
* Fix "Unit Tests - 2 Core" errors in test_neuron_auto_model.py mixtral tests GitOrigin-RevId: 8d5c47068ccbd2e87672243aedd56c46b887a8b9
* Fix generation_utils GitOrigin-RevId: fcb5254a8ecafbeef434cff5fce9075993cdd471

---------

Co-authored-by: Jonathan Lunt <[email protected]>
Co-authored-by: Bowen Chen <[email protected]>
Co-authored-by: Yuan Zhou <[email protected]>
Co-authored-by: Devesh Ratho <[email protected]>
Co-authored-by: Amer <[email protected]>
Co-authored-by: Mike Zhang <[email protected]>
Co-authored-by: Liangfu Chen <[email protected]>
Co-authored-by: Nicholas Waldron <[email protected]>
Co-authored-by: Shubham Chandak <[email protected]>
Co-authored-by: Wojciech Romaszkan <[email protected]>
Co-authored-by: Amulya Ballakur <[email protected]>
Co-authored-by: Dylan Geva <[email protected]>
Co-authored-by: Jeffrey Huynh <[email protected]>
Co-authored-by: Shashwat Srijan <[email protected]>
Co-authored-by: Prithvijit Chakrabarty <[email protected]>
1 parent 7a30b42 commit 0623de2

File tree

15 files changed: +524 -91 lines

src/transformers_neuronx/base.py

Lines changed: 77 additions & 16 deletions
@@ -73,11 +73,17 @@ def to_neuron(self):
         self.setup()

     # top level api
-    def enable_speculative_decoder(self,speculation_length:Optional[Union[List[int], int]]):
+    def enable_speculative_decoder(self, speculation_length: Optional[Union[List[int], int]], batch_sizes: Optional[Union[List[int], int]]=None):
         if isinstance(speculation_length, int):
-            speculation_length=[speculation_length]
+            speculation_length = [speculation_length]
+        if batch_sizes is None:
+            batch_sizes = self.decoder_param_set.batch_size
+        if isinstance(batch_sizes, int):
+            batch_sizes = [batch_sizes]
         for k in speculation_length:
-            self.decoder_lm_head_for_speculation[k]=self.decoder_param_set.init_speculative_decoder(unroll=self.unroll, buckets=self.token_buckets, model_obj=self, n_active_tokens=k)
+            for batch_size in batch_sizes:
+                self.decoder_lm_head_for_speculation[k, batch_size] = \
+                    self.decoder_param_set.init_speculative_decoder(unroll=self.unroll, buckets=self.token_buckets, model_obj=self, n_active_tokens=k, batch_size=batch_size)

     def enable_window_context_decoder(self, window_context_length:Optional[Union[List[int], int]], unroll):
         if isinstance(window_context_length, int):
@@ -164,7 +170,7 @@ def context(self, hidden, cache_ids, start_ids, last_token_id, *rest):
         Other arguments that are required by the model are contained in `rest`.
         """
         context_length = hidden.shape[1]
-        batch_size, = start_ids.shape
+        batch_size = start_ids.shape[0]

         if self.is_fid:
             # Fusion-In-Decoder context encoding
@@ -239,7 +245,7 @@ def context(self, hidden, cache_ids, start_ids, last_token_id, *rest):
             return logits, scores
         return logits

-    def _prepare_for_par_ctx_rhs_padding(self, input_ids):
+    def _prepare_for_par_ctx_rhs_padding(self, input_ids, cache_ids):
         """A helper to do rhs padding on prompt for parallel context encoding model
         i.e.
             input_ids = [[111, 222, 333]]
@@ -257,9 +263,12 @@ def _prepare_for_par_ctx_rhs_padding(self, input_ids):
         batch_size, context_length = input_ids.shape

         # if last_token_id not used, simply set to 0
-        last_token_id = torch.as_tensor(0, dtype=torch.int32)
+        if self.neuron_config.vectorize_last_token_id:
+            last_token_id = torch.zeros(batch_size, dtype=torch.int32)
+        else:
+            last_token_id = torch.as_tensor(0, dtype=torch.int32)
         if context_length == 1:
-            return input_ids, last_token_id
+            return input_ids, cache_ids, last_token_id

         # TODO: check context_buckets for compatibility with OPT
         if hasattr(self, "context_buckets"):
@@ -269,11 +278,38 @@ def _prepare_for_par_ctx_rhs_padding(self, input_ids):

         if estimate:
             # when context length is larger than estimate, last_token_id=estimate-1
-            last_token_id = torch.as_tensor(min(context_length - 1, estimate-1), dtype=torch.int32)
+            if self.neuron_config.vectorize_last_token_id:
+                last_token_id = cache_ids.max(dim=1).values
+            else:
+                last_token_id = torch.as_tensor(min(context_length - 1, estimate-1), dtype=torch.int32)
             if context_length < estimate:
                 input_ids = utils.pad(input_ids, 1, estimate, left=False)
+                cache_ids = self._pad_cache_ids(cache_ids, batch_size, context_length, estimate)
+
+        return input_ids, cache_ids, last_token_id

-        return input_ids, last_token_id
+    def _pad_cache_ids(self, cache_ids, batch_size, context_length, estimate):
+        if self.neuron_config.use_2d_cache_ids:
+            # TODO: fix cache_ids padding for batch speculative decoding
+            cache_ids = torch.arange(estimate, dtype=torch.long)
+            cache_ids = cache_ids.unsqueeze(0).expand(batch_size, estimate)
+        else:
+            if cache_ids is None:
+                cache_ids = torch.arange(estimate, dtype=torch.long)
+            else:
+                # Inputs: cache_ids = [16, 17], estimate = 512
+                #
+                # Process:
+                #   start_idx = 18, end_idx = 528 (= 512+16)
+                #   padded_elements = [18, 19, ..., 511, 512, 513, ..., 525, 526, 527]
+                #   cache_ids_pad = [16, 17, 18, 19, ..., 511, 512, 513, ..., 525, 526, 527]
+                #   cache_ids = [16, 17, 18, 19, ..., 511, 511, 511, ..., 511, 511, 511]
+                start_idx = cache_ids[-1].item() + 1
+                end_idx = estimate + start_idx - context_length
+                pad_elements = torch.arange(start_idx, end_idx, dtype=torch.long)
+                cache_ids_pad = torch.concat([cache_ids, pad_elements], dim=0)
+                cache_ids = torch.minimum(cache_ids_pad, torch.tensor(estimate-1, dtype=torch.long))
+        return cache_ids

     def _prepare_for_continuous_batching(self, input_ids, cache_ids=None, seq_ids=None):
         n_seqs, n_active_tokens = input_ids.shape
@@ -288,10 +324,33 @@ def _prepare_for_continuous_batching(self, input_ids, cache_ids=None, seq_ids=No
         if n_active_tokens > 1 and cache_ids.flatten()[0].item() == 0:
             # context encoding
             n_active_seqs, n_active_tokens = input_ids.shape
-            n_positions = self.context_buckets[-1]
+            continuous_batching_n_positions = bucket.find(self.context_buckets, n_active_tokens)
             assert n_active_seqs == cache_ids.shape[0], f"invalid n_active_seqs ({n_active_seqs} vs {cache_ids.shape[0]})"
-            assert n_active_tokens <= n_positions, f"invalid input prompt length ({n_active_tokens} <= {n_positions})"
-            cache_ids_pad = torch.zeros(n_active_seqs, n_positions, dtype=cache_ids.dtype, device='cpu')
+            assert n_active_tokens <= continuous_batching_n_positions, \
+                f"invalid input prompt length ({n_active_tokens} <= {continuous_batching_n_positions})"
+            cache_ids_pad = torch.zeros(n_active_seqs, continuous_batching_n_positions, dtype=cache_ids.dtype, device='cpu')
+            for seq_id in range(n_active_seqs):
+                cache_ids_pad[seq_id, :n_active_tokens] = cache_ids[seq_id, :n_active_tokens]
+            return input_ids, cache_ids_pad, seq_ids
+
+        elif n_active_tokens > 1 and cache_ids.flatten()[0].item() > 0:
+            # speculative forward
+            n_active_seqs, n_active_tokens = input_ids.shape
+            speculative_n_positions = bucket.find(self.context_buckets, n_active_tokens)
+            assert n_active_tokens <= speculative_n_positions, \
+                f"invalid input prompt length ({n_active_tokens} <= {speculative_n_positions})"
+            prompt_buckets = list(set([k for k, batch_size in self.decoder_lm_head_for_speculation.keys()]))
+            speculation_bucket = bucket.find(prompt_buckets, n_active_tokens)
+            # validate the speculative head was compiled for the given batch size
+            speculation_batches = [batch_size for (k, batch_size) in self.decoder_lm_head_for_speculation.keys()]
+            assert n_active_seqs in speculation_batches, \
+                f"invalid batch size for speculative forward ({n_active_seqs} not in {speculation_batches})"
+            # make cache ids 2d if needed and pad to match speculation bucket
+            if len(cache_ids.shape) == 1:
+                cache_ids = cache_ids.unsqueeze(0)
+            assert cache_ids.shape[0] == n_active_seqs, \
+                f"invalid n_active_seqs ({n_active_seqs} vs {cache_ids.shape[0]}) in speculative forward"
+            cache_ids_pad = torch.zeros(n_active_seqs, speculative_n_positions, dtype=cache_ids.dtype, device='cpu')
             for seq_id in range(n_active_seqs):
                 cache_ids_pad[seq_id, :n_active_tokens] = cache_ids[seq_id, :n_active_tokens]
             return input_ids, cache_ids_pad, seq_ids
@@ -311,7 +370,7 @@ def _preprocess(self, input_ids, start_ids=None, cache_ids=None):
         input_ids, cache_ids, start_ids = self._prepare_for_continuous_batching(input_ids, cache_ids, start_ids)

         # right pad the input_ids if neccessary
-        input_ids, last_token_id = self._prepare_for_par_ctx_rhs_padding(input_ids)
+        input_ids, cache_ids, last_token_id = self._prepare_for_par_ctx_rhs_padding(input_ids, cache_ids)

         # note: this context_length is after right padded
         batch_size, context_length = input_ids.shape
@@ -321,6 +380,8 @@ def _preprocess(self, input_ids, start_ids=None, cache_ids=None):

         if cache_ids is None:
             cache_ids = torch.arange(context_length, dtype=torch.int32)
+            if self.neuron_config.use_2d_cache_ids:
+                cache_ids = cache_ids.unsqueeze(0).expand(batch_size, context_length)

         if hasattr(self, "prefixed_length") and self.prefixed_length:
             cache_ids += self.prefixed_length
@@ -365,7 +426,7 @@ def _context_dynamic_batching(self, hidden, *args):
                 "input batch size ({input_batch_size}) not divisible by running batch size ({running_batch_size})"
             n_iters = input_batch_size // running_batch_size
             all_logits = []
-            cache_ids, start_ids = args[0], args[1]
+            cache_ids, start_ids, last_token_id = args[0], args[1], args[2]
             for iter_id in range(n_iters):
                 # Assuming HSB layout
                 start_idx = iter_id*running_batch_size
@@ -376,9 +437,9 @@ def _context_dynamic_batching(self, hidden, *args):
                 hidden_per_batch = hidden[:, :, start_idx:end_idx]
                 cache_ids_per_batch = cache_ids[start_idx:end_idx, :]
                 start_ids_per_batch = start_ids[start_idx:end_idx]
-                last_token_id = cache_ids_per_batch.max()
+                last_token_id_per_batch = last_token_id[start_idx:end_idx]
                 logits_per_batch = self.context(hidden_per_batch, cache_ids_per_batch,
-                                                start_ids_per_batch, last_token_id)
+                                                start_ids_per_batch, last_token_id_per_batch)
                 all_logits.append(logits_per_batch)
             logits = torch.cat(all_logits, dim=2)
         else:
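
For orientation, below is a minimal standalone sketch of the 1D cache_ids right-padding that the new _pad_cache_ids helper performs; the worked values come from the diff's own comment, and the snippet is illustrative only, not part of the commit.

import torch

def pad_cache_ids_1d(cache_ids, context_length, estimate):
    # Pad 1D cache_ids up to the selected bucket size (estimate), clamping the
    # padded positions to the last valid slot so they never index past the cache.
    start_idx = cache_ids[-1].item() + 1
    end_idx = estimate + start_idx - context_length
    pad_elements = torch.arange(start_idx, end_idx, dtype=torch.long)
    cache_ids_pad = torch.concat([cache_ids, pad_elements], dim=0)
    return torch.minimum(cache_ids_pad, torch.tensor(estimate - 1, dtype=torch.long))

# Worked example from the diff: cache_ids = [16, 17], estimate = 512
padded = pad_cache_ids_1d(torch.tensor([16, 17], dtype=torch.long), context_length=2, estimate=512)
assert padded.shape == (512,)
assert padded[0].item() == 16 and padded[-1].item() == 511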

src/transformers_neuronx/config.py

Lines changed: 10 additions & 2 deletions
@@ -93,10 +93,10 @@ def __init__(self, **kargs):
         self.cast_logits_dtype = kargs.pop('cast_logits_dtype', 'float32')
         self.fuse_qkv = kargs.pop('fuse_qkv', False)
         self.continuous_batching = kargs.pop('continuous_batching', None)
-        self.use_2d_cache_ids = kargs.pop('use_2d_cache_ids', False)
+        self.lhs_aligned = kargs.pop('use_2d_cache_ids', False) or kargs.pop('lhs_aligned', False)
         if self.continuous_batching:
             # Force using 2D cache_ids layout for continuous batching.
-            self.use_2d_cache_ids = True
+            self.lhs_aligned = True
         self.attention_layout = kargs.pop('attention_layout', constants.LAYOUT_HSB)
         self.cache_layout = kargs.pop('cache_layout', constants.LAYOUT_SBH)
         self.collectives_layout = kargs.pop('collectives_layout', constants.LAYOUT_HSB)
@@ -122,6 +122,14 @@ def __init__(self, **kargs):

         self.layer_partition = {}

+    @property
+    def use_2d_cache_ids(self):
+        return self.lhs_aligned
+
+    @property
+    def vectorize_last_token_id(self):
+        return self.lhs_aligned
+
     def is_valid_layer(self, layer_id):
         if not self.is_pp():
             return True
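
A hedged sketch of the intended aliasing: either the legacy use_2d_cache_ids flag or the new lhs_aligned flag (or enabling continuous batching) sets lhs_aligned, and both read-only properties report it. The stand-in class below mirrors the diff rather than importing the real NeuronConfig.

class _NeuronConfigSketch:
    """Minimal stand-in mirroring the diff above; not the real NeuronConfig."""
    def __init__(self, **kargs):
        self.continuous_batching = kargs.pop('continuous_batching', None)
        self.lhs_aligned = kargs.pop('use_2d_cache_ids', False) or kargs.pop('lhs_aligned', False)
        if self.continuous_batching:
            # Continuous batching forces the left-aligned (2D cache_ids) layout.
            self.lhs_aligned = True

    @property
    def use_2d_cache_ids(self):
        return self.lhs_aligned

    @property
    def vectorize_last_token_id(self):
        return self.lhs_aligned

# The legacy flag, the new flag, and continuous batching all land on the same attribute.
assert _NeuronConfigSketch(use_2d_cache_ids=True).lhs_aligned
assert _NeuronConfigSketch(lhs_aligned=True).use_2d_cache_ids
assert _NeuronConfigSketch(continuous_batching=object()).vectorize_last_token_id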

src/transformers_neuronx/decoder.py

Lines changed: 11 additions & 9 deletions
@@ -215,12 +215,13 @@ def init_token_decoder(self,unroll, buckets, model_obj):
         decoder_lm_head.add_embedding_builder(self.hlo_builder.embedding)
         return decoder_lm_head

-    def init_speculative_decoder(self, unroll, buckets, model_obj, n_active_tokens):
-        decoder_lm_head = DecoderLmHeadForSamplingNoEmbedding(
+    def init_speculative_decoder(self, unroll, buckets, model_obj, n_active_tokens, batch_size=None):
+        cls = type(self)
+        decoder_lm_head = cls(
             tp_degree=self.tp_degree,
             n_positions_list=buckets,
             n_active_tokens=n_active_tokens,
-            batch_size=self.batch_size,
+            batch_size=self.batch_size if batch_size is None else batch_size,
             attention_head_size=self.attention_head_size,
             amp=self.amp,
             num_layers=self.num_layers,
@@ -400,9 +401,9 @@ def forward_single(self, *inputs):
        etc.
        """
        _, cache_ids, start_ids, *_ = inputs
-        batch_size, = start_ids.shape
-        # In continuous batching, take largest cache_id and use the power-of-two policy to find the appropriate bucket.
-        if self.neuron_config and self.neuron_config.continuous_batching:
+        batch_size = start_ids.shape[0]
+        # With 2D cache_ids, take largest cache_id and use the power-of-two policy to find the appropriate bucket.
+        if self.neuron_config and self.neuron_config.use_2d_cache_ids:
            bucket_id = 0
            batch_size, _ = cache_ids.shape
        else:
@@ -416,7 +417,7 @@ def forward(self, *inputs):

    def forward(self, *inputs):
        hidden, cache_ids, start_ids, *_ = inputs
-        batch_size, = start_ids.shape
+        batch_size = start_ids.shape[0]
        sequence_dim, *_ = self.inputs_sdim
        sequence_length = hidden.shape[sequence_dim]
        if sequence_length == 1:
@@ -459,8 +460,9 @@ def embed_positions_ids(self, position_ids, start_ids=None, batch_size=None):
            batch_size = self.batch_size[0]
        if start_ids is None:
            return position_ids, torch.zeros([batch_size], dtype=torch.int32)
-        position_ids = position_ids.unsqueeze(0).repeat(batch_size, 1)
-        position_ids -= start_ids.unsqueeze(1)
+        if not self.neuron_config.use_2d_cache_ids:
+            position_ids = position_ids.unsqueeze(0).repeat(batch_size, 1)
+            position_ids -= start_ids.unsqueeze(1)
        position_ids.masked_fill_(position_ids < 0, 0)
        return position_ids, start_ids
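
A small illustrative sketch of the batch-size selection that forward_single now performs: with left-aligned (2D) cache_ids the batch size comes from the cache_ids tensor itself, otherwise from start_ids.shape[0]. The helper and tensor shapes below are hypothetical, shown only to make the branch explicit.

import torch

def select_batch_size(start_ids, cache_ids, use_2d_cache_ids):
    # Mirrors the forward_single change: with 2D (left-aligned) cache_ids the batch
    # size is read from cache_ids itself; otherwise start_ids provides it via shape[0].
    if use_2d_cache_ids:
        batch_size, _ = cache_ids.shape
    else:
        batch_size = start_ids.shape[0]
    return batch_size

# Hypothetical shapes for illustration only.
start_ids = torch.zeros(4, dtype=torch.int32)
cache_ids_2d = torch.zeros(4, 1, dtype=torch.int32)
assert select_batch_size(start_ids, cache_ids_2d, use_2d_cache_ids=True) == 4
assert select_batch_size(start_ids, None, use_2d_cache_ids=False) == 4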

src/transformers_neuronx/generation_utils.py

Lines changed: 27 additions & 16 deletions
@@ -22,10 +22,10 @@ def __init__(self, config, model):
        super().__init__(config)
        self.model = model
        self.config = config
-        self.cur_len = 0
+        self.cur_len = torch.zeros(1, dtype=torch.long)

    def reset_generation(self):
-        self.cur_len = 0
+        self.cur_len = torch.zeros(1, dtype=torch.long)

    def forward(self, input_ids, cache_ids, start_ids=None, output_hidden_states=False, output_attentions=False,
            attention_mask=None, return_dict=False):
@@ -69,23 +69,34 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_
        if attention_mask is not None:
            _, start_ids = attention_mask.max(axis=1)

-        if self.cur_len > 0:
+        if (self.cur_len > 0).any().item():
            input_ids = input_ids[:, -1:]
-            cache_ids = torch.as_tensor([self.cur_len], dtype=torch.int32)
-
-        continuous_batching = self.model.neuron_config.continuous_batching is not None
-        if continuous_batching:
-            if self.cur_len > 0:
-                batch_size = input_ids.shape[0]
-                cache_ids = torch.as_tensor([self.cur_len]*batch_size, dtype=torch.int32).reshape(batch_size, 1)
-                start_ids = None
-            else:
-                cache_ids = torch.arange(input_ids.shape[-1]) * attention_mask
-                start_ids = torch.arange(input_ids.shape[0])

-        # no need to prepare cache_ids for parallel context encoding here as forward will pad input_ids and generate legalized cache_ids
+        if self.model.neuron_config.use_2d_cache_ids:
+            # 2D cache_ids
+            batch_size, context_length = attention_mask.shape
+            start_ids = torch.arange(input_ids.shape[0])
+            if (self.cur_len > 0).any().item():
+                # token generation (aka decoding) with 2D cache_ids
+                index_map = torch.arange(context_length).unsqueeze(0).expand(batch_size, context_length)
+                cache_ids = (index_map * attention_mask).max(dim=1).values.unsqueeze(-1)
+                self.cur_len = cache_ids.squeeze(-1)
+            else:
+                # context encoding (aka prefill) with 2D cache_ids
+                cache_ids = torch.arange(context_length) * attention_mask
+                self.cur_len = cache_ids.max(dim=1).values
+        else:
+            start_ids = None
+            if (self.cur_len > 0).any().item():
+                # token generation (aka decoding) with 1D cache_ids
+                cache_ids = self.cur_len
+                self.cur_len = cache_ids + 1
+            else:
+                # context encoding (aka prefill) with 1D cache_ids
+                batch_size, context_length = input_ids.shape
+                cache_ids = torch.arange(context_length)
+                self.cur_len = torch.tensor([context_length], dtype=torch.long)

-        self.cur_len += input_ids.shape[-1]
        model_inputs = {
            "input_ids": input_ids,
            "cache_ids": cache_ids,

src/transformers_neuronx/gpt2/model.py

Lines changed: 2 additions & 2 deletions
@@ -442,7 +442,7 @@ def forward(self, input_ids, cache_ids=None, start_ids=None):
        is_context_encode = context_length > 1
        estimate = bucket.find(self.context_buckets, context_length)

-        inputs, last_token_id = self._prepare_for_par_ctx_rhs_padding(input_ids)
+        inputs, cache_ids, last_token_id = self._prepare_for_par_ctx_rhs_padding(input_ids, cache_ids)
        batch_size, context_length = inputs.shape

        model = self.decoder_lm_head
@@ -529,7 +529,7 @@ def speculative_forward(self, input_ids, cache_ids=None, start_ids=None, specula
        if speculation_length is None:
            model=self.decoder_lm_head
        else:
-            model=self.decoder_lm_head_for_speculation[speculation_length]
+            model=self.decoder_lm_head_for_speculation[speculation_length, batch_size]

        # Compute the window starting index for specific mask patterns
        # For other patterns we pass in a default value of 0, it won't be used
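
A hedged sketch of the lookup convention this file now relies on: speculative decoder heads are keyed by (speculation_length, batch_size) rather than by speculation_length alone. The dictionary entries below are hypothetical placeholders.

# The speculative-heads dictionary is keyed by (speculation_length, batch_size).
decoder_lm_head_for_speculation = {}             # stand-in for the model attribute
decoder_lm_head_for_speculation[4, 1] = "k4_b1"  # hypothetical compiled heads
decoder_lm_head_for_speculation[4, 2] = "k4_b2"

speculation_length, batch_size = 4, 2
model = decoder_lm_head_for_speculation[speculation_length, batch_size]
assert model == "k4_b2"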
