
Commit c8d6bdc

Sync internal repo to external June 28 2024 (#90)
* [module] Added better/faster checkpoint support with both sharded/whole checkpoints GitOrigin-RevId: 474757c3e65895084384e2e67d811f0423880fcf
* [generation-demo] Add --profile GitOrigin-RevId: 7ba35ca7839df4fd300686db3632eb23feffbd6e
* [module] Added the ability to download weights from huggingface hub repositories GitOrigin-RevId: 88661e675a31f2f1784056982717b3b15e04c922
* [automodel] Added NeuronAutoModelForCausalLM class which automatically loads architecture-specific classes GitOrigin-RevId: 18095eec9b978327e74cac2f24f872ca93c876f9
* [util] Avoid truncating tensors when padded size is less than current size. GitOrigin-RevId: 948fa370b543ac66976693e956bc024167060c8f
* [window-context] Add window context encoding GitOrigin-RevId: 2def53a09fb5cbd58e1842941be8abd8fca79988
* Add support for post-processing logits for on-device log-softmax GitOrigin-RevId: 416ad2a32cc89606872c62c366b89e51e506f861
* [generation-demo] Add torch-profile; Use random input with --prompt_len GitOrigin-RevId: 62f39cfc1b857882034b307a7169bc3b2a46e292
* fix conflict for bsh gated mlp GitOrigin-RevId: 34815f7a7261b9484e20aaf7e42f55a5de09fc87
* fix conflict for bsh gated mlp fix2 GitOrigin-RevId: 8bee8b2595d88837341faad75630349f33ea55dd
* [speculation] Updated speculative generator to correctly insert last draft token into KV cache GitOrigin-RevId: 8a3eedd6ee516c641442cc1fa9c5639a491d097d
* [llama] Added support for tied embedding/head weights GitOrigin-RevId: 65dea75643e5bc041bca7bdd8677a6951e3ffccc
* [decoder] Added a warmup to all kernels to avoid unexpected initialization latency spikes GitOrigin-RevId: b62d7e7a2df4675e66354a6f56b527d0e332891f
* [hlo] add support for bool literals in Python 3.11 GitOrigin-RevId: 07ad81981b19ccaf5c775f18b839f51690f5b2ae
* Add support for Mistral-7B-v0.2 for no sliding window GitOrigin-RevId: 205dcf4a5c8ce6e3c7c6c98e604e5ee3df509054
* [pp] call self directly instead of self.forward to enable forward hook GitOrigin-RevId: 908b7af05e1320a2ddded5734545b147cd7a20ba
* [module] Added support for model checkpoints using base model prefix GitOrigin-RevId: 8e8bd0d72de318d1f4cbc9859b44dc4e0d9b0514
* Fused KV cache update for deduplicating index calculation GitOrigin-RevId: 888778dfed05b873b7020b4208ed403edb87158d
* Add tags to models, this should pass tags down to Parallel kernel, prefix context kernels with 'context' GitOrigin-RevId: cb9b6f19d2c0e33c441a2c770c8eb8a3c0a60a23
* Add warmup logic when profile gets called on each kernel GitOrigin-RevId: 8225ec846d5bcfb35741e4982b72921ce248a55b
* [decoder] Handle corner cases where the KV cache is None. GitOrigin-RevId: ac418340447a5b8b4fc8df0c46eb7e97d50befb3
* [decoder] Added prefix tags to more decoder types. Added less ambiguous tag parameter prefixes GitOrigin-RevId: 15a25fab827f0dc18c4b37d11517f9eb4c5cd875
* Set self.tag in base class. This is used by PipelineParallelProgram GitOrigin-RevId: 96ec44be9af30c88b607d8ddddb2ff5ff907ec5f
* Extend TP support of Mixtral-8x7B model from 8 to 16 and 32 and fix accuracy issue GitOrigin-RevId: 2f0bbfb4934c8396327d9579afc8d5284887fe94
* support BSH attention layout for continuous batching GitOrigin-RevId: 9664ff667ce1baa7c7eaacb57ecbe81d75a82629
* [generation_demo] additional flags, minor fixes GitOrigin-RevId: 6ab804d5012a5286ab2d0f612d68e6e24854ea36
* [generation_demo] model from config support, minor fixes GitOrigin-RevId: 38c82a99d0f628fe4b791dcc4d51bf7d0c835303
* Require transformers>=4.36 GitOrigin-RevId: 6f8b1ef2e099d268188ae7ed3b055ea94f7cbf81
* Support on-device embedding for GPT2.
  Fix multi-layer model support for LLAMA and BLOOM and clean up forward function signatures. GitOrigin-RevId: e7ea681c09e9f712c41668f4cc1aa78104f467e3
* Fixing HSB GroupNorm implementation GitOrigin-RevId: 372b2cca5fae0418e8a4cf346cf87133ac33ddf6
* [compile-cache] Add NEURONX_DUMP_TO_NOTEMP to dump artifacts from neuron cache GitOrigin-RevId: 140a46779a5e42806b901b3896c95251c9260010
* Fix forward call for Mixtral GitOrigin-RevId: abefd80fc726015ab133dd40425d3ba97d1ff2f3
* [Speculation] Use target model to generate leftover tokens GitOrigin-RevId: a654d7c01e43fffe9c3253850a75ea17d04aac7d
* add block diagonal causal mask for concatenated multi-prompt encoding GitOrigin-RevId: f806d511d0eac7bf766972bb43eed9308566b5e4
* Revert [Speculation] Use target model to generate leftover tokens GitOrigin-RevId: 76feacb3aa501239359e0c4939dacbdf311ca7e2
* [hlo] Support transposed attention output weight GitOrigin-RevId: 5d1d00c1773d57570248388943c7c567a5dac870
* [Speculation] Use target model to generate leftover tokens GitOrigin-RevId: 2f677787bbcea31e5738982f243183908e612a45
* [compiler] Fixed snapshot steps functionality in executor. Fixed warmup to never snapshot GitOrigin-RevId: 343b2ce9549a9762f698fe2029ed28ad1245eb0f
* KV cache placement with slot_mapping GitOrigin-RevId: 18c217fd664e60bd7834702f64001eecfaa0d688
* Update quantization to support contraction dim GitOrigin-RevId: 397ded48a0850d98144d7a517f605d6ed3ac3691
* [mistral/mixtral/opt] Fix on-device embedding support for remaining models GitOrigin-RevId: a52299027aabeffd47f13c9a9c4b4df600e8a4c2
* Adjust profiling to pass through number of NTFF files generated and remove tar file creation GitOrigin-RevId: 689616f8951e0e778aeae0dbdc12d04c267b6cbf
* Fuse QKV support for GQA GitOrigin-RevId: 938e3d267b77e4b5d225d214be47c190472472c7
* [Hlo] Change mmadd to dot_add and update it to support N-D tensors GitOrigin-RevId: 5a06e680dbda0ab33262127df0b4458f88d38eed
* Replace problem characters in tagged HLO.
  This directly translates to filenames (NEFF, NTFF) GitOrigin-RevId: fe658484f8c6cebe691ab623c45ee69e3427b5c9
* [Release 2.18] Change version to 1.0 GitOrigin-RevId: 2c948b4669ab83591925595fcbba87319971369d
* added ntff_count_limit argument to generation_demo GitOrigin-RevId: 9649a8710ce4fe13cd22891f1e8ed983377c1cf6
* Merge branch 'mainline' of ssh://git.amazon.com:2222/pkg/KaenaTransformers into mainline GitOrigin-RevId: aa1051241c38be805cca604855f4531c35eda83c
* Merge branch 'mainline' of ssh://git.amazon.com:2222/pkg/KaenaTransformers into mainline GitOrigin-RevId: 31e2511deaf9b8aa2e4d983b1263377a74ab4cd8
* Merge branch 'mainline' of ssh://git.amazon.com:2222/pkg/KaenaTransformers into mainline GitOrigin-RevId: c5b1d876e387ab51e55c0b1c7a8320ab97a88777
* Fix position_offset param for OPTForSamplingNoEmbeddingHlo GitOrigin-RevId: 2aebac56ac5780fc7f88bc5f24bc7611562ef5bd
* initial support for BSH cache layout GitOrigin-RevId: 68440f419057fbbb9befe02a5f95bbead9d4a24a
* support BSH cache layout with BSH attention layout GitOrigin-RevId: 75e2807a57e509ef3dc9edae7c44421f911d8961
* [generation_demo] remove dump flag GitOrigin-RevId: 23a774b75e2f8e224e511531703391ccf5ee4b10
* Reenable prompt broadcasting in GPT2 (input batch 1, output batch N) GitOrigin-RevId: ec8772e01b121a2acd5526b9bbe3fa0bb9d334a9
* Fix return ranks for executors while using on-device sampling GitOrigin-RevId: 412113c5b10b0285fe7f3aafe82bae578b463024
* [Release 2.18] Change version to 0.10.x GitOrigin-RevId: fde8f715eca71e3ee1dac392b9e9b3527e6ee0cb
* [module] Allow safetensors checkpoint downloads to be explicitly disabled GitOrigin-RevId: fb34b5e8ace114f086728382c0f60358fb687f24
* Override attn_implementation as eager to skip sdpa attn implementation GitOrigin-RevId: 2112b39c4a43cadc77ec551b6ed49ce33a7a72f1
* [hlo] Added primitive broadcasting. Added new operators. Added on-device speculative token selection GitOrigin-RevId: cf64e6130141d5d237cdc0277c8b6d3adc39630e
* LHS alignment for static batching (vectorize last_token_id) GitOrigin-RevId: 41dc716837dc1416eea4b60c198a39710cc153a7
* fix cache_ids padding for llama CB and batch=1 SD GitOrigin-RevId: 7a28dcbe147e2a4400fe7ae7c8ac5cbad145e185
* Cherry-picks to 2.18 for multi-bucketing/multi-prompt for continuous batching GitOrigin-RevId: 74a3c4cdc4dc8ae7ee6a5a12a9734a084499224e
* Fix "Unit Tests - 2 Core" errors in test_neuron_auto_model.py mixtral tests GitOrigin-RevId: 8d5c47068ccbd2e87672243aedd56c46b887a8b9
* fix generation_utils GitOrigin-RevId: fcb5254a8ecafbeef434cff5fce9075993cdd471
* support BSH cache layout with BSH attention layout GitOrigin-RevId: 5868197a4ed26eec2346b23acbc1c4ccf764f826
* Add the ability to return arbitrary tensors for debugging GitOrigin-RevId: fdfc3c19a864a19a578b109e95c003f23b98a93c
* [generation_demo] remove dump flag GitOrigin-RevId: 4da57bd6b09f606f5d1f0d290965ef7ff39bde2c
* fix pp due to the change of HloNeuronProgram GitOrigin-RevId: 19ea3848b7ba66bd5a216f25052001b4d150c67e
* Changes to make simple_sample work with speculative decoding GitOrigin-RevId: 20164ebedb6ef274032fea570bfa52a385e75b40
* Add QKV weight tiling for fused QKV GitOrigin-RevId: da6a9ede322d529215879b96356778269460cc3f
* Override attn_implementation as eager to skip sdpa attn implementation GitOrigin-RevId: 1705e85c4cec9c665acd74b37fc5a54b98e9b405
* Revert "fix pp due to the change of HloNeuronProgram" This reverts commit 19ea3848b7ba66bd5a216f25052001b4d150c67e.
  GitOrigin-RevId: fdbea022a128256e661bddc23a6ad709570a8f96
* Revert "Add the ability to return arbitrary tensors for debugging" This reverts commit fdfc3c19a864a19a578b109e95c003f23b98a93c. GitOrigin-RevId: 584a49ec3ac28754bfac83a203e66cbf9f7d333d
* Merge branch 'mainline' of ssh://git.amazon.com:2222/pkg/KaenaTransformers into mainline GitOrigin-RevId: d0d79eba5c29edbe672ac58ee250bcfa20bc62ad
* Merge branch 'mainline' of ssh://git.amazon.com:2222/pkg/KaenaTransformers into mainline GitOrigin-RevId: 148afab8f91d523c7436af46cd2d556546d96692
* redo fix for DecoderProgramMultiLayer constructor API change GitOrigin-RevId: 008f9c9568fdf60b2390dde2458944532bd12d5a
* Merge branch 'mainline' of ssh://git.amazon.com:2222/pkg/KaenaTransformers into mainline GitOrigin-RevId: 8f4d0c97e4fdbe33c3458cb13ac1a74661218b4b
* [llama] support qk norm GitOrigin-RevId: 9f395a3cecd822e83fe56653afe99b4930a22dd6
* [executor] Added the ability to support concatenated outputs using new runtime integration GitOrigin-RevId: 6c43d2ebd2be15db4830f751708c1a9184c7c6ea
* [decoder] Decoder init_xxx_decoder methods clone a copy of the current class to allow derived classes to build themselves GitOrigin-RevId: cc6a5caa9e45652c5e230e0310889fa99fbfdbc7
* Fix return ranks for executors while using on-device sampling GitOrigin-RevId: 3268b18eae910ad9a4d2e1aafd4a55dacac8ff13
* [module] Allow ties between more than 2 parameters GitOrigin-RevId: 5de3ebb808ef09f2258fda0e575155592d9da658
* [gqa] Reduce GQA replication amount under certain conditions GitOrigin-RevId: a0772a0d8cb6d755e01acb3d0e7d1b84de3eccfb
* [Release 2.18] Change version to 0.10.x GitOrigin-RevId: 66dd5f983669e3c45d90680289987f4fb44a92ce
* [module] Allow safetensors checkpoint downloads to be explicitly disabled GitOrigin-RevId: 7a496d2d13bcaed3481b59e1c332332c8dc9404a
* Expose GenerationConfig as a top level construct GitOrigin-RevId: 7ab8740ba4efd253b4b99905616b8d2d8892d607
* [decoder] Updated HLO builder 'inputs' function to only handle parameters. Moved mask creation to 'pre_layer' function GitOrigin-RevId: a6ff3f126b2fe1acaa8309fa63f6dd84fddbc6f4
* [decoder] Factored HLO graph build into more discrete functions GitOrigin-RevId: 755d790cfa127a7d4d4481f289bba8e9bf11146a
* [sampling] Removed sampling from architecture-specific classes GitOrigin-RevId: 217f6a98ceb2cdba419f0523b3ab2bbba389e5ac
* [llama/speculation] 2d kv-cache id update for speculative_forward GitOrigin-RevId: d25ac42ef25540598d04e6fa95afae643d35d1a7
* [hlo] Added primitive broadcasting. Added new operators.
  Added on-device speculative token selection GitOrigin-RevId: b4b239d015e30f9b35ad65a57eaceef672502214
* [hlo] Added tensor parallel softmax implementation GitOrigin-RevId: 42746ebd9c869cbec78e9b8b10ed7a4a38665680
* [generation] Removed sequence length restriction for on-device sampling GitOrigin-RevId: 49e6d01840a5d90689c1a323f232005c8c686e1f
* [hlo] Simplified speculative token selection API GitOrigin-RevId: be99bb4f54f3499f27410842bf32eb75dd6bc729
* [hlo] Fixed speculative token selector batched broadcast issue GitOrigin-RevId: 5fc9a2d82d9a6e2ccf6d9986aa0359b24f982a72
* [base] Make batch sizes for enable_speculative_decoder optional GitOrigin-RevId: b78200a717c44820b7b32f4d465dc74cf55772b6
* [decoder] Added a method to be able to retrieve all decoder parameters GitOrigin-RevId: 9954297014f8cab87931ffc376e4c28dbe16552b
* [program] Added a ParallelProgram model executor with a simplified/general interface GitOrigin-RevId: 337066258c251392c01be101b52c6c463107ea41
* LHS alignment for static batching (vectorize last_token_id) GitOrigin-RevId: 8a678f4b979dc59d40e92bb74de3cacde9859d16
* [hlo] legalize broadcast and batch norm lowerings GitOrigin-RevId: 761573b84de85088321a65f098b66c7a258cdb9a
* Revert "[llama] support qk norm" This reverts commit 9f395a3cecd822e83fe56653afe99b4930a22dd6. GitOrigin-RevId: 9dddbad3a8d87c4e3db92d5a597b7421ff0a9de0
* [hlo] Allow speculative token selection to pad with a specified token GitOrigin-RevId: dc4f9669d5e4b142043a4515ebab53aa6d217f33
* [snapshot] Add tag as iteration suffix. Fixed Executor snapshot iteration GitOrigin-RevId: b221a1f6d444916ed0a1f1d4f74a63fe437eb873
* [GQA] Reduce replication amount under certain conditions GitOrigin-RevId: 6eb3504e1e0c74f184f60cd1988b5f016e257967
* [debugger] Re-enable debugger. GitOrigin-RevId: ba251cd21ef4fd9a9f54d3060c7339e565fbf493
* [hlo] Added sort_with_indices & argsort GitOrigin-RevId: 59e37888ac4b2206ee4a3d00bac18a2bbb421c75
* [program] Added bucketed program and selector classes GitOrigin-RevId: 2af53bd710dead8690fb28fbad9683b06dbaee26
* [hlo] Update argmax to use built in hlo ops; update reshape to handle scalar input GitOrigin-RevId: 0c30fbec405d7bb713b9508e818297d7a9b3de27
* [generation] Added Top-P on-device sampling GitOrigin-RevId: 961d6d91496cbe574a3653d91ab6b43a0142219d
* [hlo][topk] use built in hlo functions GitOrigin-RevId: 4e494643c36ec8949e32f086f4af5fccfd31f667
* Fix overwriting of cache_ids causing accuracy issues GitOrigin-RevId: e1ad8b5da91b3a7455060aa8dbf582c113a9142b
* [hlo] Add support for custom activations in MLP GitOrigin-RevId: 7fe42367042aa68783ae3dd7d703693cfe9766c5
* Adapt recent NKI changes into NKI interface and add NKI flash attention kernel.
  SIM: https://sim.amazon.com/issues/NAPP-1895 GitOrigin-RevId: d95005cd4809e46b6774c5bc76af21dacf22259b
* remove unused line sim: https://sim.amazon.com/issues/NAPP-1895 GitOrigin-RevId: 9375aa04bf55f6d5dac4ba7f51ab8f331f93b741
* change bf16 dtype to nl.bfloat16 instead of |V2 sim: https://sim.amazon.com/issues/NAPP-1895 GitOrigin-RevId: 73c6a276eee8fa496cc7e0f247dccee1fdb57997
* change bf16 dtype to nl.bfloat16 instead of |V2 sim: https://sim.amazon.com/issues/NAPP-1895 GitOrigin-RevId: 61b5b20dc8efd0fa14bd0254021e0a806ca48dd6
* removed unnecessary flash attention kernel sim: https://sim.amazon.com/issues/NAPP-1895 GitOrigin-RevId: fc0cb2da6119b46cf35c082d3dd777e224f9a10d
* adding the wrapper sim: https://sim.amazon.com/issues/NAPP-1895 GitOrigin-RevId: e2139891727481d9684cdb4eb5ec048662633848
* Added flash attention kernel wrapper sim: https://sim.amazon.com/issues/NAPP-1895 GitOrigin-RevId: b084bcb04a6b912e2bc179f1347dfe1d609bbe44
* remove verification file sim: https://sim.amazon.com/issues/NAPP-1895 GitOrigin-RevId: 9bd2759aa5ac12090f4477678d1f2eaedccaa7e9
* [hlo] Add error handling for non-callable activation functions GitOrigin-RevId: 46fb6d7ff93dd49ccf6d5d952767f9237366154a
* [hlo] Updated speculative token selection softmax to use sharded compute GitOrigin-RevId: c74bd613988ea9373d1500461ed4c393c724d726
* [Generation] Add quantization and flash attention args GitOrigin-RevId: ee29661a5a0ed504e1b24057eb87bbba53b9c68c
* [hlo] Add ability to return token selection mask from speculative_token_selection GitOrigin-RevId: 3882aad7066bc954d08e610469428fcccde1829a
* Fix CB logit postprocessing for full batches GitOrigin-RevId: 676a12cc8ea0056a3fade4636bb50aefccf0eec6
* [base] Fixed off-by-one error in window context encoding and make unroll factor optional GitOrigin-RevId: 24850985ce6cae43972c3092ab2dbc3764b62b49
* add missing debug_tensors param GitOrigin-RevId: 585647b7f8a4f47952eeb625b0c4c1692ce3c51c
* [continuous batching] Add multi-bucketing support to continuous batching GitOrigin-RevId: ebc57784b3d9400923d11551a1c01b95260d0c94
* Revert "Fix CB logit postprocessing for full batches" This reverts commit 676a12cc8ea0056a3fade4636bb50aefccf0eec6. GitOrigin-RevId: b9a20bb2053e5b9067cad3310ece9849526acf49
* Updated BSH/HSB attention layouts to use unified logic GitOrigin-RevId: 03bdaf947c48ebf4b216eb7183d6cc388e010f10
* flash attention integration GitOrigin-RevId: fe46b47151825c9c8783adde96e2330aab9a9783
* [decoder/utils] Factored out QKV padding calculation to common utility GitOrigin-RevId: e3da5c6bfc2c7a2adbb26abedf129ff92ac7d14b
* [config] Added type annotations to NeuronConfig, added deprecation warnings for old arguments, and exposed top-level configurations GitOrigin-RevId: 2a64c500d43994bb5a8dbeb5f4d7a9b20b74e38b
* [attention] Fixed B & S transpose issue in BSH attention output GitOrigin-RevId: 57c27b49d91b965c26436f69e8ad2c88fc14a099
* [sos] refactor flash decoding related code Author: karthick gopalswamy <[email protected]> Date: 2024-04-05T01:22:10.000Z GitOrigin-RevId: 3630b8ac60247ff01d820b47b94bb3b33db2704a
* [Do Not Merge] Flash decoding changes GitOrigin-RevId: 2f5a475e7e9d91b96ea3c9d7af94617aa8b143c3
* [Do Not Merge] Flash Decoding mask and cache modification GitOrigin-RevId: c5af182cda9aa7e012e19a36527e4de6df2241cf
* [sos] sequence parallel attention GitOrigin-RevId: 93d1e82027d5896398043d97a42a99cbe5497e2e
* Revert "[Do Not Merge] Flash Decoding mask and cache modification" This reverts commit 3b59fbda15971bd690e357d412f91f121f2b9097.
  GitOrigin-RevId: e9adb2dfe13738de009d81b35124cbe635ff9768
* Revert "[Do Not Merge] Flash decoding changes" This reverts commit 8b0973a6d7540b0715d640365b02628572f3ef4d. GitOrigin-RevId: c1ffbc49f80a54b2e8382aca6270cdd416fa6ec6
* [sos] refactor flash decoding related code GitOrigin-RevId: 853274b5619bbf267b44231c15f0b2e958d11d16
* [hlo] Add standalone definition for top-p GitOrigin-RevId: 7c68c568ee3866a245f587aecd3bc91c11c18622
* [llama] Fixed accuracy issue with speculation GitOrigin-RevId: 93214daabb17d4422dc85a72a95c429ab59a40c0
* [hlo] Add masked top-k implementation GitOrigin-RevId: bb8e127fd115eda0032ead1cd3dc657337c8cda1
* [rotary] Updated functions to use hlo.py operator definitions GitOrigin-RevId: a82e7a3c6cc2132ac529df7fca42d014c5da1f3a
* [generation] Enable dynamic static shape sampling in generation.py GitOrigin-RevId: e0098078d6c8ab4f6c6b7e869e15f9ab0571350c
* add reduce-scatter to hlo GitOrigin-RevId: 04dc756760c9d4e1952ef05f3b79c735390dcdad
* Fix breaking change in flash attention kernel interface GitOrigin-RevId: 745f460b0866c4fca84e860f5107011e9ff5d2d5
* Add streamer support during on device sampling GitOrigin-RevId: 64a68d3cb951de326df37221f7490ec8ba370192
* have GPT2ForSamplingWithContextBroadcasting sample accept a streamer in mainline GitOrigin-RevId: 56582d463a5d91a14399e04406bd673d45c7916c
* fix continuous batching support for llama and mistral GitOrigin-RevId: e490fc0bf033948f57ac3f1bbdeddf3070b5bf8c
* bloom, gptj, gptneox, opt accept streamer in sample GitOrigin-RevId: f9a4eeb4b78ff7ea684f056b8b5229ffa0bccdba
* commit: c9b19fbb411f9ea0efa9eebd3f62477122752f97 Author: Joshua Hannan <[email protected]> Date: 2024-04-17T02:51:14.000Z [generation] Add the ability to do dynamic on device generation GitOrigin-RevId: 95070848da3c80b459443d743af070a05ba573b5
* [generation][Mixtral/OPT] Update Mixtral and OPT to support on device generation GitOrigin-RevId: 8ea893cd895c39340045df707f3475f4f138e15e
* [generation] Handle non-fp32 inputs for temperature GitOrigin-RevId: 7a02696de81f03f43af7ea1044c97b368eccd4fb
* [compiler] Updated shape checking to be performed on provided inputs GitOrigin-RevId: 6bb6d357418a7119bea628c8b20100c0ba1ac7af
* [config] Added equality comparison function to GenerationConfig GitOrigin-RevId: fe26c7603e5b4704239f2ed3ddc67ca25a4382d0
* [hlo] Update top-p to not perform reverse ops GitOrigin-RevId: f4783bfe7f54a60c17495d3e8bd843c1755076c9
* [hlo] Added fast implementation of cumsum operation GitOrigin-RevId: 9de129e3cb99dd8666b999e4164c3d86a47b6e02
* [hlo] Updated fast cumulative sum to always cast inputs GitOrigin-RevId: 77abc934e56ba22dc4d4861b3d1caac8ab11d0c1
* Optimize QKV padding for certain GQA models GitOrigin-RevId: a5527ab6dac38ab8fa217462f8b9b669360055b6
* [hlo] Added fast cumulative sum floating point check GitOrigin-RevId: 3b7a87101d5af41770971383726e6fc48fc323d6
* [generation] Transpose logits prior to generation GitOrigin-RevId: 806dfa933cf5ae88a6017e9ab2ef613cd3740f5e
* [hlo] Added decomposed speculation functions GitOrigin-RevId: 752dd8d1fa5afaf215afce18ee032ad2f79f374b
* [generation] Add the ability to specify a global top-k GitOrigin-RevId: fe9606d00bb58b3265096cb03870f32a451eb7ac
* [llama/speculation] Bug fixes: speculative_forward with continuous batching GitOrigin-RevId: 702779e17d910afc3c150b3b5d329afb2357b348
* Fix mistral sliding window bug GitOrigin-RevId: e4bdc2b6027442b14bb954af8c93d1e1710a4d47
* [sos] flash decoding changes for llama model GitOrigin-RevId: a17189cf52ebba193f047a282f742c214d171559
* Optimize logit ordering
  for continuous batching GitOrigin-RevId: 5198b8a747bd0370937bf317e6925582bff9be84
* Removing old kernels due to API change SIM: https://sim.amazon.com/issues/NAPP-2811 GitOrigin-RevId: 4ae166fcb44c48ccfa4230a7b10271f522198052
* Fix pipeline by adding streamer to GPT2ForSampling GitOrigin-RevId: 0edc3242b7cef42600028e637f9ce3f8576cb581
* Fix mistral bug for multiple sampling runs GitOrigin-RevId: a70267456db853ad594e6c8038b3df65d75e0ee1
* Enabling custom RMSNorm across the board GitOrigin-RevId: f8bb611ce3350a41457c11457b0c0f2ca3141108
* Revert "Enabling custom RMSNorm across the board" This reverts commit f8bb611ce3350a41457c11457b0c0f2ca3141108. GitOrigin-RevId: 34296a5be7bb73f0c3a8c993e2047cb27135d2ea
* [Speculation] Set cache_ids if None GitOrigin-RevId: dc6422538c712b559f707c428e0d40a3b383e5ed
* flash attention layout shape fix GitOrigin-RevId: 077993869b7fa69fa58c21d1181e00728d484098
* BIR flash attention kernel + GQA broadcasting GitOrigin-RevId: 99cd0ed62100b84b2246bb76e5f19e9265f5dedc
* [debug] Enable returning all logits during context encoding GitOrigin-RevId: 0555a82a0d908bfb8f35bff965ed802b1f3e85ed
* remove start_ids=None from generate() GitOrigin-RevId: 0cca9dee080a3576c9e1c2d56391fde0a5fcbe47
* Remove internal code name GitOrigin-RevId: d33b0d642bcdc752486495b735c3c78141d9dad2
* switch back to gelu_new_legacy GitOrigin-RevId: a17892ccab44fc81655f587c7d1e42f5adaad97e
* Use collectives_layout=BSH as default and add deprecation warning GitOrigin-RevId: 6ea9f56919adcca9a87937b72981ede68c63a6ae
* Rufus 2.19 Cherry picks GitOrigin-RevId: c829df72d1f3ea77fdd484481bd39bf464cfe229
* Revert "Use collectives_layout=BSH as default and add deprecation warning" This reverts commit 33b62dc57a682ed64fa58ab7df9d9b90fef6d163. GitOrigin-RevId: bbd4c5aeb7a39146788917a3f706706b21f93ce5
* Do not use flash attention for batch > 1 left padded GitOrigin-RevId: 2855c3be1efe3d223c11beac1395447e711166ba
* Cherry-pick on-device embedding changes: [generation] Fix on-device generation without filtering GitOrigin-RevId: d2dfddb3fe6c38b2b25817990d9360d0c056d8c0
* Enabling custom RMSNorm across the board GitOrigin-RevId: 13059d1660d367712346b3ba0ab682c7826a2878

---------

Co-authored-by: Jonathan Lunt <[email protected]>
Co-authored-by: Bowen Chen <[email protected]>
Co-authored-by: Yuan Zhou <[email protected]>
Co-authored-by: Devesh Ratho <[email protected]>
Co-authored-by: Amer <[email protected]>
Co-authored-by: Mike Zhang <[email protected]>
Co-authored-by: Liangfu Chen <[email protected]>
Co-authored-by: Nicholas Waldron <[email protected]>
Co-authored-by: Shubham Chandak <[email protected]>
Co-authored-by: Wojciech Romaszkan <[email protected]>
Co-authored-by: Amulya Ballakur <[email protected]>
Co-authored-by: Dylan Geva <[email protected]>
Co-authored-by: Jeffrey Huynh <[email protected]>
Co-authored-by: Shashwat Srijan <[email protected]>
Co-authored-by: Prithvijit Chakrabarty <[email protected]>
Co-authored-by: Haichen Li <[email protected]>
Co-authored-by: Patrick Lange <[email protected]>
Co-authored-by: Hesam Ilati <[email protected]>
Co-authored-by: Tyler Osterberg <[email protected]>
Co-authored-by: karthick gopalswamy <[email protected]>
Co-authored-by: Faqin Zhong <[email protected]>
Co-authored-by: Yishan McNabb <[email protected]>
Co-authored-by: Akhil Raj Azhikodan <[email protected]>
Co-authored-by: yichi <[email protected]>
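For orientation, a minimal usage sketch of the NeuronAutoModelForCausalLM auto-class and Hugging Face Hub checkpoint loading introduced by this sync. It assumes the usual transformers-neuronx from_pretrained / to_neuron / sample flow; the checkpoint name, tp_degree, amp, and sampling settings are illustrative placeholders rather than values taken from this commit.

import torch
from transformers import AutoTokenizer
from transformers_neuronx import NeuronAutoModelForCausalLM

name = "meta-llama/Llama-2-7b-hf"              # placeholder checkpoint
model = NeuronAutoModelForCausalLM.from_pretrained(
    name, batch_size=1, tp_degree=2, amp="f16",  # illustrative compile-time settings
)
model.to_neuron()                               # compile and load weights onto NeuronCores

tokenizer = AutoTokenizer.from_pretrained(name)
input_ids = tokenizer("Hello, how are you?", return_tensors="pt").input_ids
with torch.inference_mode():
    generated = model.sample(input_ids, sequence_length=128, top_k=50)
print(tokenizer.decode(generated[0], skip_special_tokens=True))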
1 parent 0623de2 commit c8d6bdc

39 files changed: +2881 / -1780 lines changed

src/transformers_neuronx/__init__.py

Lines changed: 4 additions & 2 deletions
@@ -14,8 +14,10 @@
 # ==============================================================================
 from transformers_neuronx.version import __version__
 
-from transformers_neuronx.config import NeuronConfig, QuantizationConfig, ContinuousBatchingConfig
-from transformers_neuronx.constants import GQA
+
+from transformers_neuronx.constants import GQA, Layout
+from transformers_neuronx.sparse_attn_utils import SparseAttnConfig
+from transformers_neuronx.config import NeuronConfig, QuantizationConfig, ContinuousBatchingConfig, GenerationConfig
 from transformers_neuronx.generation_utils import HuggingFaceGenerationModelAdapter
 
 from transformers_neuronx.bloom.model import BloomForSampling

src/transformers_neuronx/activations.py

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@
 import math
 
 def gelu_new(hidden):
-    return hidden.dtype[hidden.sizes].CustomCall(hidden, custom_call_target="AwsNeuronGelu")
+    return hidden.dtype[hidden.sizes].CustomCall(hidden, custom_call_target="AwsNeuronGeluApprxTanh")
 
 def gelu_new_legacy(hidden):
     dtype = hidden.dtype
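The one-line change above swaps the GELU custom-call target from AwsNeuronGelu to AwsNeuronGeluApprxTanh. For reference, a small sketch of the standard tanh-approximation GELU that "gelu_new" conventionally denotes; whether the renamed device kernel matches this formula bit-for-bit is an assumption.

import math
import torch

def gelu_tanh_approx(x: torch.Tensor) -> torch.Tensor:
    # Standard "gelu_new" tanh approximation:
    # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))))

x = torch.linspace(-3, 3, 7)
print(gelu_tanh_approx(x))
print(torch.nn.functional.gelu(x, approximate="tanh"))  # reference; should match closely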

src/transformers_neuronx/base.py

Lines changed: 64 additions & 33 deletions
@@ -24,6 +24,7 @@
 from transformers_neuronx import module
 from transformers_neuronx.compiler import ParallelKernel
 from transformers_neuronx.constants import LAYOUT_BSH
+from transformers_neuronx.config import GenerationConfig
 from concurrent.futures import ProcessPoolExecutor
 
 
@@ -85,10 +86,12 @@ def enable_speculative_decoder(self, speculation_length: Optional[Union[List[int
                 self.decoder_lm_head_for_speculation[k, batch_size] = \
                     self.decoder_param_set.init_speculative_decoder(unroll=self.unroll, buckets=self.token_buckets, model_obj=self, n_active_tokens=k, batch_size=batch_size)
 
-    def enable_window_context_decoder(self, window_context_length:Optional[Union[List[int], int]], unroll):
+    def enable_window_context_decoder(self, window_context_length:Optional[Union[List[int], int]], unroll: Optional[int] = None):
         if isinstance(window_context_length, int):
             window_context_length=[window_context_length]
         self.window_context_buckets = bucket.context_sizes(window_context_length, self.token_buckets)
+        if unroll is None:
+            unroll = self.decoder_param_set.num_layers
         for k in self.window_context_buckets:
             self.decoder_lm_head_for_window_context[k]=self.decoder_param_set.init_window_context_decoder(unroll=unroll, buckets=self.token_buckets, model_obj=self, n_active_tokens=k)
 
@@ -172,6 +175,8 @@ def context(self, hidden, cache_ids, start_ids, last_token_id, *rest):
         context_length = hidden.shape[1]
         batch_size = start_ids.shape[0]
 
+        all_logits = []  # Collect all logits if neuron_config.output_all_logits is True
+
         if self.is_fid:
             # Fusion-In-Decoder context encoding
             fused_context_length = hidden.shape[1]
@@ -181,7 +186,6 @@
 
         estimate = bucket.find(self.context_buckets, context_length)
 
-
         if estimate is not None:
             hidden_context = hidden
             cache_context = cache_ids
@@ -208,11 +212,11 @@
                 logits, scores = model(hidden_context, cache_context, start_ids, last_token_id, *rest)
             else:
                 logits = model(hidden_context, cache_context, start_ids, last_token_id, *rest)
-
-
+            if self.neuron_config.output_all_logits:
+                all_logits.append(logits[:, :last_token_id + 1, :])
 
         # process the leftovers context
-        while current < context_length - 1:
+        while current < context_length:
             # find the optimal "window"
             estimate = None
             if hasattr(self, "window_context_buckets"):
@@ -225,18 +229,25 @@
                 cache_ids = torch.as_tensor([i], dtype=torch.int32)
                 hidden_slice = hidden[:, i:i+1].contiguous()
                 logits = self.decoder_lm_head(hidden_slice, cache_ids, start_ids, last_token_id, *rest)
+                if self.neuron_config.output_all_logits:
+                    all_logits.append(logits)
                 break
 
             hidden_slice = hidden[:, current:current+estimate].contiguous()
             cache_ids = torch.as_tensor([i for i in range(current, current+estimate)], dtype=torch.int32)
-            last_token_id = torch.as_tensor(estimate - 1)
+            last_token_id = torch.as_tensor([estimate - 1])
             if self.neuron_config.log_softmax_scores:
                 logits, scores = self.decoder_lm_head_for_window_context[estimate](hidden_slice, cache_ids, start_ids, last_token_id, *rest)
             else:
                 logits = self.decoder_lm_head_for_window_context[estimate](hidden_slice, cache_ids, start_ids, last_token_id, *rest)
+            if self.neuron_config.output_all_logits:
+                all_logits.append(logits)
 
             current += estimate
 
+        if all_logits:
+            logits = torch.cat(all_logits, dim=1)
+
         if self.is_fid:
             logits[:] = float('-inf')
             logits[self.bos_token_id] = 1.0
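When neuron_config.output_all_logits is enabled, the loop above appends each window's logits and concatenates them along the sequence axis once the leftover context is consumed. A toy, stand-alone illustration of that bookkeeping, with random tensors standing in for decoder outputs:

import torch

vocab, batch = 32, 1
context_length, window = 10, 4
all_logits = []
current = 0
while current < context_length:
    estimate = min(window, context_length - current)      # window chosen per iteration
    window_logits = torch.randn(batch, estimate, vocab)   # stand-in for decoder output
    all_logits.append(window_logits)
    current += estimate

logits = torch.cat(all_logits, dim=1)
print(logits.shape)  # torch.Size([1, 10, 32]): one logit row per input position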
@@ -266,12 +277,16 @@ def _prepare_for_par_ctx_rhs_padding(self, input_ids, cache_ids):
         if self.neuron_config.vectorize_last_token_id:
             last_token_id = torch.zeros(batch_size, dtype=torch.int32)
         else:
-            last_token_id = torch.as_tensor(0, dtype=torch.int32)
+            last_token_id = torch.as_tensor([0], dtype=torch.int32)
         if context_length == 1:
             return input_ids, cache_ids, last_token_id
 
         # TODO: check context_buckets for compatibility with OPT
-        if hasattr(self, "context_buckets"):
+        if cache_ids is not None and cache_ids.flatten()[0].item() > 0:
+            # speculative forward: n_active_tokens > 1 and cache_ids start from position > 0
+            speculation_buckets = list(set([k for k, batch_size in self.decoder_lm_head_for_speculation.keys()]))
+            estimate = bucket.find(speculation_buckets, context_length)
+        elif hasattr(self, "context_buckets"):
             estimate = bucket.find(self.context_buckets, context_length)
         else:
             estimate = self.context_length_estimate
@@ -281,7 +296,7 @@ def _prepare_for_par_ctx_rhs_padding(self, input_ids, cache_ids):
         if self.neuron_config.vectorize_last_token_id:
             last_token_id = cache_ids.max(dim=1).values
         else:
-            last_token_id = torch.as_tensor(min(context_length - 1, estimate-1), dtype=torch.int32)
+            last_token_id = torch.as_tensor([min(context_length - 1, estimate-1)], dtype=torch.int32)
         if context_length < estimate:
             input_ids = utils.pad(input_ids, 1, estimate, left=False)
             cache_ids = self._pad_cache_ids(cache_ids, batch_size, context_length, estimate)
@@ -291,11 +306,15 @@ def _prepare_for_par_ctx_rhs_padding(self, input_ids, cache_ids):
     def _pad_cache_ids(self, cache_ids, batch_size, context_length, estimate):
         if self.neuron_config.use_2d_cache_ids:
             # TODO: fix cache_ids padding for batch speculative decoding
-            cache_ids = torch.arange(estimate, dtype=torch.long)
+            # for now, use cache_ids without change for speculative_forward
+            is_speculative_forward = cache_ids.flatten()[0].item() > 0
+            if is_speculative_forward:
+                return cache_ids
+            cache_ids = torch.arange(estimate, dtype=torch.int32)
             cache_ids = cache_ids.unsqueeze(0).expand(batch_size, estimate)
         else:
             if cache_ids is None:
-                cache_ids = torch.arange(estimate, dtype=torch.long)
+                cache_ids = torch.arange(estimate, dtype=torch.int32)
             else:
                 # Inputs: cache_ids = [16, 17], estimate = 512
                 #
@@ -306,9 +325,9 @@ def _pad_cache_ids(self, cache_ids, batch_size, context_length, estimate):
                 # cache_ids = [16, 17, 18, 19, ..., 511, 511, 511, ..., 511, 511, 511]
                 start_idx = cache_ids[-1].item() + 1
                 end_idx = estimate + start_idx - context_length
-                pad_elements = torch.arange(start_idx, end_idx, dtype=torch.long)
+                pad_elements = torch.arange(start_idx, end_idx, dtype=torch.int32)
                 cache_ids_pad = torch.concat([cache_ids, pad_elements], dim=0)
-                cache_ids = torch.minimum(cache_ids_pad, torch.tensor(estimate-1, dtype=torch.long))
+                cache_ids = torch.minimum(cache_ids_pad, torch.tensor(estimate-1, dtype=torch.int32))
         return cache_ids
 
     def _prepare_for_continuous_batching(self, input_ids, cache_ids=None, seq_ids=None):
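The worked comment above (cache_ids = [16, 17], estimate = 512) describes the 1D padding path. A stand-alone sketch of the same arange/concat/minimum sequence, so the resulting pattern can be printed and inspected; clamping to estimate - 1 keeps the padded positions from running past the last cache slot.

import torch

def pad_cache_ids_1d(cache_ids: torch.Tensor, context_length: int, estimate: int) -> torch.Tensor:
    # Mirrors the non-2D branch above: extend cache_ids up to `estimate` entries,
    # then clamp everything to estimate - 1 so padding writes land on the final slot.
    start_idx = cache_ids[-1].item() + 1
    end_idx = estimate + start_idx - context_length
    pad_elements = torch.arange(start_idx, end_idx, dtype=torch.int32)
    cache_ids_pad = torch.concat([cache_ids, pad_elements], dim=0)
    return torch.minimum(cache_ids_pad, torch.tensor(estimate - 1, dtype=torch.int32))

cache_ids = torch.tensor([16, 17], dtype=torch.int32)
padded = pad_cache_ids_1d(cache_ids, context_length=2, estimate=512)
print(padded[:4], padded[-4:])  # [16, 17, 18, 19] ... [511, 511, 511, 511]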
@@ -350,20 +369,24 @@ def _prepare_for_continuous_batching(self, input_ids, cache_ids=None, seq_ids=No
                 cache_ids = cache_ids.unsqueeze(0)
             assert cache_ids.shape[0] == n_active_seqs, \
                 f"invalid n_active_seqs ({n_active_seqs} vs {cache_ids.shape[0]}) in speculative forward"
-            cache_ids_pad = torch.zeros(n_active_seqs, speculative_n_positions, dtype=cache_ids.dtype, device='cpu')
+            # pad cache IDs with max(n_positions) - 1
+            # unlike context encoding, padding with 0
+            # during speculative_forward will contaminate kv-cache history
+            cache_ids_pad = torch.full((n_active_seqs, speculation_bucket), max(self.context_buckets) - 1, dtype=cache_ids.dtype, device="cpu")
             for seq_id in range(n_active_seqs):
                 cache_ids_pad[seq_id, :n_active_tokens] = cache_ids[seq_id, :n_active_tokens]
             return input_ids, cache_ids_pad, seq_ids
 
         # token generation
-        full_input_ids = torch.zeros(batch_size, 1, dtype=input_ids.dtype, device="cpu")
-        full_cache_ids = torch.zeros(batch_size, 1, dtype=cache_ids.dtype, device="cpu")
+        full_input_ids = torch.zeros(batch_size, 1, dtype=torch.int32)
+        full_cache_ids = torch.zeros(batch_size, 1, dtype=torch.int32)
+        full_seq_ids = torch.arange(batch_size, dtype=torch.int32)
        for idx, seq_id in enumerate(seq_ids.flatten()):
             seq_id = seq_id.item()
             full_input_ids[seq_id, :] = input_ids[idx, :]
             full_cache_ids[seq_id, :] = cache_ids[idx, :]
 
-        return full_input_ids, full_cache_ids, seq_ids
+        return full_input_ids, full_cache_ids, full_seq_ids
 
     def _preprocess(self, input_ids, start_ids=None, cache_ids=None):
         # enable dynamic batch size feature for continuous batching
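The token-generation branch above scatters each incoming row into the dense batch slot given by its sequence ID and now also returns a dense int32 arange of sequence IDs. A minimal stand-alone illustration of that scatter:

import torch

def scatter_by_seq_id(input_ids, cache_ids, seq_ids, batch_size):
    # Place each incoming row at the slot named by its sequence ID, as in the
    # token-generation branch of _prepare_for_continuous_batching above.
    full_input_ids = torch.zeros(batch_size, 1, dtype=torch.int32)
    full_cache_ids = torch.zeros(batch_size, 1, dtype=torch.int32)
    full_seq_ids = torch.arange(batch_size, dtype=torch.int32)
    for idx, seq_id in enumerate(seq_ids.flatten()):
        seq_id = seq_id.item()
        full_input_ids[seq_id, :] = input_ids[idx, :]
        full_cache_ids[seq_id, :] = cache_ids[idx, :]
    return full_input_ids, full_cache_ids, full_seq_ids

# Two active sequences (IDs 2 and 0) inside a running batch of 4:
inp = torch.tensor([[11], [22]], dtype=torch.int32)
cid = torch.tensor([[5], [9]], dtype=torch.int32)
sid = torch.tensor([2, 0], dtype=torch.int32)
print(scatter_by_seq_id(inp, cid, sid, batch_size=4))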
@@ -389,7 +412,8 @@ def _preprocess(self, input_ids, start_ids=None, cache_ids=None):
         return input_ids, cache_ids, start_ids, last_token_id
 
     def _postprocess(self, logits, start_ids=None):
-        if start_ids is None:
+
+        if start_ids is None or (self.neuron_config.output_all_logits and logits.shape[1] > 1):
             return logits
 
         running_batch_size, n_embed = logits.shape
@@ -400,24 +424,25 @@
             return logits
 
         # token generation (aka decoding)
-        seq_ids = start_ids.flatten().tolist()
-        assert input_batch_size == len(seq_ids), f"expected seq_ids to be {input_batch_size} in length, but seq_ids={seq_ids}"
-        new_logits = torch.zeros(input_batch_size, n_embed, dtype=logits.dtype, device=logits.device)
-        for idx, seq_id in enumerate(seq_ids):
-            new_logits[idx, :] = logits[seq_id, :]
+        seq_ids = start_ids.flatten()
+        if torch.equal(seq_ids, torch.arange(input_batch_size)):
+            logits = logits[:input_batch_size]
+        else:
+            logits = logits[seq_ids]
 
-        return new_logits
+        return logits
 
     def _cast_logits(self, logits):
         # Cast logits to float32 or the dtype specified in the neuron config
         logits_dtype = torch.float32
         if self.neuron_config:
-            logits_dtype = getattr(torch, self.neuron_config.cast_logits_dtype)
+            if self.neuron_config.cast_logits_dtype is not None:
+                logits_dtype = getattr(torch, self.neuron_config.cast_logits_dtype)
         return logits.to(logits_dtype)
 
     def _context_dynamic_batching(self, hidden, *args):
         is_bsh = self.neuron_config and self.neuron_config.attention_layout == LAYOUT_BSH
-        input_batch_size = hidden.shape[0] if is_bsh else hidden.shape[2]
+        input_batch_size = hidden.shape[0] if is_bsh or self.neuron_config.on_device_embedding else hidden.shape[2]
         assert hasattr(self, "context_batch_sizes"), f"{type(self)} doesn't support dynamic batching."
 
         running_batch_size = self.context_batch_sizes[-1]
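The rewritten _postprocess above replaces the per-row copy loop with a single index-select on the flattened seq_ids (with a fast path when they already equal 0..N-1). A quick stand-alone check that the two are equivalent:

import torch

running_batch_size, n_embed = 4, 8
logits = torch.randn(running_batch_size, n_embed)
start_ids = torch.tensor([2, 0, 3])          # three live sequences, out of order
input_batch_size = start_ids.shape[0]

# old behaviour: explicit copy loop
seq_ids = start_ids.flatten().tolist()
old = torch.zeros(input_batch_size, n_embed, dtype=logits.dtype)
for idx, seq_id in enumerate(seq_ids):
    old[idx, :] = logits[seq_id, :]

# new behaviour: direct indexing
new = logits[start_ids.flatten()]

print(torch.equal(old, new))  # True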
@@ -428,20 +453,19 @@
             all_logits = []
             cache_ids, start_ids, last_token_id = args[0], args[1], args[2]
             for iter_id in range(n_iters):
-                # Assuming HSB layout
                 start_idx = iter_id*running_batch_size
                 end_idx = (iter_id+1)*running_batch_size
-                if is_bsh:
-                    hidden_per_batch = hidden[start_idx:end_idx, :, :]
+                if is_bsh or self.neuron_config.on_device_embedding:
+                    hidden_per_batch = hidden[start_idx:end_idx, ...]
                 else:
-                    hidden_per_batch = hidden[:, :, start_idx:end_idx]
+                    hidden_per_batch = hidden[..., start_idx:end_idx]
                 cache_ids_per_batch = cache_ids[start_idx:end_idx, :]
                 start_ids_per_batch = start_ids[start_idx:end_idx]
                 last_token_id_per_batch = last_token_id[start_idx:end_idx]
                 logits_per_batch = self.context(hidden_per_batch, cache_ids_per_batch,
                                                 start_ids_per_batch, last_token_id_per_batch)
                 all_logits.append(logits_per_batch)
-            logits = torch.cat(all_logits, dim=2)
+            logits = torch.cat(all_logits, dim=-1)
         else:
             assert input_batch_size == running_batch_size, \
                 "input batch size ({input_batch_size}) not equal to running batch size ({running_batch_size})"
@@ -464,8 +488,11 @@ def _forward(self, hidden, *args):
             return logits
 
         logits = self._cast_logits(logits)
-        logits = logits[:self.config.vocab_size, -1, :]
-        logits = logits.transpose(0, 1)
+        if self.neuron_config.output_all_logits and context_length > 1:
+            logits = logits.permute(2, 1, 0)
+        else:
+            logits = logits[:self.config.vocab_size, -1, :]
+            logits = logits.transpose(0, 1)
         return logits
 
 
@@ -506,6 +533,10 @@ def profile(self, profile_dir, ntff_count_limit):
             if isinstance(kernel, ParallelKernel):
                 kernel.profile(profile_dir, ntff_count_limit)
 
+    def update_generation_config(self, generation_config: GenerationConfig):
+        self.decoder_lm_head.update_generation_config(generation_config)
+
+
 # Base class for all "Serializable Objects"
 class NeuronBaseSerializer:
 

src/transformers_neuronx/bloom/hlo.py

Lines changed: 9 additions & 19 deletions
@@ -14,23 +14,23 @@
 # ==============================================================================
 from transformers_neuronx import hlo
 from transformers_neuronx.constants import LAYOUT_BSH
-from transformers_neuronx.layers import transformer, alibi, generation
+from transformers_neuronx.layers import transformer, alibi, attention
 from transformers_neuronx.bloom.config import BloomConfig
 
 class BloomForSamplingNoEmbeddingHlo:
 
     def __init__(self, config: BloomConfig, neuron_config=None):
         self.config = config
         self.neuron_config = neuron_config
+        self.n_positions = None
 
-    def inputs(self, scribe, dtype, n_positions, n_active_tokens, batch_size):
-        hidden, cache_ids, start_ids, last_token_id, dims = transformer.inputs(
+    def inputs(self, scribe, dtype, n_active_tokens, batch_size):
+        tensors, dims = transformer.inputs(
             scribe, dtype, batch_size, n_active_tokens, self.config.hidden_size, self.neuron_config
         )
-        mask, active_mask = hlo.attention_mask(cache_ids, start_ids, n_positions)
-        return (hidden, last_token_id, cache_ids, mask, active_mask), dims
+        return tensors, dims
 
-    def embedding(self, input_ids, last_token_id, cache_ids, mask, active_mask, slopes, word_embeddings, ln_weight, ln_bias):
+    def embedding(self, input_ids, cache_ids, start_ids, last_token_id, slopes, word_embeddings, ln_weight, ln_bias):
         dtype = getattr(input_ids.scribe, self.config.amp)
         hidden = hlo.embedding(word_embeddings, input_ids, tp_degree=self.config.tp_degree, dtype=dtype)
         if self.config.hidden_size % self.config.tp_degree != 0:
@@ -41,8 +41,9 @@ def embedding(self, input_ids, last_token_id, cache_ids, mask, active_mask, slop
         return hlo.layer_norm_bsh(hidden, ln_weight, ln_bias) if is_bsh \
             else hlo.layer_norm(hidden, ln_weight, ln_bias)
 
-    def pre_layer(self, hidden, last_token_id, cache_ids, mask, active_mask, *pre_layer_weights):
+    def pre_layer(self, hidden, cache_ids, start_ids, last_token_id, *pre_layer_weights):
         slopes, *rest = pre_layer_weights
+        mask, active_mask = hlo.attention_mask(cache_ids, start_ids, self.n_positions)
         prior_alibi, active_alibi = alibi.alibi(slopes, mask, active_mask)
         return hidden, last_token_id, cache_ids, mask, active_mask, prior_alibi, active_alibi
 
@@ -87,14 +88,9 @@ def layer(self, hidden, last_token_id, cache_ids, mask, active_mask, prior_alibi
         hidden = dtype[hidden.sizes].Add(mlp_hidden, hidden)
         return hidden, out_attn_k_cache, out_attn_v_cache
 
-    def ln_lm_head(self, hidden, last_token_id, ln_f_weight, ln_f_bias, lm_head_weight, lm_head_bias, logits_indices, return_all_outputs=True):
+    def ln_lm_head(self, hidden, last_token_id, ln_f_weight, ln_f_bias, lm_head_weight, lm_head_bias, return_all_outputs=True):
         logits = transformer.ln_lm_head(self.config.tp_degree, hidden, last_token_id, ln_f_weight, ln_f_bias, lm_head_weight,
                                         lm_head_bias, return_all_outputs, neuron_config=self.neuron_config)
-        if self.neuron_config.on_device_generation is not None:
-            return generation.generate(logits, logits_indices,
-                                       config=self.neuron_config.on_device_generation,
-                                       tp_degree=self.config.tp_degree,
-                                       eos_token_id=self.config.eos_token_id)
         return logits
 
     def attention(self,
@@ -111,12 +107,6 @@ def attention(self,
         dtype = hidden.dtype
         d_head = self.config.hidden_size // self.config.n_head
 
-        is_bsh = neuron_config and neuron_config.attention_layout == LAYOUT_BSH
-        if is_bsh:
-            import transformers_neuronx.layers.attention as attention
-        else:
-            import transformers_neuronx.layers.attention_hsb as attention
-
         # Q = (hidden @ wQ) + bQ
         # K = (hidden @ wK) + bK
         # V = (hidden @ wV) + bV
