@@ -194,7 +194,7 @@ def add_token(request: LlmRequest,
               *,
               beam: int,
               step: int = 0) -> int:
-    seq_slot = request.seq_slot
+    seq_slot = request.py_seq_slot
     assert seq_slot is not None
     new_token = int(new_tokens[step, seq_slot, beam])
     request.add_new_token(new_token, beam)
@@ -285,14 +285,14 @@ def _handle_stop_criteria(self, request: LlmRequest,
 
     def handle_logits(self, request: LlmRequest, state: SampleState, *,
                       beam: int, count: int):
-        current_slice = slice(0, count), request.seq_slot, beam
+        current_slice = slice(0, count), request.py_seq_slot, beam
         if request.py_return_generation_logits:
             assert state.host.logits is not None
             current_logits = state.host.logits[current_slice]
             request.py_result.append_generation_logits(current_logits)
         if request.py_return_log_probs:
             assert state.host.log_probs is not None
-            log_probs = state.host.log_probs[request.seq_slot][beam][:count]
+            log_probs = state.host.log_probs[request.py_seq_slot][beam][:count]
             current_tokens = state.host.new_tokens[current_slice]
 
             token_log_probs = [{
@@ -406,7 +406,7 @@ def _process_requests(self,
         no_draft_tokens = len(requests) == sum_steps
         fast_path = not self.enable_mixed_sampler and no_draft_tokens and gen_logits_host is None and log_probs_host is None
 
-        seq_slots = torch.as_tensor([r.seq_slot for r in requests])
+        seq_slots = torch.as_tensor([r.py_seq_slot for r in requests])
         seq_slots = seq_slots.to(device="cuda", non_blocking=True)
 
         if fast_path:
@@ -616,9 +616,9 @@ def _update_cache_indirection_buffer(self,
         # Copy cache indirection output to input
         for request in scheduled_requests.generation_requests:
             self.store["decoder_state"].cache_indirection_input[
-                request.seq_slot].copy_(
+                request.py_seq_slot].copy_(
                     self.store["decoder_state"].cache_indirection_output[
-                        request.seq_slot],
+                        request.py_seq_slot],
                     non_blocking=True)
 
     @torch.inference_mode()
@@ -881,7 +881,7 @@ def update_requests_multiple_beams_or_drafting(self,
 
     def _finalize_request(self, request: LlmRequest, streaming: bool):
         """ Finalizes the request. This is necessary for beam search. """
-        seq_slot = request.seq_slot
+        seq_slot = request.py_seq_slot
         event = self.algs.decoder.finalize(self.store["decoder_state"],
                                            seq_slot, request.sampling_config,
                                            streaming)
@@ -893,7 +893,7 @@ def _post_process_request(self, request: LlmRequest,
             request: LlmRequest which shall be post processed
             finalize_event: CudaEvent to wait for the finalize step to finish
         """
-        seq_slot = request.seq_slot
+        seq_slot = request.py_seq_slot
         beam_width = request.sampling_config.beam_width
         # synchronize on the finalize event before continuing the post processing.
         finalize_event.synchronize()