@@ -846,8 +846,8 @@ def _get_padded_batch(
             spec_resource_manager: Optional[BaseResourceManager] = None) -> int:
         can_run_cuda_graph = scheduled_requests.can_run_cuda_graph
         batch_size = scheduled_requests.batch_size
-        # The number of sequences in the batch is the number of prompts times the beam width.
-        new_batch_size = batch_size * self.max_beam_width
+        new_batch_size = batch_size
+
         if self._run_cuda_graphs and self.enable_attention_dp and self.mapping.tp_size > 1:
             graph_batch_size = self.dist.tp_allgather(
                 [can_run_cuda_graph, batch_size])
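For context, this first hunk stops pre-multiplying the padded batch size by the beam width: padding now counts requests only, and the beam width is applied inside the graph runner instead. Below is a minimal sketch of that selection logic; the names `cuda_graph_batch_sizes` and `max_batch_size` are assumptions for illustration, not the engine's exact attributes.

# Minimal sketch of padded-batch selection after this change.
from bisect import bisect_left
from typing import Optional

def get_padded_batch_size(batch_size: int,
                          cuda_graph_batch_sizes: list[int],
                          max_batch_size: int) -> Optional[int]:
    # `batch_size` counts requests only; beam width is no longer
    # multiplied in here, since the graph runner handles it internally.
    if batch_size > max_batch_size:
        return None
    idx = bisect_left(cuda_graph_batch_sizes, batch_size)
    return cuda_graph_batch_sizes[idx] if idx < len(cuda_graph_batch_sizes) else None

# With graphs captured for sizes [1, 2, 4, 8], three requests pad to 4
# regardless of the configured beam width.
assert get_padded_batch_size(3, [1, 2, 4, 8], 8) == 4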
@@ -981,8 +981,8 @@ def _maybe_get_cuda_graph(
         self._cuda_graphs[batch_size] = {}
 
         self._cuda_graphs[batch_size][draft_len] = DecodingCUDAGraphRunner(
-            num_sequences_in_batch, "cuda", attn_metadata, spec_metadata,
-            self.use_mrope)
+            batch_size, "cuda", attn_metadata, spec_metadata, self.use_mrope,
+            self.max_beam_width)
         return self._cuda_graphs[batch_size][draft_len]
 
     def __del__(self) -> None:
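The runner is now handed `batch_size` plus `self.max_beam_width` rather than a pre-multiplied `num_sequences_in_batch`. A hedged sketch of how such a runner might derive its static buffer sizes internally; this is not the real `DecodingCUDAGraphRunner`, whose constructor and buffers may differ.

# Sketch: a graph runner that derives the sequence count itself.
import torch

class GraphRunnerSketch:

    def __init__(self, batch_size: int, device: str, max_beam_width: int = 1):
        # One row per (request, beam) pair: the caller no longer has to
        # pre-multiply, so a beam-width change only touches the runner.
        num_sequences = batch_size * max_beam_width
        self.input_ids = torch.zeros(num_sequences, dtype=torch.long, device=device)
        self.position_ids = torch.zeros(1, num_sequences, dtype=torch.long, device=device)

runner = GraphRunnerSketch(batch_size=4, device="cpu", max_beam_width=2)
assert runner.input_ids.shape[0] == 8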
@@ -1376,8 +1376,11 @@ def _prepare_tp_inputs(
                 gather_ids.append(len(position_ids) - 1)
 
             request_ids.append(request.py_request_id)
-            gen_request_seq_slots.append(request.py_seq_slot)
             request.py_batch_idx = request.py_seq_slot
+            # Do not add a gen_request_seq_slot for CUDA graph dummy requests
+            # to prevent access errors due to None values
+            if not request.is_cuda_graph_dummy:
+                gen_request_seq_slots.append(request.py_seq_slot)
 
         previous_batch_len = len(previous_batch_indices)
 
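The guard added above skips CUDA graph dummy requests, whose `py_seq_slot` is `None` and would otherwise flow into tensor construction later. A self-contained illustration, with a stand-in `Request` dataclass in place of the engine's request class, reduced to the two fields the guard reads.

# Illustration of the dummy-request guard with a stand-in Request type.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Request:
    py_seq_slot: Optional[int]
    is_cuda_graph_dummy: bool = False

def collect_seq_slots(requests: list["Request"]) -> list[int]:
    # Dummy requests carry py_seq_slot = None; appending them would fail
    # later when the slot list is turned into an index tensor.
    return [r.py_seq_slot for r in requests if not r.is_cuda_graph_dummy]

reqs = [Request(0), Request(None, is_cuda_graph_dummy=True), Request(2)]
assert collect_seq_slots(reqs) == [0, 2]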
@@ -1506,7 +1509,7 @@ def previous_seq_slots_device():
             pin_memory=True,
         )
 
-        num_generation_requests = len(scheduled_requests.generation_requests)
+        num_generation_requests = len(gen_request_seq_slots)
         # Cache indirection is only used for beam search on generation requests
         if self.use_beam_search and num_generation_requests > 0:
             # CUDA Graph needs to set beam width during warmup (where the graph is captured), to ensure that cache indirection buffer is correctly picked up by the CUDA graph
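Since dummy padding requests no longer contribute a slot, counting `gen_request_seq_slots` keeps the cache-indirection gather aligned with real generation requests. A small sketch of that gather follows; the `[max_slots, beam_width, max_tokens]` buffer layout is an assumption for illustration only.

# Sketch of the beam-search cache-indirection gather with the corrected count.
import torch

def gather_cache_indirection(cache_indirection: torch.Tensor,
                             gen_request_seq_slots: list[int]) -> torch.Tensor:
    # Index only the slots of real generation requests; dummy padding
    # requests were already filtered out when the slot list was built.
    slots = torch.tensor(gen_request_seq_slots, dtype=torch.long)
    return cache_indirection[slots]

buf = torch.arange(4 * 2 * 3).reshape(4, 2, 3)
assert gather_cache_indirection(buf, [0, 2]).shape == (2, 2, 3)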