File tree Expand file tree Collapse file tree 2 files changed +12
-4
lines changed Expand file tree Collapse file tree 2 files changed +12
-4
lines changed Original file line number Diff line number Diff line change @@ -726,8 +726,11 @@ def disable_optimization(backend: Backend):
726
726
# For a non-draft model, we also capture the CUDA graph instance for draft length 0,
727
727
# so that when we disable spec decode at runtime, we can still run the captured graph.
728
728
# Note that in one-engine mode, we are not able to turn off spec decode at runtime.
729
- if not self .is_draft_model and self .max_draft_len > 0 and not self .spec_config .spec_dec_mode .use_one_engine (
730
- ):
729
+ if (not self .is_draft_model and self .max_draft_len > 0
730
+ and not self .spec_config .spec_dec_mode .use_one_engine ()
731
+ # Assume that speculation is always on if the user didn't give us a max_concurrency
732
+ # value. Skipping the draft-length-0 graph in that case saves memory.
733
+ and self .spec_config .max_concurrency is not None ):
731
734
draft_lengths .append (0 )
732
735
733
736
for bs in cuda_graph_batch_sizes :
Original file line number Diff line number Diff line change 1
1
from abc import ABC , abstractmethod
2
- from typing import List , Optional
2
+ from typing import List , Optional , final
3
3
4
4
from ..pyexecutor .llm_request import LlmRequest
5
5
from ..pyexecutor .resource_manager import ResourceManager
@@ -26,8 +26,13 @@ def prepare_draft_tokens(
26
26
"""
27
27
raise NotImplementedError
28
28
29
+ @final
29
30
def should_use_spec_decode (self , requests : List [LlmRequest ]) -> bool :
30
- """Check if spec decode should be used for the current iteration."""
31
+ """
32
+ Do not override this method (it is marked @final). ModelEngine
33
+ assumes that speculation is always on if max_concurrency
34
+ is not specified by the user's spec config.
35
+ """
31
36
if self .max_concurrency is not None :
32
37
return len (requests ) <= self .max_concurrency
33
38
return True
You can’t perform that action at this time.
0 commit comments