Skip to content

Commit 9645814

Browse files
authored
[chore] Clean up quickstart_advanced.py (#6021)
Signed-off-by: Mike Iovine <[email protected]>
1 parent d7f0b0a commit 9645814

File tree

5 files changed

+16
-19
lines changed

5 files changed

+16
-19
lines changed

examples/llm-api/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ python3 quickstart_multimodal.py --model_dir Efficient-Large-Model/NVILA-8B --mo
4040
python3 quickstart_advanced.py \
4141
--model_dir meta-llama/Llama-3.1-8B-Instruct \
4242
--spec_decode_algo NGRAM \
43-
--spec_decode_nextn 4 \
43+
--spec_decode_max_draft_len 4 \
4444
--max_matching_ngram_size 2 \
4545
--disable_overlap_scheduler \
4646
--disable_kv_cache_reuse
@@ -51,7 +51,7 @@ python3 quickstart_advanced.py \
5151
python3 quickstart_advanced.py \
5252
--model_dir meta-llama/Llama-3.1-8B-Instruct \
5353
--spec_decode_algo draft_target \
54-
--spec_decode_nextn 5 \
54+
--spec_decode_max_draft_len 5 \
5555
--draft_model_dir meta-llama/Llama-3.2-1B-Instruct \
5656
--disable_overlap_scheduler \
5757
--disable_kv_cache_reuse

examples/llm-api/quickstart_advanced.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,8 @@ def add_llm_args(parser):
108108

109109
# Speculative decoding
110110
parser.add_argument('--spec_decode_algo', type=str, default=None)
111-
parser.add_argument('--spec_decode_nextn', type=int, default=1)
112-
parser.add_argument('--draft_model_dir',
113-
'--eagle_model_dir',
114-
type=str,
115-
default=None)
111+
parser.add_argument('--spec_decode_max_draft_len', type=int, default=1)
112+
parser.add_argument('--draft_model_dir', type=str, default=None)
116113
parser.add_argument('--max_matching_ngram_size', type=int, default=5)
117114
parser.add_argument('--use_one_model', default=False, action='store_true')
118115

@@ -162,23 +159,23 @@ def setup_llm(args, **kwargs):
162159
)
163160

164161
spec_config = MTPDecodingConfig(
165-
num_nextn_predict_layers=args.spec_decode_nextn,
162+
num_nextn_predict_layers=args.spec_decode_max_draft_len,
166163
use_relaxed_acceptance_for_thinking=args.
167164
use_relaxed_acceptance_for_thinking,
168165
relaxed_topk=args.relaxed_topk,
169166
relaxed_delta=args.relaxed_delta)
170167
elif spec_decode_algo == "EAGLE3":
171168
spec_config = EagleDecodingConfig(
172-
max_draft_len=args.spec_decode_nextn,
169+
max_draft_len=args.spec_decode_max_draft_len,
173170
speculative_model_dir=args.draft_model_dir,
174171
eagle3_one_model=args.use_one_model)
175172
elif spec_decode_algo == "DRAFT_TARGET":
176173
spec_config = DraftTargetDecodingConfig(
177-
max_draft_len=args.spec_decode_nextn,
174+
max_draft_len=args.spec_decode_max_draft_len,
178175
speculative_model_dir=args.draft_model_dir)
179176
elif spec_decode_algo == "NGRAM":
180177
spec_config = NGramDecodingConfig(
181-
max_draft_len=args.spec_decode_nextn,
178+
max_draft_len=args.spec_decode_max_draft_len,
182179
max_matching_ngram_size=args.max_matching_ngram_size,
183180
is_keep_all=True,
184181
is_use_oldest=True,

examples/models/core/deepseek_v3/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ Prompt: 'The future of AI is', Generated text: ' a topic of great interest and s
9797
To run with MTP, use [examples/llm-api/quickstart_advanced.py](../pytorch/quickstart_advanced.py) with additional options, see
9898
```bash
9999
cd examples/llm-api
100-
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N
100+
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_max_draft_len N
101101
```
102102

103103
`N` is the number of MTP modules. When `N` is equal to `0`, which means that MTP is not used (default). When `N` is greater than `0`, which means that `N` MTP modules are enabled. In the current implementation, the weight of each MTP module is shared.
@@ -124,7 +124,7 @@ When verifying and receiving draft tokens, there are two ways:
124124

125125
```bash
126126
cd examples/llm-api
127-
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N --use_relaxed_acceptance_for_thinking --relaxed_topk 15 --relaxed_delta 0.5
127+
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_max_draft_len N --use_relaxed_acceptance_for_thinking --relaxed_topk 15 --relaxed_delta 0.5
128128
```
129129

130130
### Long context support

examples/ngram/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ python examples/summarize.py \
9090

9191
```bash
9292
python3 examples/llm-api/quickstart_advanced.py \
93-
--spec_decode_nextn 4 \
93+
--spec_decode_max_draft_len 4 \
9494
--max_matching_ngram_size 2 \
9595
--disable_overlap_scheduler \
9696
--disable_kv_cache_reuse

tests/integration/defs/test_e2e.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1641,7 +1641,7 @@ def test_ptp_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
16411641
[
16421642
str(example_root / "quickstart_advanced.py"),
16431643
"--use_cuda_graph",
1644-
"--spec_decode_nextn",
1644+
"--spec_decode_max_draft_len",
16451645
"1", # test 1 MTP module
16461646
"--spec_decode_algo",
16471647
"MTP",
@@ -1720,13 +1720,13 @@ def test_ptp_quickstart_advanced_eagle3(llm_root, llm_venv, model_name,
17201720
delete_on_close=True) as running_log:
17211721
llm_venv.run_cmd([
17221722
str(example_root / "quickstart_advanced.py"),
1723-
"--spec_decode_nextn",
1723+
"--spec_decode_max_draft_len",
17241724
"4",
17251725
"--spec_decode_algo",
17261726
"eagle3",
17271727
"--model_dir",
17281728
f"{llm_models_root()}/{model_path}",
1729-
"--eagle_model_dir",
1729+
"--draft_model_dir",
17301730
f"{llm_models_root()}/{eagle_model_path}",
17311731
"--disable_kv_cache_reuse",
17321732
"--disable_overlap_scheduler",
@@ -1753,7 +1753,7 @@ def test_ptp_quickstart_advanced_ngram(llm_root, llm_venv, model_name,
17531753
f"{llm_models_root()}/{model_path}",
17541754
"--spec_decode_algo",
17551755
"NGRAM",
1756-
"--spec_decode_nextn",
1756+
"--spec_decode_max_draft_len",
17571757
"4",
17581758
"--max_matching_ngram_size",
17591759
"2",
@@ -1829,7 +1829,7 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus(
18291829
"--disable_kv_cache_reuse",
18301830
"--spec_decode_algo",
18311831
"MTP",
1832-
"--spec_decode_nextn",
1832+
"--spec_decode_max_draft_len",
18331833
"5",
18341834
"--use_relaxed_acceptance_for_thinking",
18351835
"--relaxed_topk=10",

0 commit comments

Comments (0)