Skip to content

Commit 9645814

Browse files
authored
[chore] Clean up quickstart_advanced.py (#6021)
Signed-off-by: Mike Iovine <[email protected]>
1 parent d7f0b0a commit 9645814

File tree

5 files changed

+16
-19
lines changed

5 files changed

+16
-19
lines changed

examples/llm-api/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ python3 quickstart_multimodal.py --model_dir Efficient-Large-Model/NVILA-8B --mo
4040
python3 quickstart_advanced.py \
4141
--model_dir meta-llama/Llama-3.1-8B-Instruct \
4242
--spec_decode_algo NGRAM \
43-
--spec_decode_nextn 4 \
43+
--spec_decode_max_draft_len 4 \
4444
--max_matching_ngram_size 2 \
4545
--disable_overlap_scheduler \
4646
--disable_kv_cache_reuse
@@ -51,7 +51,7 @@ python3 quickstart_advanced.py \
5151
python3 quickstart_advanced.py \
5252
--model_dir meta-llama/Llama-3.1-8B-Instruct \
5353
--spec_decode_algo draft_target \
54-
--spec_decode_nextn 5 \
54+
--spec_decode_max_draft_len 5 \
5555
--draft_model_dir meta-llama/Llama-3.2-1B-Instruct \
5656
--disable_overlap_scheduler \
5757
--disable_kv_cache_reuse

examples/llm-api/quickstart_advanced.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,8 @@ def add_llm_args(parser):
108108

109109
# Speculative decoding
110110
parser.add_argument('--spec_decode_algo', type=str, default=None)
111-
parser.add_argument('--spec_decode_nextn', type=int, default=1)
112-
parser.add_argument('--draft_model_dir',
113-
'--eagle_model_dir',
114-
type=str,
115-
default=None)
111+
parser.add_argument('--spec_decode_max_draft_len', type=int, default=1)
112+
parser.add_argument('--draft_model_dir', type=str, default=None)
116113
parser.add_argument('--max_matching_ngram_size', type=int, default=5)
117114
parser.add_argument('--use_one_model', default=False, action='store_true')
118115

@@ -162,23 +159,23 @@ def setup_llm(args, **kwargs):
162159
)
163160

164161
spec_config = MTPDecodingConfig(
165-
num_nextn_predict_layers=args.spec_decode_nextn,
162+
num_nextn_predict_layers=args.spec_decode_max_draft_len,
166163
use_relaxed_acceptance_for_thinking=args.
167164
use_relaxed_acceptance_for_thinking,
168165
relaxed_topk=args.relaxed_topk,
169166
relaxed_delta=args.relaxed_delta)
170167
elif spec_decode_algo == "EAGLE3":
171168
spec_config = EagleDecodingConfig(
172-
max_draft_len=args.spec_decode_nextn,
169+
max_draft_len=args.spec_decode_max_draft_len,
173170
speculative_model_dir=args.draft_model_dir,
174171
eagle3_one_model=args.use_one_model)
175172
elif spec_decode_algo == "DRAFT_TARGET":
176173
spec_config = DraftTargetDecodingConfig(
177-
max_draft_len=args.spec_decode_nextn,
174+
max_draft_len=args.spec_decode_max_draft_len,
178175
speculative_model_dir=args.draft_model_dir)
179176
elif spec_decode_algo == "NGRAM":
180177
spec_config = NGramDecodingConfig(
181-
max_draft_len=args.spec_decode_nextn,
178+
max_draft_len=args.spec_decode_max_draft_len,
182179
max_matching_ngram_size=args.max_matching_ngram_size,
183180
is_keep_all=True,
184181
is_use_oldest=True,

examples/models/core/deepseek_v3/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ Prompt: 'The future of AI is', Generated text: ' a topic of great interest and s
9797
To run with MTP, use [examples/llm-api/quickstart_advanced.py](../pytorch/quickstart_advanced.py) with additional options, see
9898
```bash
9999
cd examples/llm-api
100-
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N
100+
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_max_draft_len N
101101
```
102102

103103
`N` is the number of MTP modules. When `N` is equal to `0`, which means that MTP is not used (default). When `N` is greater than `0`, which means that `N` MTP modules are enabled. In the current implementation, the weight of each MTP module is shared.
@@ -124,7 +124,7 @@ When verifying and receiving draft tokens, there are two ways:
124124

125125
```bash
126126
cd examples/llm-api
127-
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_nextn N --use_relaxed_acceptance_for_thinking --relaxed_topk 15 --relaxed_delta 0.5
127+
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_max_draft_len N --use_relaxed_acceptance_for_thinking --relaxed_topk 15 --relaxed_delta 0.5
128128
```
129129

130130
### Long context support

examples/ngram/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ python examples/summarize.py \
9090

9191
```bash
9292
python3 examples/llm-api/quickstart_advanced.py \
93-
--spec_decode_nextn 4 \
93+
--spec_decode_max_draft_len 4 \
9494
--max_matching_ngram_size 2 \
9595
--disable_overlap_scheduler \
9696
--disable_kv_cache_reuse

tests/integration/defs/test_e2e.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1641,7 +1641,7 @@ def test_ptp_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
16411641
[
16421642
str(example_root / "quickstart_advanced.py"),
16431643
"--use_cuda_graph",
1644-
"--spec_decode_nextn",
1644+
"--spec_decode_max_draft_len",
16451645
"1", # test 1 MTP module
16461646
"--spec_decode_algo",
16471647
"MTP",
@@ -1720,13 +1720,13 @@ def test_ptp_quickstart_advanced_eagle3(llm_root, llm_venv, model_name,
17201720
delete_on_close=True) as running_log:
17211721
llm_venv.run_cmd([
17221722
str(example_root / "quickstart_advanced.py"),
1723-
"--spec_decode_nextn",
1723+
"--spec_decode_max_draft_len",
17241724
"4",
17251725
"--spec_decode_algo",
17261726
"eagle3",
17271727
"--model_dir",
17281728
f"{llm_models_root()}/{model_path}",
1729-
"--eagle_model_dir",
1729+
"--draft_model_dir",
17301730
f"{llm_models_root()}/{eagle_model_path}",
17311731
"--disable_kv_cache_reuse",
17321732
"--disable_overlap_scheduler",
@@ -1753,7 +1753,7 @@ def test_ptp_quickstart_advanced_ngram(llm_root, llm_venv, model_name,
17531753
f"{llm_models_root()}/{model_path}",
17541754
"--spec_decode_algo",
17551755
"NGRAM",
1756-
"--spec_decode_nextn",
1756+
"--spec_decode_max_draft_len",
17571757
"4",
17581758
"--max_matching_ngram_size",
17591759
"2",
@@ -1829,7 +1829,7 @@ def test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus(
18291829
"--disable_kv_cache_reuse",
18301830
"--spec_decode_algo",
18311831
"MTP",
1832-
"--spec_decode_nextn",
1832+
"--spec_decode_max_draft_len",
18331833
"5",
18341834
"--use_relaxed_acceptance_for_thinking",
18351835
"--relaxed_topk=10",

0 commit comments

Comments (0)