@@ -302,6 +302,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
302
302
MODEL_PATH = f"{ llm_models_root ()} /llama-3.1-model/Llama-3.1-8B-Instruct"
303
303
304
304
@pytest .mark .skip_less_device_memory (32000 )
305
+ @pytest .mark .skip_less_device (2 )
305
306
@pytest .mark .parametrize ("disable_overlap_scheduler" , [False , True ])
306
307
def test_auto_dtype (self , disable_overlap_scheduler ):
307
308
ctx_server_config = {"disable_overlap_scheduler" : True }
@@ -331,6 +332,8 @@ def test_auto_dtype(self, disable_overlap_scheduler):
331
332
task = GSM8K (self .MODEL_NAME )
332
333
task .evaluate (llm )
333
334
335
+ @pytest .mark .skip_less_device (2 )
336
+ @skip_pre_hopper
334
337
def test_ngram (self ):
335
338
speculative_decoding_config = {
336
339
"decoding_type" : "NGram" ,
@@ -381,6 +384,7 @@ def test_ngram(self):
381
384
@skip_pre_hopper
382
385
@parametrize_with_ids ("overlap_scheduler" , [True , False ])
383
386
@parametrize_with_ids ("eagle3_one_model" , [True , False ])
387
+ @pytest .mark .skip_less_device (2 )
384
388
def test_eagle3 (self , overlap_scheduler , eagle3_one_model ):
385
389
speculative_decoding_config = {
386
390
"decoding_type" : "Eagle" ,
@@ -437,36 +441,33 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
437
441
task = GSM8K (self .MODEL_NAME )
438
442
task .evaluate (llm )
439
443
440
- @pytest .mark .skip_less_device (2 )
441
444
@pytest.mark.parametrize(
    "tp,pp",
    [(1, 2), (2, 1), (2, 2)],
    ids=["tp1pp2", "tp2pp1", "tp2pp2"],
)
@pytest.mark.parametrize("testset", ["GSM8K", "MMLU"])
def test_tp_pp_symmetric(self, tp, pp, testset):
    """Run the parallel accuracy test with the same (pp, tp) pair used twice.

    Parametrized over three tp/pp combinations and two test sets. The
    result of ``run_parallel_test`` is returned unchanged.
    """
    # The (pp, tp) pair is passed twice in a row, followed by two 1s.
    # NOTE(review): presumably the first pair configures the context server
    # and the second the generation server, with the trailing values being
    # instance counts -- confirm against run_parallel_test's signature.
    parallel_layout = (pp, tp, pp, tp, 1, 1)
    return run_parallel_test(
        self.MODEL_NAME,
        self.MODEL_PATH,
        *parallel_layout,
        get_accuracy_task(testset),
    )
447
450
448
- @pytest .mark .skip_less_device (4 )
449
451
@parametrize_with_ids("ctx_pp", [2, 4])
@parametrize_with_ids("gen_tp", [1, 2])
@pytest.mark.parametrize("testset", ["GSM8K", "MMLU"])
def test_ctx_pp_gen_tp_asymmetric(self, ctx_pp, gen_tp, testset):
    """Run the parallel accuracy test with differing ctx_pp and gen_tp values.

    Parametrized over ``ctx_pp`` in {2, 4}, ``gen_tp`` in {1, 2} and two
    test sets. The result of ``run_parallel_test`` is returned unchanged.
    """
    # Only the first and fourth layout slots vary; the rest are fixed at 1.
    # NOTE(review): slot meanings (ctx pp/tp, gen pp/tp, instance counts) are
    # inferred from argument names -- confirm against run_parallel_test.
    parallel_layout = (ctx_pp, 1, 1, gen_tp, 1, 1)
    return run_parallel_test(
        self.MODEL_NAME,
        self.MODEL_PATH,
        *parallel_layout,
        get_accuracy_task(testset),
    )
455
457
456
- @pytest .mark .skip_less_device (4 )
457
458
@pytest.mark.parametrize("testset", ["GSM8K", "MMLU"])
def test_multi_instance(self, testset):
    """Run the parallel accuracy test with the last two layout values set to 2.

    Parametrized over two test sets. The result of ``run_parallel_test`` is
    returned unchanged.
    """
    # First four layout slots are 1; the final two are 2.
    # NOTE(review): the trailing pair presumably gives the number of context
    # and generation instances -- confirm against run_parallel_test.
    parallel_layout = (1, 1, 1, 1, 2, 2)
    return run_parallel_test(
        self.MODEL_NAME,
        self.MODEL_PATH,
        *parallel_layout,
        get_accuracy_task(testset),
    )
461
462
462
463
463
- @pytest .mark .skip_less_device_memory (140000 )
464
- @pytest .mark .timeout (3600 )
465
- @pytest .mark .skip_less_device (4 )
466
464
class TestLlama4ScoutInstruct (LlmapiAccuracyTestHarness ):
467
465
MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
468
466
MODEL_PATH = f"{ llm_models_root ()} /llama4-models/Llama-4-Scout-17B-16E-Instruct"
469
467
468
+ @pytest .mark .skip_less_device_memory (140000 )
469
+ @pytest .mark .timeout (3600 )
470
+ @pytest .mark .skip_less_device (8 )
470
471
@pytest .mark .parametrize ("overlap_scheduler" , [False , True ])
471
472
def test_auto_dtype (self , overlap_scheduler ):
472
473
ctx_server_config = {"disable_overlap_scheduler" : True }
@@ -505,6 +506,8 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
505
506
MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
506
507
MODEL_PATH = f"{ llm_models_root ()} /DeepSeek-V3-Lite/bf16"
507
508
509
+ @pytest .mark .skip_less_device (2 )
510
+ @pytest .mark .skip_less_device_memory (60000 )
508
511
def test_nixl_backend (self ):
509
512
ctx_server_config = {
510
513
"disable_overlap_scheduler" : True ,
@@ -542,7 +545,7 @@ def test_nixl_backend(self):
542
545
@parametrize_with_ids ("overlap_scheduler" , [True , False ])
543
546
@parametrize_with_ids ("mtp_nextn" ,
544
547
[0 , pytest .param (2 , marks = skip_pre_hopper )])
545
- @pytest .mark .skip_less_device (4 )
548
+ @pytest .mark .skip_less_device (8 )
546
549
def test_auto_dtype (self , overlap_scheduler , mtp_nextn ):
547
550
ctx_server_config = {"disable_overlap_scheduler" : True }
548
551
gen_server_config = {"disable_overlap_scheduler" : not overlap_scheduler }
@@ -586,6 +589,7 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
586
589
MODEL_NAME = "google/gemma-3-1b-it"
587
590
MODEL_PATH = f"{ llm_models_root ()} /gemma/gemma-3-1b-it/"
588
591
592
+ @pytest .mark .skip_less_device (2 )
589
593
@pytest .mark .parametrize ("overlap_scheduler" , [False , True ])
590
594
def test_auto_dtype (self , overlap_scheduler ):
591
595
ctx_server_config = {
@@ -637,6 +641,7 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
637
641
MODEL_NAME = "Qwen3/Qwen3-8B"
638
642
MODEL_PATH = f"{ llm_models_root ()} /Qwen3/Qwen3-8B-FP8"
639
643
644
+ @pytest .mark .skip_less_device (2 )
640
645
def test_nixl_backend (self ):
641
646
ctx_server_config = {
642
647
"disable_overlap_scheduler" : True ,
@@ -673,8 +678,9 @@ def test_nixl_backend(self):
673
678
task = GSM8K (self .MODEL_NAME )
674
679
task .evaluate (llm )
675
680
676
- @pytest .mark .parametrize ("overlap_scheduler" , [False , True ])
677
681
@skip_pre_hopper
682
+ @pytest .mark .skip_less_device (2 )
683
+ @pytest .mark .parametrize ("overlap_scheduler" , [False , True ])
678
684
def test_auto_dtype (self , overlap_scheduler ):
679
685
ctx_server_config = {
680
686
"disable_overlap_scheduler" : True ,
0 commit comments