@@ -345,6 +345,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
345
345
MODEL_PATH = f"{ llm_models_root ()} /llama-3.1-model/Llama-3.1-8B-Instruct"
346
346
347
347
@pytest .mark .skip_less_device_memory (32000 )
348
+ @pytest .mark .skip_less_device (2 )
348
349
@pytest .mark .parametrize ("disable_overlap_scheduler" , [False , True ])
349
350
def test_auto_dtype (self , disable_overlap_scheduler ):
350
351
ctx_server_config = {"disable_overlap_scheduler" : True }
@@ -374,6 +375,8 @@ def test_auto_dtype(self, disable_overlap_scheduler):
374
375
task = GSM8K (self .MODEL_NAME )
375
376
task .evaluate (llm )
376
377
378
+ @pytest .mark .skip_less_device (2 )
379
+ @skip_pre_hopper
377
380
def test_ngram (self ):
378
381
speculative_decoding_config = {
379
382
"decoding_type" : "NGram" ,
@@ -424,6 +427,7 @@ def test_ngram(self):
424
427
@skip_pre_hopper
425
428
@parametrize_with_ids ("overlap_scheduler" , [True , False ])
426
429
@parametrize_with_ids ("eagle3_one_model" , [True , False ])
430
+ @pytest .mark .skip_less_device (2 )
427
431
def test_eagle3 (self , overlap_scheduler , eagle3_one_model ):
428
432
speculative_decoding_config = {
429
433
"decoding_type" : "Eagle" ,
@@ -578,7 +582,6 @@ def test_tp_pp_symmetric(self, tp, pp, testset):
578
582
return run_parallel_test (self .MODEL_NAME , self .MODEL_PATH , pp , tp , pp ,
579
583
tp , 1 , 1 , [get_accuracy_task (testset )])
580
584
581
- @pytest .mark .skip_less_device (4 )
582
585
@parametrize_with_ids ("ctx_pp" , [2 , 4 ])
583
586
@parametrize_with_ids ("gen_tp" , [1 , 2 ])
584
587
@pytest .mark .parametrize ("testset" , ["GSM8K" , "MMLU" ])
@@ -589,20 +592,18 @@ def test_ctx_pp_gen_tp_asymmetric(self, ctx_pp, gen_tp, testset):
589
592
return run_parallel_test (self .MODEL_NAME , self .MODEL_PATH , ctx_pp , 1 , 1 ,
590
593
gen_tp , 1 , 1 , [get_accuracy_task (testset )])
591
594
592
- @pytest .mark .skip_less_device (4 )
593
595
@pytest .mark .parametrize ("testset" , ["GSM8K" , "MMLU" ])
594
596
def test_multi_instance (self , testset ):
595
597
return run_parallel_test (self .MODEL_NAME , self .MODEL_PATH , 1 , 1 , 1 , 1 ,
596
598
2 , 2 , [get_accuracy_task (testset )])
597
599
598
600
599
- @pytest .mark .skip_less_device_memory (140000 )
600
- @pytest .mark .timeout (3600 )
601
- @pytest .mark .skip_less_device (4 )
602
601
class TestLlama4ScoutInstruct (LlmapiAccuracyTestHarness ):
603
602
MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
604
603
MODEL_PATH = f"{ llm_models_root ()} /llama4-models/Llama-4-Scout-17B-16E-Instruct"
605
604
605
+ @pytest .mark .skip_less_device_memory (140000 )
606
+ @pytest .mark .timeout (3600 )
606
607
@pytest .mark .skip_less_device (8 )
607
608
@pytest .mark .parametrize ("overlap_scheduler" , [False , True ])
608
609
def test_auto_dtype (self , overlap_scheduler ):
@@ -642,6 +643,8 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
642
643
MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
643
644
MODEL_PATH = f"{ llm_models_root ()} /DeepSeek-V3-Lite/bf16"
644
645
646
+ @pytest .mark .skip_less_device (2 )
647
+ @pytest .mark .skip_less_device_memory (60000 )
645
648
def test_nixl_backend (self ):
646
649
ctx_server_config = {
647
650
"disable_overlap_scheduler" : True ,
@@ -680,7 +683,7 @@ def test_nixl_backend(self):
680
683
@parametrize_with_ids ("overlap_scheduler" , [True , False ])
681
684
@parametrize_with_ids ("mtp_nextn" ,
682
685
[0 , pytest .param (2 , marks = skip_pre_hopper )])
683
- @pytest .mark .skip_less_device (4 )
686
+ @pytest .mark .skip_less_device (8 )
684
687
def test_auto_dtype (self , overlap_scheduler , mtp_nextn ):
685
688
ctx_server_config = {"disable_overlap_scheduler" : True }
686
689
gen_server_config = {"disable_overlap_scheduler" : not overlap_scheduler }
@@ -724,6 +727,7 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
724
727
MODEL_NAME = "google/gemma-3-1b-it"
725
728
MODEL_PATH = f"{ llm_models_root ()} /gemma/gemma-3-1b-it/"
726
729
730
+ @pytest .mark .skip_less_device (2 )
727
731
@pytest .mark .parametrize ("overlap_scheduler" , [False , True ])
728
732
def test_auto_dtype (self , overlap_scheduler ):
729
733
pytest .skip (
@@ -779,6 +783,7 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
779
783
MODEL_NAME = "Qwen3/Qwen3-8B"
780
784
MODEL_PATH = f"{ llm_models_root ()} /Qwen3/Qwen3-8B-FP8"
781
785
786
+ @pytest .mark .skip_less_device (2 )
782
787
def test_nixl_backend (self ):
783
788
ctx_server_config = {
784
789
"disable_overlap_scheduler" : True ,
@@ -813,8 +818,9 @@ def test_nixl_backend(self):
813
818
task = GSM8K (self .MODEL_NAME )
814
819
task .evaluate (llm )
815
820
816
- @pytest .mark .parametrize ("overlap_scheduler" , [False , True ])
817
821
@skip_pre_hopper
822
+ @pytest .mark .skip_less_device (2 )
823
+ @pytest .mark .parametrize ("overlap_scheduler" , [False , True ])
818
824
def test_auto_dtype (self , overlap_scheduler ):
819
825
ctx_server_config = {
820
826
"disable_overlap_scheduler" : True ,
0 commit comments