@@ -281,30 +281,6 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
 
-    def test_nixl_backend(self):
-        ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
-        gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
-        disaggregated_server_config = {
-            "hostname": "localhost",
-            "port": 8000,
-            "backend": "pytorch",
-            "context_servers": {
-                "num_instances": 1,
-                "urls": ["localhost:8001"]
-            },
-            "generation_servers": {
-                "num_instances": 1,
-                "urls": ["localhost:8002"]
-            }
-        }
-        with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config, gen_server_config,
-                                      self.MODEL_PATH) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @pytest.mark.skip_less_device_memory(32000)
     @pytest.mark.parametrize("disable_overlap_scheduler", [False, True])
     def test_auto_dtype(self, disable_overlap_scheduler):
@@ -590,8 +566,18 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
 
     def test_nixl_backend(self):
-        ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
-        gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
         disaggregated_server_config = {
             "hostname": "localhost",
             "port": 8000,
@@ -606,10 +592,8 @@ def test_nixl_backend(self):
             }
         }
         with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config,
-                                      gen_server_config,
-                                      self.MODEL_PATH,
-                                      tensor_parallel_size=4) as llm:
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
@@ -716,6 +700,42 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen3/Qwen3-8B"
     MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8"
 
+    def test_nixl_backend(self):
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        ctx_server_config["cache_transceiver_config"]
+        ctx_server_config["cache_transceiver_config"]
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.parametrize("overlap_scheduler", [False, True])
     def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config = {