@@ -2090,7 +2090,7 @@ def test_ptp_quickstart_advanced_8gpus(llm_root, llm_venv, model_name,
 def test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k(
         llm_root, llm_venv, model_name, model_path, cuda_graph):
     print(f"Testing {model_name} on 8 GPUs.")
-    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
+    example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
     cmd = [
         str(example_root / "quickstart_advanced.py"),
         "--enable_chunked_prefill",
@@ -2115,10 +2115,12 @@ def test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k(
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(2)
 @pytest.mark.parametrize("model_name,model_path", [
-    ("Llama3.1-70B-BF16", "llama-3.1-model/Meta-Llama-3.1-70B"),
     ('Nemotron-Super-49B-v1-BF16',
      'nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1'),
     ("Mixtral-8x7B-BF16", "Mixtral-8x7B-Instruct-v0.1"),
+    pytest.param('Llama3.1-70B-BF16',
+                 'llama-3.1-model/Meta-Llama-3.1-70B',
+                 marks=pytest.mark.skip_less_device_memory(95000)),
 ])
 def test_ptp_quickstart_advanced_2gpus_sm120(llm_root, llm_venv, model_name,
                                              model_path):
@@ -2565,6 +2567,106 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
     print("All answers are correct!")
 
 
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.parametrize("model_name,model_path", [
+    ("gemma-3-27b-it", "gemma/gemma-3-27b-it"),
+    ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"),
+    ("Phi-4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"),
+])
+def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
+                                             model_path):
+    example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
+    test_data_root = Path(
+        os.path.join(llm_models_root(), "multimodals", "test_data"))
+
+    print(f"Accuracy test {model_name} image mode with example inputs.")
+
+    # Define accuracy inputs for image modality
+    accuracy_inputs = {
+        "image": {
+            "prompt": [
+                "Describe what you see in this image.",
+                "How would you describe the atmosphere of this scene?",
+            ],
+            "media": [
+                str(test_data_root / "inpaint.png"),
+            ],
+        }
+    }
+
+    # Define expected keywords for each model
+    expected_keywords = {
+        "gemma-3-27b-it": {
+            "image": [
+                ["half", "dome", "yosemite", "landmark", "rounded"],
+                ["atmosphere", "peaceful", "majestic", "calm", "quiet"],
+            ],
+        },
+        "mistral-small-3.1-24b-instruct": {
+            "image": [
+                ["depicts", "landscape", "rock", "sky", "high", "altitude"],
+                ["atmosphere", "serene", "majestic", "sense", "tranquility"],
+            ],
+        },
+        "Phi-4-multimodal-instruct": {
+            "image": [
+                ["depicts", "landscape", "mountain", "half", "dome"],
+                ["atmosphere", "serene", "sense", "tranquility", "peace."],
+            ],
+        },
+    }
+    # Build command for image modality
+    cmd = [
+        str(example_root / "quickstart_multimodal.py"),
+        "--model_dir",
+        f"{llm_models_root()}/{model_path}",
+        "--modality",
+        "image",
+        "--multiturn",
+        "--prompt",
+        *accuracy_inputs["image"]["prompt"],
+        "--media",
+        *accuracy_inputs["image"]["media"],
+    ]
+
+    # Add model-specific configurations
+    if model_name == "gemma-3-27b-it":
+        # Gemma3 VLM needs a custom mask which is only supported by the flashinfer
+        # backend currently. The custom mask involves bidirectional masking of image
+        # tokens in the context phase; chunked prefill and kv cache reuse must be off.
+        cmd.append("--image_format=pil")
+        cmd.append("--attention_backend=FLASHINFER")
+        cmd.append("--disable_kv_cache_reuse")
+    elif model_name == "Phi-4-multimodal-instruct":
+        # Set max_seq_len to 4096 to use the short rope factor.
+        cmd.append("--max_seq_len=4096")
+        cmd.append("--load_lora")
+        cmd.append("--auto_model_name")
+        cmd.append("Phi4MMForCausalLM")
+
+    output = llm_venv.run_cmd(cmd, caller=check_output)
+    print("output:", output)
+    # Set the required keyword-match ratio based on the model
+    match_ratio = 4.0 / 5
+    if model_name == "Phi-4-multimodal-instruct":
+        match_ratio = 0.6
+
+    # Check output accuracy
+    for prompt_output, prompt_keywords in zip(
+            parse_output(output), expected_keywords[model_name]["image"]):
+        matches = [
+            keyword in prompt_output.lower() for keyword in prompt_keywords
+        ]
+        obs_match_ratio = 1. * sum(matches) / len(matches)
+        print("prompt_output:", prompt_output)
+        print("prompt_keywords:", prompt_keywords)
+        print("matches:", matches)
+        print("obs_match_ratio:", obs_match_ratio)
+        assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\nMatched keywords: {matches}\nObserved match ratio {obs_match_ratio} below threshold {match_ratio}"
+
+    print("All answers are correct!")
+
+
 @pytest.mark.parametrize("model_name,model_path", [
     ("BertForSequenceClassification", "bert/bert-base-uncased-yelp-polarity"),
 ])