@@ -2076,7 +2076,7 @@ def test_ptp_quickstart_advanced_8gpus(llm_root, llm_venv, model_name,
 def test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k(
         llm_root, llm_venv, model_name, model_path, cuda_graph):
     print(f"Testing {model_name} on 8 GPUs.")
-    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
+    example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
     cmd = [
         str(example_root / "quickstart_advanced.py"),
         "--enable_chunked_prefill",
@@ -2101,10 +2101,12 @@ def test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k(
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(2)
 @pytest.mark.parametrize("model_name,model_path", [
-    ("Llama3.1-70B-BF16", "llama-3.1-model/Meta-Llama-3.1-70B"),
     ('Nemotron-Super-49B-v1-BF16',
      'nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1'),
     ("Mixtral-8x7B-BF16", "Mixtral-8x7B-Instruct-v0.1"),
+    pytest.param('Llama3.1-70B-BF16',
+                 'llama-3.1-model/Meta-Llama-3.1-70B',
+                 marks=pytest.mark.skip_less_device_memory(95000)),
 ])
 def test_ptp_quickstart_advanced_2gpus_sm120(llm_root, llm_venv, model_name,
                                              model_path):
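The hunk above moves the Llama3.1-70B case into pytest.param so that one case can carry its own, stricter memory requirement. For reference, a minimal self-contained sketch of per-case marks in parametrize, using the built-in skipif mark in place of this suite's custom skip_less_device_memory:

import pytest


@pytest.mark.parametrize("value", [
    1,
    # The mark rides along with just this case; the other cases are unaffected.
    pytest.param(2, marks=pytest.mark.skipif(True, reason="illustration")),
])
def test_marks(value):
    assert value > 0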
@@ -2551,6 +2553,106 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
     print("All answers are correct!")


+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.parametrize("model_name,model_path", [
+    ("gemma-3-27b-it", "gemma/gemma-3-27b-it"),
+    ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"),
+    ("Phi-4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"),
+])
+def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
+                                             model_path):
+    example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
+    test_data_root = Path(
+        os.path.join(llm_models_root(), "multimodals", "test_data"))
+
+    print(f"Accuracy test {model_name} image mode with example inputs.")
+
+    # Define accuracy inputs for image modality
+    accuracy_inputs = {
+        "image": {
+            "prompt": [
+                "Describe what you see in this image.",
+                "How would you describe the atmosphere of this scene?",
+            ],
+            "media": [
+                str(test_data_root / "inpaint.png"),
+            ],
+        }
+    }
+
+    # Define expected keywords for each model
+    expected_keywords = {
+        "gemma-3-27b-it": {
+            "image": [
+                ["half", "dome", "yosemite", "landmark", "rounded"],
+                ["atmosphere", "peaceful", "majestic", "calm", "quiet"],
+            ],
+        },
+        "mistral-small-3.1-24b-instruct": {
+            "image": [
+                ["depicts", "landscape", "rock", "sky", "high", "altitude"],
+                ["atmosphere", "serene", "majestic", "sense", "tranquility"],
+            ],
+        },
+        "Phi-4-multimodal-instruct": {
+            "image": [
+                ["depicts", "landscape", "mountain", "half", "dome"],
+                ["atmosphere", "serene", "sense", "tranquility", "peace."],
+            ],
+        },
+    }
+    # Build command for image modality
+    cmd = [
+        str(example_root / "quickstart_multimodal.py"),
+        "--model_dir",
+        f"{llm_models_root()}/{model_path}",
+        "--modality",
+        "image",
+        "--multiturn",
+        "--prompt",
+        *accuracy_inputs["image"]["prompt"],
+        "--media",
+        *accuracy_inputs["image"]["media"],
+    ]
+
+    # Add model-specific configurations
+    if model_name == "gemma-3-27b-it":
+        # Gemma3 VLM needs a custom mask which is only supported by flashinfer backend currently.
+        # Custom mask involves bidirectional masking of image tokens in context phase. To get this
+        # correct, chunked prefill and kv cache reuse need to be turned off.
+        cmd.append("--image_format=pil")
+        cmd.append("--attention_backend=FLASHINFER")
+        cmd.append("--disable_kv_cache_reuse")
+    elif model_name == "Phi-4-multimodal-instruct":
+        # Set max_seq_len to 4096 to use short rope factor.
+        cmd.append("--max_seq_len=4096")
+        cmd.append("--load_lora")
+        cmd.append("--auto_model_name")
+        cmd.append("Phi4MMForCausalLM")
+
+    output = llm_venv.run_cmd(cmd, caller=check_output)
+    print("output:", output)
+    # Set match ratio based on model
+    match_ratio = 4.0 / 5
+    if model_name == "Phi-4-multimodal-instruct":
+        match_ratio = 0.6
+
+    # Check output accuracy
+    for prompt_output, prompt_keywords in zip(
+            parse_output(output), expected_keywords[model_name]["image"]):
+        matches = [
+            keyword in prompt_output.lower() for keyword in prompt_keywords
+        ]
+        obs_match_ratio = 1. * sum(matches) / len(matches)
+        print("prompt_output:", prompt_output)
+        print("prompt_keywords:", prompt_keywords)
+        print("matches:", matches)
+        print("obs_match_ratio:", obs_match_ratio)
+        assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\nMatched keywords: {matches}\nObserved match ratio {obs_match_ratio} below threshold {match_ratio}"
+
+    print("All answers are correct!")
+
+
 @pytest.mark.parametrize("model_name,model_path", [
     ("BertForSequenceClassification", "bert/bert-base-uncased-yelp-polarity"),
 ])
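The accuracy check in the new test tolerates paraphrased answers by requiring only a fraction of the expected keywords per response, rather than exact output matching. A minimal standalone sketch of that scoring logic; the answer text and threshold here are illustrative, not taken from the test data:

def keyword_match_ratio(answer: str, keywords: list[str]) -> float:
    """Return the fraction of expected keywords found in the generated answer."""
    matches = [keyword in answer.lower() for keyword in keywords]
    return sum(matches) / len(matches)


# With the 4/5 threshold used above, one miss out of five keywords still passes.
assert keyword_match_ratio(
    "Half Dome rises over Yosemite, a rounded granite landmark.",
    ["half", "dome", "yosemite", "landmark", "rounded"]) >= 4.0 / 5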