Skip to content

Commit 783670a

Browse files
committed
Updates
Signed-off-by: Ann Kuruvilla <[email protected]>
1 parent 55f16f6 commit 783670a

File tree

4 files changed

+21
-17
lines changed

4 files changed

+21
-17
lines changed

QEfficient/transformers/models/gemma3/modeling_gemma3.py

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -648,6 +648,17 @@ def forward(self, input_ids, position_ids, pixel_values, image_idx, past_key_val
648648
image_idx = (indices1.max() + 1).unsqueeze(0).unsqueeze(0)
649649
return outputs.logits, pixel_values, image_idx, outputs.past_key_values
650650

651+
def get_npi_file(self, model_name: str, **compiler_options):
652+
if model_name == "google/gemma-3-4b-it":
653+
compiler_options["node_precision_info"] = constants.DEFAULT_GEMMA3_4B_NODE_PRECISION_INFO
654+
elif model_name == "google/gemma-3-27b-it":
655+
compiler_options["node_precision_info"] = constants.DEFAULT_GEMMA3_27B_NODE_PRECISION_INFO
656+
else:
657+
raise ValueError(
658+
f"For Model {self.pretrained_model_name_or_path} default NPI file is not supported/added. Please use one of the following: google/gemma-3-4b-it, google/gemma-3-27b-it"
659+
)
660+
return compiler_options
661+
651662
def get_specializations(
652663
self,
653664
batch_size: int,
@@ -694,18 +705,6 @@ def get_specializations(
694705
]
695706
specializations = {}
696707

697-
# Default node precision file added for Gemma3:AI-100
698-
# if the user provides a custom node precision file, it will override the default one
699-
if "node_precision_info" not in compiler_options:
700-
if self.pretrained_model_name_or_path == "google/gemma-3-4b-it":
701-
compiler_options["node_precision_info"] = constants.DEFAULT_GEMMA3_4B_NODE_PRECISION_INFO
702-
elif self.pretrained_model_name_or_path == "google/gemma-3-27b-it":
703-
compiler_options["node_precision_info"] = constants.DEFAULT_GEMMA3_27B_NODE_PRECISION_INFO
704-
else:
705-
raise ValueError(
706-
f"For Model {self.pretrained_model_name_or_path} default NPI file is not supported/added. Please use one of the following: google/gemma-3-4b-it, google/gemma-3-27b-it"
707-
)
708-
709708
if kv_offload:
710709
specializations["vision"] = vision
711710
specializations["lang"] = lang

QEfficient/transformers/models/modeling_auto.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -681,6 +681,9 @@ def compile(
681681
**compiler_options,
682682
)
683683

684+
if hasattr(self.model, "get_npi_file"):
685+
compiler_options = self.model.get_npi_file(self.model.pretrained_model_name_or_path, **compiler_options)
686+
684687
custom_io_vision = {}
685688
kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
686689
custom_io_vision["pixel_values"] = "float16"
@@ -1030,6 +1033,9 @@ def compile(
10301033
**compiler_options,
10311034
)
10321035

1036+
if hasattr(self.model, "get_npi_file"):
1037+
self.model.get_npi_file(self.pretrained_model_name_or_path)
1038+
10331039
custom_io = {}
10341040
kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
10351041
# inputs

QEfficient/utils/constants.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
#
66
# -----------------------------------------------------------------------------
77

8-
import importlib.resources as pkg_resources
98
import os
109
from dataclasses import dataclass
1110

@@ -103,8 +102,8 @@ def get_models_dir():
103102
LLAMA4_MAX_POSITION_EMBEDDINGS = 65536
104103

105104
# Gemma3 Constant
106-
DEFAULT_GEMMA3_4B_NODE_PRECISION_INFO = str(pkg_resources.path(gemma3, "fp32_nodes_gemma3_4b_mm.yaml"))
107-
DEFAULT_GEMMA3_27B_NODE_PRECISION_INFO = str(pkg_resources.path(gemma3, "fp32_nodes_gemma3_27b_mm.yaml"))
105+
DEFAULT_GEMMA3_4B_NODE_PRECISION_INFO = "QEfficient/transformers/models/gemma3/fp32_nodes_gemma3_4b_mm.yaml"
106+
DEFAULT_GEMMA3_27B_NODE_PRECISION_INFO = "QEfficient/transformers/models/gemma3/fp32_nodes_gemma3_27b_mm.yaml"
108107

109108

110109
class Constants:

examples/gemma3_example/gemma3_mm.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,11 @@
2222
# pass HF_TOKEN if gated model
2323
# For running the model in the single QPC approach use kv_offload=False. For the Dual QPC approach use kv_offload=True ###
2424
qeff_model = QEFFAutoModelForImageTextToText.from_pretrained(
25-
model_id, config=config, attn_implementation="eager", kv_offload=True
25+
model_id, config=config, attn_implementation="eager", kv_offload=False
2626
)
2727

2828
### use skip_vision=True if you want to run only text, or False otherwise ###
29-
skip_vision = True
29+
skip_vision = False
3030

3131
if skip_vision:
3232
## Only Text ##

0 commit comments

Comments
 (0)