Fixed transform flag and other import-related issues

abhishek-singh591 · abhishek-singh591 · commit b50f44e74511 · 2025-09-05T11:11:02.000Z
Signed-off-by: abhishek-singh591 <sabhis@qti.qualcomm.com>
diff --git a/QEfficient/base/onnx_transforms.py b/QEfficient/base/onnx_transforms.py
@@ -67,10 +67,11 @@ def apply(
 
         def process_tensor(info: TensorInfo) -> bool:
             tensor, tsize = info
-            transformed = False
+            transformed_clip = False
+            transformed_split = False
 
-            if apply_clip and cls._clip_tensor(tensor, onnx_base_dir, fp16_min, fp16_max):
-                transformed = True
+            if apply_clip:
+                transformed_clip = cls._clip_tensor(tensor, onnx_base_dir, fp16_min, fp16_max)
 
             if apply_split and tsize > size_threshold:
                 if file_num_tracker["size"] + tsize > file_chunk_size:
@@ -80,9 +81,11 @@ def process_tensor(info: TensorInfo) -> bool:
                     file_num_tracker["size"] += tsize
 
                 cls._split_tensor(tensor, model_name, file_num_tracker["num"])
-                transformed = True
+                transformed_split = True
 
-            return transformed
+            if apply_clip and apply_split:
+                return transformed_clip and transformed_split
+            return transformed_clip or transformed_split
 
         with ThreadPoolExecutor(max_workers=os.cpu_count() * 4) as executor:
             transformed_flags = list(executor.map(process_tensor, tensor_infos))
diff --git a/QEfficient/exporter/export_utils.py b/QEfficient/exporter/export_utils.py
@@ -218,7 +218,9 @@ def fix_onnx_fp16(
         :str: Updated base name of exported ONNX model.
     """
     model = onnx.load(os.path.join(gen_models_path, f"{model_base_name}.onnx"))
-    model, fp16_fix = ClipAndSplitTransform.apply(model, onnx_base_dir=gen_models_path, apply_split=False)
+    model, fp16_fix = ClipAndSplitTransform.apply(
+        model, model_name="", onnx_base_dir=gen_models_path, apply_split=False
+    )
 
     if fp16_fix:
         # Save FP16 model
diff --git a/run.py b/run.py
@@ -0,0 +1,80 @@
+# # Initiate the Original Transformer model
+# from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM
+
+# # Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.
+# # os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache"
+
+# # ROOT_DIR = os.path.dirname(os.path.abspath(""))
+# # CACHE_DIR = os.path.join(ROOT_DIR, "tmp") #, you can use a different location for just one model by passing this param as cache_dir in below API.
+
+# # Model-Card name to be onboarded (This is HF Model Card name) : https://huggingface.co/gpt2-xl
+# model_name = "gpt2"  # Similar, we can change model name and generate corresponding models, if we have added the support in the lib.
+
+# qeff_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="gpt2")
+# print(f"{model_name} optimized for Cloud AI 100 \n", qeff_model)
+
+# # We can now export the modified models to ONNX framework
+# # This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for
+# # Cloud AI 100 Platform.
+
+# # While generating the ONNX model, this will clip the overflow constants to fp16
+# # Verify the model on Onnxruntime vs Pytorch
+
+# # Then generate inputs and customio yaml file required for compilation.
+# qeff_model.export()
+
+# # Compile the model for provided compilation arguments
+# # Please use platform SDK to Check num_cores for your card.
+
+# qeff_model.compile(
+#     num_cores=14,
+#     mxfp6=True,
+#     device_group=[0],
+# )
+
+# # Post compilation, we can print the latency stats for the KV models. We provide an API to print token and latency stats on Cloud AI 100.
+# # We need the compiled prefill and decode QPCs to compute the generated tokens. This is based on a greedy sampling approach.
+
+# qeff_model.generate(prompts=["My name is"])
+
+from transformers import AutoTokenizer
+
+from QEfficient import QEFFAutoModelForCausalLM
+
+print("done")
+model_name = "gpt2"
+# model_name = "google/gemma-3-1b-it"
+# model_name = "meta-llama/Llama-3.1-8B"
+# model_name = "meta-llama/Llama-3.2-1B"
+# model_name = "meta-llama/Llama-3.1-70B"
+# model_name = "meta-llama/Llama-3.1-8B"
+model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
+##########################################
+model.export()
+model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16, num_devices=1)  # Qpc file
+
+# model.compile(
+#     num_cores=14,
+#     mxfp6=True,
+#     device_group=[0],
+# )
+print("done")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+print("done")
+model.generate(prompts=["Hi there!!"], tokenizer=tokenizer, device_group=[0])
+print("done")
+
+# from qgenie import ChatMessage, QGenieClient
+
+
+# client = QGenieClient()
+
+
+# chat_response = client.chat(
+#     messages=[
+#         ChatMessage(role="user", content="Analyze this repository: https://github.com/quic/efficient-transformers")
+#     ],
+#     max_tokens=400,
+# )
+
+# print(chat_response.first_content)
diff --git a/tests/base/test_onnx_transforms.py b/tests/base/test_onnx_transforms.py
@@ -8,7 +8,7 @@
 import numpy as np
 import onnx
 
-from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform
+from QEfficient.base.onnx_transforms import ClipAndSplitTransform
 
 
 def test_fp16clip_transform():
@@ -32,7 +32,7 @@ def test_fp16clip_transform():
     }
     """)
     onnx.checker.check_model(test_onnx, True, True, True)
-    transformed_onnx, transformed = FP16ClipTransform.apply(test_onnx)
+    transformed_onnx, transformed = ClipAndSplitTransform.apply(test_onnx, model_name="", apply_split=False)
     assert transformed
     assert onnx.numpy_helper.to_array(transformed_onnx.graph.initializer[0]) == 65504.0
     assert onnx.numpy_helper.to_array(transformed_onnx.graph.initializer[1]) == 2147483647
@@ -63,7 +63,9 @@ def test_fp16clip_transform_external(tmp_path):
     np.array(-1e10, dtype="float32").tofile(tmp_path / external_tensors_file)
     onnx.checker.check_model(onnx_path, True, True, True)
 
-    transformed_onnx, transformed = FP16ClipTransform.apply(test_onnx, onnx_base_dir=str(tmp_path))
+    transformed_onnx, transformed = ClipAndSplitTransform.apply(
+        test_onnx, model_name="", onnx_base_dir=str(tmp_path), apply_split=False
+    )
     assert transformed
     assert onnx.numpy_helper.to_array(transformed_onnx.graph.initializer[0]) == -65504.0
 
@@ -92,12 +94,13 @@ def test_split_tensors_transform(tmp_path):
     tensors.tofile(tmp_path / external_tensors_file)
     onnx.checker.check_model(onnx_path, True, True, True)
 
-    trans_onnx, transformed = SplitTensorsTransform.apply(
+    trans_onnx, transformed = ClipAndSplitTransform.apply(
         test_onnx,
         model_name="test_split",
         onnx_base_dir=str(tmp_path),
         file_chunk_size=32 * 4,
         size_threshold=16 * 4,
+        apply_clip=True,
     )
 
     tensor0_ext_data = onnx.external_data_helper.ExternalDataInfo(trans_onnx.graph.initializer[0])