Commit b50f44e

Fixed transform flag and other import-related issues
Signed-off-by: abhishek-singh591 <[email protected]>
1 parent f40dd27 commit b50f44e

4 files changed: +98, -10 lines changed


QEfficient/base/onnx_transforms.py

Lines changed: 8 additions & 5 deletions
@@ -67,10 +67,11 @@ def apply(
 
         def process_tensor(info: TensorInfo) -> bool:
             tensor, tsize = info
-            transformed = False
+            transformed_clip = False
+            transformed_split = False
 
-            if apply_clip and cls._clip_tensor(tensor, onnx_base_dir, fp16_min, fp16_max):
-                transformed = True
+            if apply_clip:
+                transformed_clip = cls._clip_tensor(tensor, onnx_base_dir, fp16_min, fp16_max)
 
             if apply_split and tsize > size_threshold:
                 if file_num_tracker["size"] + tsize > file_chunk_size:
@@ -80,9 +81,11 @@ def process_tensor(info: TensorInfo) -> bool:
                 file_num_tracker["size"] += tsize
 
                 cls._split_tensor(tensor, model_name, file_num_tracker["num"])
-                transformed = True
+                transformed_split = True
 
-            return transformed
+            if apply_clip and apply_split:
+                return transformed_clip and transformed_split
+            return transformed_clip or transformed_split
 
         with ThreadPoolExecutor(max_workers=os.cpu_count() * 4) as executor:
             transformed_flags = list(executor.map(process_tensor, tensor_infos))
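
Note on the hunk above: the single transformed flag is split into transformed_clip and transformed_split, and when both clip and split are requested a tensor is reported as transformed only if both operations applied; with a single operation requested, that operation's own result is returned. A minimal standalone sketch of just that return logic (the names mirror the diff; this is illustrative, not library code):

```python
def combine_flags(apply_clip: bool, apply_split: bool,
                  transformed_clip: bool, transformed_split: bool) -> bool:
    """Mirror the per-tensor return logic introduced in this commit."""
    if apply_clip and apply_split:
        # Both transforms requested: report success only if both applied.
        return transformed_clip and transformed_split
    # Single transform requested: report the result of whichever one ran.
    return transformed_clip or transformed_split


# Clip applied but split skipped while both were requested -> not transformed.
assert combine_flags(True, True, True, False) is False
# Clip-only pass that applied -> transformed.
assert combine_flags(True, False, True, False) is True
```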

QEfficient/exporter/export_utils.py

Lines changed: 3 additions & 1 deletion
@@ -218,7 +218,9 @@ def fix_onnx_fp16(
     :str: Updated base name of exported ONNX model.
     """
     model = onnx.load(os.path.join(gen_models_path, f"{model_base_name}.onnx"))
-    model, fp16_fix = ClipAndSplitTransform.apply(model, onnx_base_dir=gen_models_path, apply_split=False)
+    model, fp16_fix = ClipAndSplitTransform.apply(
+        model, model_name="", onnx_base_dir=gen_models_path, apply_split=False
+    )
 
     if fp16_fix:
         # Save FP16 model
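
For context, fix_onnx_fp16 keeps its overall flow here: load the exported ONNX model, run the clip-only pass, and re-save when any constant was clipped; the change only passes model_name explicitly to the merged transform. A minimal sketch of that pattern, where the paths and the plain onnx.save call are illustrative assumptions rather than the exact library code:

```python
import os

import onnx

from QEfficient.base.onnx_transforms import ClipAndSplitTransform

gen_models_path = "./onnx_models"  # illustrative output directory
model_base_name = "model"          # illustrative base name

model = onnx.load(os.path.join(gen_models_path, f"{model_base_name}.onnx"))
model, fp16_fix = ClipAndSplitTransform.apply(
    model, model_name="", onnx_base_dir=gen_models_path, apply_split=False
)

if fp16_fix:
    # Persist the clipped model; the real function also returns an updated base name.
    onnx.save(model, os.path.join(gen_models_path, f"{model_base_name}_fp16.onnx"))
```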

run.py

Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
+# # Initiate the Original Transformer model
+# from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM
+
+# # Please uncomment and use an appropriate cache directory for transformers, in case you don't want to use the default ~/.cache dir.
+# # os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache"
+
+# # ROOT_DIR = os.path.dirname(os.path.abspath(""))
+# # CACHE_DIR = os.path.join(ROOT_DIR, "tmp")  # you can use a different location for just one model by passing this param as cache_dir in the API below.
+
+# # Model-Card name to be onboarded (this is the HF Model Card name): https://huggingface.co/gpt2-xl
+# model_name = "gpt2"  # Similarly, we can change the model name and generate corresponding models, if support has been added in the lib.
+
+# qeff_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path="gpt2")
+# print(f"{model_name} optimized for Cloud AI 100 \n", qeff_model)
+
+# # We can now export the modified models to the ONNX framework.
+# # This will generate a single ONNX model for both Prefill and Decode variations, optimized for
+# # the Cloud AI 100 platform.
+
+# # While generating the ONNX model, this will clip the overflow constants to fp16.
+# # Verify the model on ONNX Runtime vs PyTorch.
+
+# # Then generate inputs and the customio yaml file required for compilation.
+# qeff_model.export()
+
+# # Compile the model for the provided compilation arguments.
+# # Please use the platform SDK to check num_cores for your card.
+
+# qeff_model.compile(
+#     num_cores=14,
+#     mxfp6=True,
+#     device_group=[0],
+# )
+
+# # Post compilation, we can print the latency stats for the KV models; we provide an API to print token and latency stats on Cloud AI 100.
+# # We need the compiled prefill and decode QPCs to compute the tokens generated; this is based on a greedy sampling approach.
+
+# qeff_model.generate(prompts=["My name is"])
+
+from transformers import AutoTokenizer
+
+from QEfficient import QEFFAutoModelForCausalLM
+
+print("done")
+model_name = "gpt2"
+# model_name = "google/gemma-3-1b-it"
+# model_name = "meta-llama/Llama-3.1-8B"
+# model_name = "meta-llama/Llama-3.2-1B"
+# model_name = "meta-llama/Llama-3.1-70B"
+# model_name = "meta-llama/Llama-3.1-8B"
+model = QEFFAutoModelForCausalLM.from_pretrained(model_name)
+##########################################
+model.export()
+model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16, num_devices=1)  # QPC file
+
+# model.compile(
+#     num_cores=14,
+#     mxfp6=True,
+#     device_group=[0],
+# )
+print("done")
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+print("done")
+model.generate(prompts=["Hi there!!"], tokenizer=tokenizer, device_group=[0])
+print("done")
+
+# from qgenie import ChatMessage, QGenieClient
+
+
+# client = QGenieClient()
+
+
+# chat_response = client.chat(
+#     messages=[
+#         ChatMessage(role="user", content="Analyze this repository: https://github.com/quic/efficient-transformers")
+#     ],
+#     max_tokens=400,
+# )
+
+# print(chat_response.first_content)

tests/base/test_onnx_transforms.py

Lines changed: 7 additions & 4 deletions
@@ -8,7 +8,7 @@
 import numpy as np
 import onnx
 
-from QEfficient.base.onnx_transforms import FP16ClipTransform, SplitTensorsTransform
+from QEfficient.base.onnx_transforms import ClipAndSplitTransform
 
 
 def test_fp16clip_transform():
@@ -32,7 +32,7 @@ def test_fp16clip_transform():
     }
     """)
     onnx.checker.check_model(test_onnx, True, True, True)
-    transformed_onnx, transformed = FP16ClipTransform.apply(test_onnx)
+    transformed_onnx, transformed = ClipAndSplitTransform.apply(test_onnx, model_name="", apply_split=False)
     assert transformed
     assert onnx.numpy_helper.to_array(transformed_onnx.graph.initializer[0]) == 65504.0
     assert onnx.numpy_helper.to_array(transformed_onnx.graph.initializer[1]) == 2147483647
@@ -63,7 +63,9 @@ def test_fp16clip_transform_external(tmp_path):
     np.array(-1e10, dtype="float32").tofile(tmp_path / external_tensors_file)
     onnx.checker.check_model(onnx_path, True, True, True)
 
-    transformed_onnx, transformed = FP16ClipTransform.apply(test_onnx, onnx_base_dir=str(tmp_path))
+    transformed_onnx, transformed = ClipAndSplitTransform.apply(
+        test_onnx, model_name="", onnx_base_dir=str(tmp_path), apply_split=False
+    )
     assert transformed
     assert onnx.numpy_helper.to_array(transformed_onnx.graph.initializer[0]) == -65504.0
 
@@ -92,12 +94,13 @@ def test_split_tensors_transform(tmp_path):
     tensors.tofile(tmp_path / external_tensors_file)
    onnx.checker.check_model(onnx_path, True, True, True)
 
-    trans_onnx, transformed = SplitTensorsTransform.apply(
+    trans_onnx, transformed = ClipAndSplitTransform.apply(
         test_onnx,
         model_name="test_split",
         onnx_base_dir=str(tmp_path),
         file_chunk_size=32 * 4,
         size_threshold=16 * 4,
+        apply_clip=True,
     )
 
     tensor0_ext_data = onnx.external_data_helper.ExternalDataInfo(trans_onnx.graph.initializer[0])
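
The test updates above also show the migration path from the former FP16ClipTransform and SplitTensorsTransform to the merged class. A hedged usage sketch, assuming ClipAndSplitTransform.apply accepts the keyword arguments visible at these call sites (model_name, onnx_base_dir, apply_clip, apply_split, file_chunk_size, size_threshold) and returns the (model, transformed) pair the tests unpack; the model path and name below are illustrative:

```python
import onnx

from QEfficient.base.onnx_transforms import ClipAndSplitTransform

model = onnx.load("model.onnx")  # illustrative path

# Clip-only pass (replaces the old FP16ClipTransform.apply call).
model, clipped = ClipAndSplitTransform.apply(model, model_name="", apply_split=False)

# Combined clip-and-split pass (replaces FP16ClipTransform followed by SplitTensorsTransform).
model, transformed = ClipAndSplitTransform.apply(
    model,
    model_name="my_model",   # illustrative; used when naming the split external-data files
    onnx_base_dir=".",       # directory containing the model's existing external tensor data
    apply_clip=True,
    file_chunk_size=32 * 4,  # values mirror the test above
    size_threshold=16 * 4,
)
```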
