From a8661cd751bca76f2f4338e6484ce8b6a2ef72c8 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Tue, 5 Aug 2025 10:26:01 +0000 Subject: [PATCH 1/6] Bug fixed: the device_group arg is deprecated from compile and added in generate method, changes made in jupyter notebooks Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 9 +++++---- notebooks/QEfficientMPT.ipynb | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 74e8097bb..726c41b39 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -94,9 +94,10 @@ "\n", "qeff_model.compile(\n", " num_cores=14,\n", - " mxfp6=True,\n", - " device_group=[0],\n", - ")" + " mxfp6=True\n", + ")\n", + "\n", + "#the device_group arg is deprecated from compile and added in generate method" ] }, { @@ -117,7 +118,7 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "qeff_model.generate(prompts=[\"My name is\"])" + "qeff_model.generate(prompts=[\"My name is\"], device_group=[0])" ] } ], diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index d1a1f3c5f..beea985ad 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -93,9 +93,9 @@ "\n", "qeff_model.compile(\n", " num_cores=14,\n", - " mxfp6=True,\n", - " device_group=[0],\n", - ")" + " mxfp6=True\n", + ")\n", + "# the device_group arg is deprecated from compile and added in generate method" ] }, { @@ -116,7 +116,7 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "qeff_model.generate(prompts=[\"My name is\"])" + "qeff_model.generate(prompts=[\"My name is\"], device_group=[0])" ] } ], From e1f02e19d7e16eeafe4e027d1f9ee97515aa4f0e Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Tue, 5 Aug 2025 10:53:05 +0000 Subject: [PATCH 2/6] Fix formatting issues in notebooks Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 7 ++----- notebooks/QEfficientMPT.ipynb | 5 +---- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 726c41b39..7a54e8b47 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -92,12 +92,9 @@ "# Compile the model for provided compilation arguments\n", "# Please use platform SDK to Check num_cores for your card.\n", "\n", - "qeff_model.compile(\n", - " num_cores=14,\n", - " mxfp6=True\n", - ")\n", + "qeff_model.compile(num_cores=14, mxfp6=True)\n", "\n", - "#the device_group arg is deprecated from compile and added in generate method" + "# the device_group arg is deprecated from compile and added in generate method" ] }, { diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index beea985ad..7e8b51ede 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -91,10 +91,7 @@ "# Compile the model for provided compilation arguments\n", "# Please use platform SDK to Check num_cores for your card.\n", "\n", - "qeff_model.compile(\n", - " num_cores=14,\n", - " mxfp6=True\n", - ")\n", + 
"qeff_model.compile(num_cores=14, mxfp6=True)\n", "# the device_group arg is deprecated from compile and added in generate method" ] }, From 4311272d4524058ed2941bc3b1895b0d03d6b940 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Mon, 11 Aug 2025 05:23:37 +0000 Subject: [PATCH 3/6] notebooks updated Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 120 ++++++++++++++++++++++++++++----- notebooks/QEfficientMPT.ipynb | 30 ++++++--- 2 files changed, 126 insertions(+), 24 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 7a54e8b47..0d41211a0 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -27,14 +27,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "c21f82d5-17df-4fc9-a180-05edd032f02d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sharvari/qeff_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.Op.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.OnnxFunction.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gpt2 optimized for Cloud AI 100 \n", + " QEFFAutoModelForCausalLM\n", + "QEffGPT2LMHeadModel(\n", + " (transformer): QEffGPT2Model(\n", + " (wte): Embedding(50257, 768)\n", + " (wpe): Embedding(1024, 768)\n", + " (drop): Dropout(p=0.1, inplace=False)\n", + " (h): ModuleList(\n", + " (0-11): 12 x QEffGPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): QEffGPT2Attention(\n", + " (c_attn): Conv1D(nf=2304, nx=768)\n", + " (c_proj): Conv1D(nf=768, nx=768)\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D(nf=3072, nx=768)\n", + " (c_proj): Conv1D(nf=768, nx=3072)\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", + ")\n" + ] + } + ], "source": [ "# Initiate the Original Transformer model\n", "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", - "\n", + "from transformers import AutoTokenizer\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", @@ -58,10 +106,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "0b293196-ba44-460e-94fb-4378283bc196", "metadata": {}, - "outputs": [], + 
"outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/home/sharvari/.cache/qeff_models/GPT2LMHeadModel-d4ac0dba02c16a59/GPT2LMHeadModel.onnx')" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# We can now export the modified models to ONNX framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n", @@ -84,17 +143,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "3fb4d6dd-9973-4608-b68b-ec6825cfef0e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/home/sharvari/.cache/qeff_models/GPT2LMHeadModel-d4ac0dba02c16a59/qpc-46bd7fd6377ab8fb/qpc')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Compile the model for provided compilation arguments\n", "# Please use platform SDK to Check num_cores for your card.\n", "\n", - "qeff_model.compile(num_cores=14, mxfp6=True)\n", - "\n", - "# the device_group arg is deprecated from compile and added in generate method" + "qeff_model.compile(num_cores=14, mxfp6_matmul=True)" ] }, { @@ -107,21 +175,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4711fc74-aa5d-4e20-af0e-0d461d2e19bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'AutoTokenizer' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_name)\n\u001b[1;32m 4\u001b[0m qeff_model\u001b[38;5;241m.\u001b[39mgenerate(prompts\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMy name is\u001b[39m\u001b[38;5;124m\"\u001b[39m], tokenizer\u001b[38;5;241m=\u001b[39mtokenizer)\n", + "\u001b[0;31mNameError\u001b[0m: name 'AutoTokenizer' is not defined" + ] + } + ], "source": [ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "\n", - "qeff_model.generate(prompts=[\"My name is\"], device_group=[0])" + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bab713e", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "qeff_env", "language": "python", "name": "python3" }, @@ -135,7 +223,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.19" + "version": "3.10.12" } }, "nbformat": 4, diff --git 
a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 7e8b51ede..cc23a2bac 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -29,12 +29,26 @@ "execution_count": null, "id": "c21f82d5-17df-4fc9-a180-05edd032f02d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sharvari/qeff_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.Op.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.OnnxFunction.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n", + "Fetching 2 files: 0%| | 0/2 [00:00 Date: Mon, 11 Aug 2025 05:34:31 +0000 Subject: [PATCH 4/6] format fixed in notebooks Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 3 ++- notebooks/QEfficientMPT.ipynb | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 0d41211a0..2b8047ab3 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -195,7 +195,8 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" + "qeff_model.generate(prompts=[\"My name is\"], \n", + " tokenizer=tokenizer)" ] }, { diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index cc23a2bac..9ccbe7205 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -127,7 +127,8 @@ "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" + "qeff_model.generate(prompts=[\"My name is\"], \n", + " tokenizer=tokenizer)" ] } ], From 094b42b28499048786c51c05180fc009975a4ee3 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Mon, 11 Aug 2025 05:36:30 +0000 Subject: [PATCH 5/6] format fixed in notebooks Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 3 +-- notebooks/QEfficientMPT.ipynb | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 2b8047ab3..0d41211a0 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -195,8 +195,7 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", 
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "qeff_model.generate(prompts=[\"My name is\"], \n", - " tokenizer=tokenizer)" + "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" ] }, { diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 9ccbe7205..cc23a2bac 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -127,8 +127,7 @@ "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "qeff_model.generate(prompts=[\"My name is\"], \n", - " tokenizer=tokenizer)" + "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" ] } ], From 9b0d44db264043fe0f783885dea696a118081777 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Mon, 11 Aug 2025 05:40:34 +0000 Subject: [PATCH 6/6] format fixed in notebooks Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 2 ++ notebooks/QEfficientMPT.ipynb | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 0d41211a0..fcd544598 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -82,6 +82,8 @@ "source": [ "# Initiate the Original Transformer model\n", "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", + "\n", + "# Initiate the tokenizer for transformers library\n", "from transformers import AutoTokenizer\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index cc23a2bac..9fcb75ecf 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -46,8 +46,9 @@ ], "source": [ "# Initiate the Original Transformer model\n", - "\n", "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", + "\n", + "# Initiate the tokenizer for transformers library\n", "from transformers import AutoTokenizer\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",