From a8661cd751bca76f2f4338e6484ce8b6a2ef72c8 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Tue, 5 Aug 2025 10:26:01 +0000 Subject: [PATCH 1/6] Bug fixed: the device_group arg is deprecated from compile and added in generate method, changes made in jupyter notebooks Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 9 +++++---- notebooks/QEfficientMPT.ipynb | 8 ++++---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 74e8097bb..726c41b39 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -94,9 +94,10 @@ "\n", "qeff_model.compile(\n", " num_cores=14,\n", - " mxfp6=True,\n", - " device_group=[0],\n", - ")" + " mxfp6=True\n", + ")\n", + "\n", + "#the device_group arg is deprecated from compile and added in generate method" ] }, { @@ -117,7 +118,7 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "qeff_model.generate(prompts=[\"My name is\"])" + "qeff_model.generate(prompts=[\"My name is\"], device_group=[0])" ] } ], diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index d1a1f3c5f..beea985ad 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -93,9 +93,9 @@ "\n", "qeff_model.compile(\n", " num_cores=14,\n", - " mxfp6=True,\n", - " device_group=[0],\n", - ")" + " mxfp6=True\n", + ")\n", + "# the device_group arg is deprecated from compile and added in generate method" ] }, { @@ -116,7 +116,7 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", - "qeff_model.generate(prompts=[\"My name is\"])" + "qeff_model.generate(prompts=[\"My name is\"], device_group=[0])" ] } ], From e1f02e19d7e16eeafe4e027d1f9ee97515aa4f0e Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Tue, 5 Aug 2025 10:53:05 +0000 Subject: [PATCH 2/6] Fix formatting issues in notebooks Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 7 ++----- notebooks/QEfficientMPT.ipynb | 5 +---- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 726c41b39..7a54e8b47 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -92,12 +92,9 @@ "# Compile the model for provided compilation arguments\n", "# Please use platform SDK to Check num_cores for your card.\n", "\n", - "qeff_model.compile(\n", - " num_cores=14,\n", - " mxfp6=True\n", - ")\n", + "qeff_model.compile(num_cores=14, mxfp6=True)\n", "\n", - "#the device_group arg is deprecated from compile and added in generate method" + "# the device_group arg is deprecated from compile and added in generate method" ] }, { diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index beea985ad..7e8b51ede 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -91,10 +91,7 @@ "# Compile the model for provided compilation arguments\n", "# Please use platform SDK to Check num_cores for your card.\n", "\n", - "qeff_model.compile(\n", - " num_cores=14,\n", - " mxfp6=True\n", - ")\n", + 
"qeff_model.compile(num_cores=14, mxfp6=True)\n", "# the device_group arg is deprecated from compile and added in generate method" ] }, From 4311272d4524058ed2941bc3b1895b0d03d6b940 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Mon, 11 Aug 2025 05:23:37 +0000 Subject: [PATCH 3/6] notebooks updated Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 120 ++++++++++++++++++++++++++++----- notebooks/QEfficientMPT.ipynb | 30 ++++++--- 2 files changed, 126 insertions(+), 24 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 7a54e8b47..0d41211a0 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -27,14 +27,62 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "c21f82d5-17df-4fc9-a180-05edd032f02d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sharvari/qeff_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.Op.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.OnnxFunction.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gpt2 optimized for Cloud AI 100 \n", + " QEFFAutoModelForCausalLM\n", + "QEffGPT2LMHeadModel(\n", + " (transformer): QEffGPT2Model(\n", + " (wte): Embedding(50257, 768)\n", + " (wpe): Embedding(1024, 768)\n", + " (drop): Dropout(p=0.1, inplace=False)\n", + " (h): ModuleList(\n", + " (0-11): 12 x QEffGPT2Block(\n", + " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (attn): QEffGPT2Attention(\n", + " (c_attn): Conv1D(nf=2304, nx=768)\n", + " (c_proj): Conv1D(nf=768, nx=768)\n", + " (attn_dropout): Dropout(p=0.1, inplace=False)\n", + " (resid_dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " (mlp): GPT2MLP(\n", + " (c_fc): Conv1D(nf=3072, nx=768)\n", + " (c_proj): Conv1D(nf=768, nx=3072)\n", + " (act): NewGELUActivation()\n", + " (dropout): Dropout(p=0.1, inplace=False)\n", + " )\n", + " )\n", + " )\n", + " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n", + " )\n", + " (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n", + ")\n" + ] + } + ], "source": [ "# Initiate the Original Transformer model\n", "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", - "\n", + "from transformers import AutoTokenizer\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", "\n", @@ -58,10 +106,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "0b293196-ba44-460e-94fb-4378283bc196", "metadata": {}, - "outputs": [], + 
"outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/home/sharvari/.cache/qeff_models/GPT2LMHeadModel-d4ac0dba02c16a59/GPT2LMHeadModel.onnx')" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# We can now export the modified models to ONNX framework\n", "# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n", @@ -84,17 +143,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "3fb4d6dd-9973-4608-b68b-ec6825cfef0e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('/home/sharvari/.cache/qeff_models/GPT2LMHeadModel-d4ac0dba02c16a59/qpc-46bd7fd6377ab8fb/qpc')" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "# Compile the model for provided compilation arguments\n", "# Please use platform SDK to Check num_cores for your card.\n", "\n", - "qeff_model.compile(num_cores=14, mxfp6=True)\n", - "\n", - "# the device_group arg is deprecated from compile and added in generate method" + "qeff_model.compile(num_cores=14, mxfp6_matmul=True)" ] }, { @@ -107,21 +175,41 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "4711fc74-aa5d-4e20-af0e-0d461d2e19bb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'AutoTokenizer' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_name)\n\u001b[1;32m 4\u001b[0m qeff_model\u001b[38;5;241m.\u001b[39mgenerate(prompts\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMy name is\u001b[39m\u001b[38;5;124m\"\u001b[39m], tokenizer\u001b[38;5;241m=\u001b[39mtokenizer)\n", + "\u001b[0;31mNameError\u001b[0m: name 'AutoTokenizer' is not defined" + ] + } + ], "source": [ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", - "\n", - "qeff_model.generate(prompts=[\"My name is\"], device_group=[0])" + "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", + "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bab713e", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "qeff_env", "language": "python", "name": "python3" }, @@ -135,7 +223,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.19" + "version": "3.10.12" } }, "nbformat": 4, diff --git 
a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 7e8b51ede..cc23a2bac 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -29,12 +29,26 @@ "execution_count": null, "id": "c21f82d5-17df-4fc9-a180-05edd032f02d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/sharvari/qeff_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.Op.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n", + "/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.OnnxFunction.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n", + " param_schemas = callee.param_schemas()\n", + "Fetching 2 files: 0%| | 0/2 [00:00 Date: Mon, 11 Aug 2025 05:34:31 +0000 Subject: [PATCH 4/6] format fixed in notebooks Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 3 ++- notebooks/QEfficientMPT.ipynb | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 0d41211a0..2b8047ab3 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -195,7 +195,8 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" + "qeff_model.generate(prompts=[\"My name is\"], \n", + " tokenizer=tokenizer)" ] }, { diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index cc23a2bac..9ccbe7205 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -127,7 +127,8 @@ "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" + "qeff_model.generate(prompts=[\"My name is\"], \n", + " tokenizer=tokenizer)" ] } ], From 094b42b28499048786c51c05180fc009975a4ee3 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Mon, 11 Aug 2025 05:36:30 +0000 Subject: [PATCH 5/6] format fixed in notebooks Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 3 +-- notebooks/QEfficientMPT.ipynb | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 2b8047ab3..0d41211a0 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -195,8 +195,7 @@ "# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n", "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", 
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "qeff_model.generate(prompts=[\"My name is\"], \n", - " tokenizer=tokenizer)" + "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" ] }, { diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index 9ccbe7205..cc23a2bac 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -127,8 +127,7 @@ "# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", - "qeff_model.generate(prompts=[\"My name is\"], \n", - " tokenizer=tokenizer)" + "qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)" ] } ], From 9b0d44db264043fe0f783885dea696a118081777 Mon Sep 17 00:00:00 2001 From: Sharvari Medhe Date: Mon, 11 Aug 2025 05:40:34 +0000 Subject: [PATCH 6/6] format fixed in notebooks Signed-off-by: Sharvari Medhe --- notebooks/QEfficientGPT2.ipynb | 2 ++ notebooks/QEfficientMPT.ipynb | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/notebooks/QEfficientGPT2.ipynb b/notebooks/QEfficientGPT2.ipynb index 0d41211a0..fcd544598 100644 --- a/notebooks/QEfficientGPT2.ipynb +++ b/notebooks/QEfficientGPT2.ipynb @@ -82,6 +82,8 @@ "source": [ "# Initiate the Original Transformer model\n", "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", + "\n", + "# Initiate the tokenizer for transformers library\n", "from transformers import AutoTokenizer\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n", diff --git a/notebooks/QEfficientMPT.ipynb b/notebooks/QEfficientMPT.ipynb index cc23a2bac..9fcb75ecf 100644 --- a/notebooks/QEfficientMPT.ipynb +++ b/notebooks/QEfficientMPT.ipynb @@ -46,8 +46,9 @@ ], "source": [ "# Initiate the Original Transformer model\n", - "\n", "from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n", + "\n", + "# Initiate the tokenizer for transformers library\n", "from transformers import AutoTokenizer\n", "# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n", "# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",