
Bug fixed: changes made in jupyter notebooks #532


Open · wants to merge 6 commits into base: main
122 changes: 105 additions & 17 deletions notebooks/QEfficientGPT2.ipynb
@@ -1,8 +1,8 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8a341fa4-b4dc-4cea-a4b3-249aa5fc9394",

Check failure on line 5 in notebooks/QEfficientGPT2.ipynb (GitHub Actions / lint, Ruff I001):
notebooks/QEfficientGPT2.ipynb:1:1: I001 Import block is un-sorted or un-formatted
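
For reference, an import block that would satisfy this I001 check could look like the sketch below. The grouping assumes Ruff's default isort conventions (standard library, then third-party, then first-party) and that QEfficient is configured as first-party in this repo; the exact order depends on the project's Ruff settings.

    # Sketch: imports grouped and sorted per isort-style rules (Ruff I001).
    import os  # needed if the commented-out TRANSFORMERS_CACHE line is enabled

    from transformers import AutoTokenizer

    from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM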
"metadata": {},
"source": [
"### Demonstrate the LLM GPT2 Model OnBoarding on Cloud AI 100 Platform"
@@ -27,14 +27,64 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "c21f82d5-17df-4fc9-a180-05edd032f02d",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sharvari/qeff_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.Op.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n",
" param_schemas = callee.param_schemas()\n",
"/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.OnnxFunction.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n",
" param_schemas = callee.param_schemas()\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"gpt2 optimized for Cloud AI 100 \n",
" QEFFAutoModelForCausalLM\n",
"QEffGPT2LMHeadModel(\n",
" (transformer): QEffGPT2Model(\n",
" (wte): Embedding(50257, 768)\n",
" (wpe): Embedding(1024, 768)\n",
" (drop): Dropout(p=0.1, inplace=False)\n",
" (h): ModuleList(\n",
" (0-11): 12 x QEffGPT2Block(\n",
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (attn): QEffGPT2Attention(\n",
" (c_attn): Conv1D(nf=2304, nx=768)\n",
" (c_proj): Conv1D(nf=768, nx=768)\n",
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" (mlp): GPT2MLP(\n",
" (c_fc): Conv1D(nf=3072, nx=768)\n",
" (c_proj): Conv1D(nf=768, nx=3072)\n",
" (act): NewGELUActivation()\n",
" (dropout): Dropout(p=0.1, inplace=False)\n",
" )\n",
" )\n",
" )\n",
" (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
" )\n",
" (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n",
")\n"
]
}
],
"source": [
"# Initiate the Original Transformer model\n",
"from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n",
"\n",
"# Initiate the tokenizer for transformers library\n",
"from transformers import AutoTokenizer\n",
"# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
"# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",
"\n",
@@ -58,10 +108,21 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "0b293196-ba44-460e-94fb-4378283bc196",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('/home/sharvari/.cache/qeff_models/GPT2LMHeadModel-d4ac0dba02c16a59/GPT2LMHeadModel.onnx')"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# We can now export the modified models to ONNX framework\n",
"# This will generate single Onnx Model for both Prefill and Decode Variations which are optimized for\n",
@@ -84,19 +145,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "3fb4d6dd-9973-4608-b68b-ec6825cfef0e",
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('/home/sharvari/.cache/qeff_models/GPT2LMHeadModel-d4ac0dba02c16a59/qpc-46bd7fd6377ab8fb/qpc')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Compile the model for provided compilation arguments\n",
"# Please use platform SDK to Check num_cores for your card.\n",
"\n",
"qeff_model.compile(\n",
" num_cores=14,\n",
" mxfp6=True,\n",
" device_group=[0],\n",
")"
"qeff_model.compile(num_cores=14, mxfp6_matmul=True)"
]
},
{
@@ -109,21 +177,41 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "4711fc74-aa5d-4e20-af0e-0d461d2e19bb",
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "NameError",
"evalue": "name 'AutoTokenizer' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\u001b[39;00m\n\u001b[1;32m 2\u001b[0m \u001b[38;5;66;03m# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\u001b[39;00m\n\u001b[0;32m----> 3\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m \u001b[43mAutoTokenizer\u001b[49m\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_name)\n\u001b[1;32m 4\u001b[0m qeff_model\u001b[38;5;241m.\u001b[39mgenerate(prompts\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mMy name is\u001b[39m\u001b[38;5;124m\"\u001b[39m], tokenizer\u001b[38;5;241m=\u001b[39mtokenizer)\n",
"\u001b[0;31mNameError\u001b[0m: name 'AutoTokenizer' is not defined"
]
}
],
"source": [
"# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n",
"# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
"\n",
"qeff_model.generate(prompts=[\"My name is\"])"
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1bab713e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "qeff_env",
"language": "python",
"name": "python3"
},
@@ -137,7 +225,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.19"
"version": "3.10.12"
}
},
"nbformat": 4,
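Taken together, the updated GPT2 notebook cells amount to the flow sketched below. This is a minimal reconstruction based on the API as it appears in this diff (compile with mxfp6_matmul, generate with an explicit tokenizer); the model-loading and export calls are elided from the hunks above, so those two lines are assumptions following the usual QEfficient pattern.

    # Minimal sketch of the updated end-to-end flow; model_name "gpt2" is assumed.
    from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM
    from transformers import AutoTokenizer

    model_name = "gpt2"
    qeff_model = AutoModelForCausalLM.from_pretrained(model_name)

    # Export a single ONNX model covering both prefill and decode.
    qeff_model.export()

    # num_cores is card-specific; check it with the platform SDK.
    qeff_model.compile(num_cores=14, mxfp6_matmul=True)

    # The tokenizer is now created explicitly and passed to generate(),
    # which avoids the NameError shown in the committed cell output.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    qeff_model.generate(prompts=["My name is"], tokenizer=tokenizer)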
34 changes: 23 additions & 11 deletions notebooks/QEfficientMPT.ipynb
@@ -1,8 +1,8 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8a341fa4-b4dc-4cea-a4b3-249aa5fc9394",

Check failure on line 5 in notebooks/QEfficientMPT.ipynb (GitHub Actions / lint, Ruff I001):
notebooks/QEfficientMPT.ipynb:1:1: I001 Import block is un-sorted or un-formatted
"metadata": {},
"source": [
"### Demonstrate the LLM MPT Model OnBoarding on Cloud AI 100 Platform"
@@ -29,12 +29,27 @@
"execution_count": null,
"id": "c21f82d5-17df-4fc9-a180-05edd032f02d",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/sharvari/qeff_env/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n",
"/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.Op.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n",
" param_schemas = callee.param_schemas()\n",
"/home/sharvari/qeff_env/lib/python3.10/site-packages/onnxscript/converter.py:816: FutureWarning: 'onnxscript.values.OnnxFunction.param_schemas' is deprecated in version 0.1 and will be removed in the future. Please use '.op_signature' instead.\n",
" param_schemas = callee.param_schemas()\n",
"Fetching 2 files: 0%| | 0/2 [00:00<?, ?it/s]"
]
}
],
"source": [
"# Initiate the Original Transformer model\n",
"\n",
"from QEfficient import QEFFAutoModelForCausalLM as AutoModelForCausalLM\n",
"\n",
"# Initiate the tokenizer for transformers library\n",
"from transformers import AutoTokenizer\n",
"# Please uncomment and use appropriate Cache Directory for transformers, in case you don't want to use default ~/.cache dir.\n",
"# os.environ[\"TRANSFORMERS_CACHE\"] = \"/local/mnt/workspace/hf_cache\"\n",
"\n",
@@ -91,11 +106,7 @@
"# Compile the model for provided compilation arguments\n",
"# Please use platform SDK to Check num_cores for your card.\n",
"\n",
"qeff_model.compile(\n",
" num_cores=14,\n",
" mxfp6=True,\n",
" device_group=[0],\n",
")"
"qeff_model.compile(num_cores=14, mxfp6_matmul=True)"
]
},
{
@@ -116,15 +127,16 @@
"# post compilation, we can print the latency stats for the kv models, We provide API to print token and Latency stats on Cloud AI 100\n",
"# We need the compiled prefill and decode qpc to compute the token generated, This is based on Greedy Sampling Approach\n",
"\n",
"qeff_model.generate(prompts=[\"My name is\"])"
"tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
"qeff_model.generate(prompts=[\"My name is\"], tokenizer=tokenizer)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py38",
"display_name": "qeff_env",
"language": "python",
"name": "py38"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -136,7 +148,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.19"
"version": "3.10.12"
}
},
"nbformat": 4,
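Both notebooks keep a commented-out TRANSFORMERS_CACHE override that references os.environ without importing os. A self-contained version of that snippet, with an illustrative path, would be:

    # Sketch: override the transformers cache directory (path is illustrative).
    # Set this before loading any model or tokenizer so the location takes effect.
    import os
    os.environ["TRANSFORMERS_CACHE"] = "/local/mnt/workspace/hf_cache"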