intel · ssounda1 · May 2, 2024 · May 14, 2024 · May 24, 2024 · Jun 3, 2024
diff --git a/script/HFModelsInference/.gitignore b/script/HFModelsInference/.gitignore
@@ -0,0 +1,3 @@
+results*\*\**
+cache
+
diff --git a/script/HFModelsInference/RunHFModelsIntelNPU.ipynb b/script/HFModelsInference/RunHFModelsIntelNPU.ipynb
@@ -0,0 +1,361 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Simple app to download models from Hugging Face and use the intel_npu_acceleration_library for running Inference on Intel NPU\n",
+    "\n",
+    "## Pre-Requisites\n",
+    "\n",
+    "Make sure you have a clean conda environment to begin with. <br>Python version should be 3.10<br>\n",
+    "You can create a conda env with conda create -n intel-npu python=3.10 <br><br>\n",
+    "Install VC++ redist from https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist <br>\n",
+    "\n",
+    "## Note\n",
+    "If the notebook fails to execute on the first run, please restart the kernel and try again. Some dependent python packages may require kernel restart.\n",
+    "To set the build env inside conda env, use <br>\n",
+    "\"c:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\VsDevCmd.bat\" <br>\n",
+    "Then create a new conda environment with python=3.10. Activate it and build the library from source using - <br>\n",
+    "pip install \"intel-npu-acceleration-library @ git+https://github.com/intel/intel-npu-acceleration-library.git\"\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import platform\n",
+    "import importlib\n",
+    "import time\n",
+    "from timeit import default_timer as timer\n",
+    "import subprocess\n",
+    "\n",
+    "assert 'Intel' in platform.processor(), \"Only INTEL processors are supported\"\n",
+    "!python -m pip install -r requirements.txt -q\n",
+    "\n",
+    "# Import NPU acceleration library\n",
+    "import intel_npu_acceleration_library\n",
+    "import intel_npu_acceleration_library.backend\n",
+    "\n",
+    "import os\n",
+    "import openvino\n",
+    "import openvino.torch\n",
+    "import numpy as np\n",
+    "from subprocess import Popen\n",
+    "from PIL import Image\n",
+    "import shutil\n",
+    "import evaluate\n",
+    "import toml\n",
+    "import json\n",
+    "import torchaudio\n",
+    "from huggingface_hub import login\n",
+    "import requests\n",
+    "import validators\n",
+    "\n",
+    "gleu = evaluate.load('google_bleu')\n",
+    "\n",
+    "#Clear cache left over from a previous run\n",
+    "if os.path.exists(\"./cache\"):\n",
+    "    shutil.rmtree(\"./cache\")\n",
+    "\n",
+    "#Setup the test config\n",
+    "#Read the TOML config to get the tokenizer and model ID\n",
+    "with open('TestConfigNPU.toml', 'r') as f:\n",
+    "    testconfig = toml.load(f)\n",
+    "run_config = testconfig[\"Configuration\"]\n",
+    "\n",
+    "#Set the number of iterations here\n",
+    "numiter = run_config[\"num-iter\"]\n",
+    "\n",
+    "# Flag to enable additional run on CPU\n",
+    "run_additional_cpu = run_config[\"run_additional_cpu\"]\n",
+    "\n",
+    "#Question and answers\n",
+    "input_text_vqa = run_config[\"input_text_vqa\"]\n",
+    "ref_answer_vqa = run_config[\"ref_answer_vqa\"]\n",
+    "input_text_ttot = run_config[\"input_text_ttot\"]\n",
+    "ref_answer_ttot = run_config[\"ref_answer_ttot\"]\n",
+    "ref_answer_aud = run_config[\"ref_answer_aud\"]\n",
+    "\n",
+    "# Seconds to wait for the media server before giving up on a download\n",
+    "DOWNLOAD_TIMEOUT_S = 60\n",
+    "\n",
+    "def _fetch_if_url(source, local_path):\n",
+    "    \"\"\"Return a local file path for `source`.\n",
+    "\n",
+    "    When `source` is a URL it is downloaded to `local_path` (overwriting any\n",
+    "    existing file) and `local_path` is returned; otherwise `source` is returned\n",
+    "    unchanged. Raises requests.HTTPError on a non-2xx response.\n",
+    "    \"\"\"\n",
+    "    if not validators.url(source):\n",
+    "        return source\n",
+    "    with requests.get(source, stream=True, timeout=DOWNLOAD_TIMEOUT_S) as r:\n",
+    "        # Fail loudly instead of saving an HTML error page as the media file\n",
+    "        r.raise_for_status()\n",
+    "        # r.raw bypasses content decoding unless asked; undo gzip/deflate before copying\n",
+    "        r.raw.decode_content = True\n",
+    "        with open(local_path, 'wb') as f:\n",
+    "            shutil.copyfileobj(r.raw, f)\n",
+    "    return local_path\n",
+    "\n",
+    "#Input image file\n",
+    "input_img_file = run_config[\"input_img_file\"]\n",
+    "assert \".jpg\" in input_img_file, \"Only jpg files are supproted\"\n",
+    "input_img_file = _fetch_if_url(input_img_file, \"./test.jpg\")\n",
+    "\n",
+    "#Input Audio file\n",
+    "input_aud_file = run_config[\"input_aud_file\"]\n",
+    "assert \".wav\" in input_aud_file, \"Only wav files are supproted\"\n",
+    "input_aud_file = _fetch_if_url(input_aud_file, \"./test.wav\")\n",
+    "\n",
+    "#Clean previous results flag\n",
+    "clean_prev_results = run_config[\"clean_prev_results\"]\n",
+    "\n",
+    "# Flag to enable Pytorch profiling\n",
+    "enable_pytorch_profiling = run_config[\"enable_pytorch_profiling\"]\n",
+    "\n",
+    "if enable_pytorch_profiling:\n",
+    "    from torch.profiler import profile, record_function, ProfilerActivity\n",
+    "\n",
+    "#HF Access Token\n",
+    "# Prefer the TOML value; fall back to the HF_TOKEN environment variable so the\n",
+    "# secret does not have to be stored in a file that may get committed\n",
+    "hfaccesstoken = run_config.get(\"HFaccessToken\") or os.environ.get(\"HF_TOKEN\")\n",
+    "assert hfaccesstoken, \"Hugging Face token is not set. Please set it in TestConfig TOML\"\n",
+    "login(token=hfaccesstoken)\n",
+    "\n",
+    "# Import NPU acceleration library\n",
+    "import intel_npu_acceleration_library\n",
+    "\n",
+    "#Get NPU driver version\n",
+    "# powershell.exe documents its switches with a single dash (-NoProfile, -ExecutionPolicy).\n",
+    "# The whole pipeline is handed over as one -Command string so PowerShell parses it.\n",
+    "npu_driver_query = \"Get-WmiObject Win32_PnPSignedDriver | select devicename, driverversion | Select-String -Pattern 'Intel(R) AI Boost' -CaseSensitive -SimpleMatch\"\n",
+    "try:\n",
+    "    npu_driver_version = subprocess.run(\n",
+    "        [\n",
+    "            \"powershell.exe\",\n",
+    "            \"-NoProfile\",\n",
+    "            \"-ExecutionPolicy\", \"Bypass\",\n",
+    "            \"-Command\", npu_driver_query\n",
+    "        ],\n",
+    "        capture_output=True,\n",
+    "        text=True\n",
+    "    )\n",
+    "    npu_driver_version_string = npu_driver_version.stdout\n",
+    "except OSError:\n",
+    "    # powershell.exe could not be started (e.g. not on PATH); the driver version is\n",
+    "    # informational only, so record it as empty instead of aborting the run\n",
+    "    npu_driver_version = None\n",
+    "    npu_driver_version_string = \"\"\n",
+    "#Book-Keeping\n",
+    "results = {\n",
+    "    \"NPU-Version\": npu_driver_version_string\n",
+    "}\n",
+    "inf_time_list = []\n",
+    "pytorch_accuracy_list = []\n",
+    "run_time = time.strftime(\"%Y%m%d-%H%M%S\")\n",
+    "results[\"runtimestamp\"] = run_time\n",
+    "results_folder = './results'\n",
+    "if clean_prev_results and os.path.exists(results_folder):\n",
+    "    shutil.rmtree(results_folder)\n",
+    "# Per-run output folder; makedirs also creates the parent ./results when missing\n",
+    "results_folder = './results/' + 'results-' + str(run_time) +\"/\"\n",
+    "os.makedirs(results_folder, exist_ok=True)\n",
+    "\n",
+    "#Clear the cache dir and contents\n",
+    "if os.path.exists('./cache'):\n",
+    "    shutil.rmtree(\"./cache\")\n",
+    "\n",
+    "#Results dumper\n",
+    "def results_dumper(results):\n",
+    "    \"\"\"Write `results` to the run's JSON file and a latency/accuracy summary to the matching CSV.\n",
+    "\n",
+    "    Both files are created in `results_folder` and named results-<run_time>.\n",
+    "    `results` maps a key either to a per-model dict or to a plain metadata value\n",
+    "    (\"runtimestamp\", \"NPU-Version\"). Only per-model dicts become CSV rows; a\n",
+    "    column that a model dict does not contain is written as \"N/A\".\n",
+    "    \"\"\"\n",
+    "    import csv  # local import: only this function needs it\n",
+    "\n",
+    "    output_stem = results_folder + 'results-' + str(run_time)\n",
+    "\n",
+    "    #Dump Results to File\n",
+    "    with open(output_stem + '.json', 'w') as f:\n",
+    "        json.dump(results, f, indent=4)\n",
+    "\n",
+    "    # Latency Results to CSV\n",
+    "    csv_column_headers = [ \"XPU\",\n",
+    "                        \"Model\",\n",
+    "                        \"Pytorch First Inference Latency(ms)\",\n",
+    "                        \"PytorchInf(ms)(Avg over \" + str(numiter - 1) + \" iterations)\",\n",
+    "                        \"Pytorch Accuracy Score(%)(Avg over \" + str(numiter) + \" iterations)\",\n",
+    "                        ]\n",
+    "    csv_data = [csv_column_headers]\n",
+    "    for model_result in results.values():\n",
+    "        # Metadata values are plain strings, not model dicts; testing `header in <str>`\n",
+    "        # on them is a substring check and used to emit a bogus all-\"N/A\" row\n",
+    "        if not isinstance(model_result, dict):\n",
+    "            continue\n",
+    "        csv_data.append([model_result.get(header, \"N/A\") for header in csv_column_headers])\n",
+    "\n",
+    "    #Dump Results CSV to File\n",
+    "    # csv.writer stringifies and quotes each field; newline='' lets it control line endings\n",
+    "    with open(output_stem + '.csv', 'w', newline='') as f:\n",
+    "        csv.writer(f).writerows(csv_data)\n",
+    "\n",
+    "\n",
+    "#Main inference function\n",
+    "def run_inference(tokenizer_type, model_id, model_gen, inputs_types, useNPUAccelerationLibrary, input_q, ref_answer, import_library, attn_impl, target_dtype):\n",
+    "    \"\"\"Load `model_id`, run `numiter` generate() iterations and record latency and GLEU accuracy.\n",
+    "\n",
+    "    Args:\n",
+    "        tokenizer_type: name of the tokenizer/processor class looked up in `transformers`.\n",
+    "        model_id: identifier passed to from_pretrained for both model and tokenizer.\n",
+    "        model_gen: name of the model class looked up in `import_library`.\n",
+    "        inputs_types: selects the input pipeline; containing \"pixel_values\", \"image\" or\n",
+    "            \"audio\" picks that pipeline, anything else is treated as text.\n",
+    "        useNPUAccelerationLibrary: when true the model is compiled with\n",
+    "            intel_npu_acceleration_library and the run is labelled \"NPU\", else \"CPU\".\n",
+    "        input_q: prompt text (replaced by \"N/A\" for audio inputs).\n",
+    "        ref_answer: reference text the decoded output is scored against.\n",
+    "        import_library: name of the module that provides `model_gen`.\n",
+    "        attn_impl: attention implementation forwarded to from_pretrained, or \"none\".\n",
+    "        target_dtype: name of a torch attribute (e.g. \"int8\") used as compile dtype.\n",
+    "\n",
+    "    The first iteration is reported on its own as first-inference latency; the\n",
+    "    remaining iterations are averaged. The summary is stored in\n",
+    "    results[model_id + \"-\" + xpu_type] and written out through results_dumper.\n",
+    "    \"\"\"\n",
+    "    torch_lib = importlib.import_module(\"torch\")\n",
+    "    target_dtype = getattr(torch_lib, target_dtype)\n",
+    "    transformerlib = importlib.import_module(\"transformers\")\n",
+    "    imported_lib = importlib.import_module(import_library)\n",
+    "    tokenattr = getattr(transformerlib, tokenizer_type)\n",
+    "    modelattr = getattr(imported_lib, model_gen)\n",
+    "    # attn_implementation is only forwarded for transformers models that ask for one\n",
+    "    load_kwargs = {\"trust_remote_code\": True}\n",
+    "    if import_library == \"transformers\" and attn_impl != \"none\":\n",
+    "        load_kwargs[\"attn_implementation\"] = attn_impl\n",
+    "    model = modelattr.from_pretrained(model_id, **load_kwargs)\n",
+    "    xpu_type = \"CPU\"\n",
+    "    first_latency = 0\n",
+    "\n",
+    "    if useNPUAccelerationLibrary:\n",
+    "        xpu_type = \"NPU\"\n",
+    "        model = intel_npu_acceleration_library.compile(model.eval(), dtype=target_dtype)\n",
+    "    tokenizer = tokenattr.from_pretrained(model_id, trust_remote_code=True)\n",
+    "\n",
+    "    def generate(*args, **kwargs):\n",
+    "        # Single place for the profiler/no-profiler split that every input type shares.\n",
+    "        # Returns (outputs, prof); prof is None when profiling is disabled.\n",
+    "        if not enable_pytorch_profiling:\n",
+    "            return model.generate(*args, **kwargs), None\n",
+    "        with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:\n",
+    "            with record_function(\"model_inference\"):\n",
+    "                outputs = model.generate(*args, **kwargs)\n",
+    "        return outputs, prof\n",
+    "\n",
+    "    first_output_str = \"\"\n",
+    "    # Per-call accumulators, so a failed earlier run can never leak samples into this one\n",
+    "    steady_latencies = []  # latency (ms) of iterations 2..numiter\n",
+    "    accuracies = []        # GLEU score of every iteration\n",
+    "    for i in range(1, numiter + 1):\n",
+    "        # The timed region covers input preparation as well as generate()\n",
+    "        start_time = timer()\n",
+    "        if \"pixel_values\" in inputs_types:\n",
+    "            image = Image.open(input_img_file).convert(\"RGB\")\n",
+    "            inputs = tokenizer(images=image, text=input_q, return_tensors=\"pt\").pixel_values\n",
+    "            outputs, prof = generate(inputs, max_new_tokens=50)\n",
+    "        elif \"image\" in inputs_types:\n",
+    "            image = Image.open(input_img_file).convert(\"RGB\")\n",
+    "            inputs = tokenizer(images=image, text=input_q, return_tensors=\"pt\")\n",
+    "            outputs, prof = generate(**inputs)\n",
+    "        elif \"audio\" in inputs_types:\n",
+    "            input_q = \"N/A\"\n",
+    "            inputs, sampling_rate = torchaudio.load(input_aud_file)\n",
+    "            # Resample to the rate the feature extractor was configured with\n",
+    "            waveform = torchaudio.functional.resample(inputs, orig_freq=sampling_rate, new_freq=tokenizer.feature_extractor.sampling_rate).squeeze().numpy()\n",
+    "            sampling_rate = tokenizer.feature_extractor.sampling_rate\n",
+    "            inputs = tokenizer(waveform, sampling_rate=sampling_rate, return_tensors=\"pt\")\n",
+    "            outputs, prof = generate(inputs.input_features)\n",
+    "        else:\n",
+    "            inputs = tokenizer(input_q, return_tensors=\"pt\")\n",
+    "            outputs, prof = generate(**inputs)\n",
+    "\n",
+    "        latency_metric = (timer()-start_time) * 1000\n",
+    "\n",
+    "        if prof is not None:\n",
+    "            proftracefile = results_folder + model_id.replace(\"/\", \"-\") + \"-prof-trace-\" + str(i) + \"-\" + xpu_type\n",
+    "            with open(proftracefile + \".txt\", \"w\") as f:\n",
+    "                f.write(prof.key_averages(group_by_input_shape=True).table(sort_by=\"cpu_time_total\", row_limit=10))\n",
+    "            prof.export_chrome_trace(proftracefile + \"-trace.json\")\n",
+    "\n",
+    "        batch_outputs = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
+    "        output_str = str(batch_outputs)\n",
+    "        # Drop the echoed prompt. An empty prompt is skipped: \"\" is contained in every\n",
+    "        # string and str.split(\"\") raises ValueError\n",
+    "        if input_q and input_q in output_str:\n",
+    "            output_str = output_str.split(input_q)[1]\n",
+    "        if len(output_str) > 0:\n",
+    "            gleu_score = gleu.compute(predictions=[output_str], references=[ref_answer])\n",
+    "        else:\n",
+    "            gleu_score = {'google_bleu': 0, 'status': 'Output string is empty'}\n",
+    "        accuracies.append(round(float(gleu_score['google_bleu']), 2))\n",
+    "        if i == 1:\n",
+    "            first_output_str = output_str\n",
+    "            first_latency = latency_metric\n",
+    "        else:\n",
+    "            steady_latencies.append(latency_metric)\n",
+    "    # End of iterations loop\n",
+    "\n",
+    "    # With num-iter == 1 there are no steady-state samples; np.average([]) would\n",
+    "    # return nan and emit a RuntimeWarning, so report \"N/A\" instead\n",
+    "    inf_time = str(round(float(np.average(steady_latencies)), 2)) if steady_latencies else \"N/A\"\n",
+    "    # Round after scaling to percent so float artefacts (12.340000000000002) are not reported\n",
+    "    acc_avg_pct = str(round(float(np.average(accuracies)) * 100, 2)) if accuracies else \"N/A\"\n",
+    "\n",
+    "    results[model_id + \"-\" + xpu_type]={\n",
+    "        \"XPU\": xpu_type,\n",
+    "        \"Model\": model_id,\n",
+    "        \"Precision\": str(target_dtype),\n",
+    "        \"ATTN Impl\": attn_impl,\n",
+    "        \"Pytorch First Inference Latency(ms)\": str(round(first_latency,2)),\n",
+    "        \"Pytorch Raw inf (ms)\": \",\".join(str(round(inf,2)) for inf in steady_latencies),\n",
+    "        \"PytorchInf(ms)(Avg over \" + str(numiter - 1) + \" iterations)\": inf_time,\n",
+    "        \"Pytorch Raw Accuracy (%)\": \",\".join(str(round(acc * 100, 2)) for acc in accuracies),\n",
+    "        \"Pytorch Accuracy Score(%)(Avg over \" + str(numiter) + \" iterations)\": acc_avg_pct,\n",
+    "        \"Input-Question\": input_q,\n",
+    "        \"First iteration Response\": first_output_str,\n",
+    "        \"Ref Answer\": ref_answer\n",
+    "    }\n",
+    "    # Dump intermediate results so a crash in a later model does not lose this one\n",
+    "    results_dumper(results)\n",
+    "\n",
+    "\n",
+    "#Main Function\n",
+    "for model_blob in testconfig:\n",
+    "    entry = testconfig[model_blob]\n",
+    "    # Only tables that describe a model (they carry a \"tokenizer\" key) are run;\n",
+    "    # the isinstance check keeps a scalar top-level value from raising TypeError\n",
+    "    if not (isinstance(entry, dict) and \"tokenizer\" in entry):\n",
+    "        continue\n",
+    "    tokenizer_type = entry[\"tokenizer\"]\n",
+    "    model_id = entry[\"id\"]\n",
+    "    model_gen = entry[\"modelgen\"]\n",
+    "    inputs_types = entry.get(\"inputs\", \"text\")\n",
+    "    useNPUAccelerationLibrary = entry.get(\"useNPUAccelerationLibrary\", False)\n",
+    "    import_library = entry.get(\"import_library\", \"transformers\")\n",
+    "    attn_impl = entry.get(\"attn_impl\", \"none\")\n",
+    "    target_dtype = entry.get(\"dtype\", \"int8\")\n",
+    "    # Pick prompt and reference by modality (audio wins over image, as before).\n",
+    "    # Per-entry overrides stay in loop-local names: assigning them back to\n",
+    "    # ref_answer_vqa / ref_answer_aud would replace the configured defaults\n",
+    "    # for every model that follows\n",
+    "    if \"audio\" in inputs_types:\n",
+    "        input_q = entry.get(\"input_q_aud\", input_text_ttot)\n",
+    "        ref_answer = entry.get(\"ref_answer_aud\", ref_answer_aud)\n",
+    "    elif \"image\" in inputs_types or \"pixel_values\" in inputs_types:\n",
+    "        input_q = entry.get(\"input_q_vqa\", input_text_vqa)\n",
+    "        ref_answer = entry.get(\"ref_answer_vqa\", ref_answer_vqa)\n",
+    "    else:\n",
+    "        input_q = entry.get(\"input_q\", input_text_ttot)\n",
+    "        ref_answer = entry.get(\"ref_answer\", ref_answer_ttot)\n",
+    "    print(f\"Model inference started for {model_id}\")\n",
+    "    run_inference(tokenizer_type, model_id, model_gen, inputs_types, useNPUAccelerationLibrary, input_q, ref_answer, import_library, attn_impl, target_dtype)\n",
+    "    intel_npu_acceleration_library.backend.runtime._model_cache={} #Clear cache after every iteration\n",
+    "    # Logical `and`: bitwise & gives wrong answers for non-bool config values (1 & 2 == 0)\n",
+    "    if useNPUAccelerationLibrary and run_additional_cpu:\n",
+    "        #Run inference on CPU also for comparison\n",
+    "        run_inference(tokenizer_type, model_id, model_gen, inputs_types, False, input_q, ref_answer, import_library, attn_impl, target_dtype)\n",
+    "    print(\"Model Inference complete\")\n",
+    "\n",
+    "#Clear cache\n",
+    "if os.path.exists(\"./cache\"):\n",
+    "    shutil.rmtree(\"./cache\")\n",
+    "\n",
+    "#Dump Final results to file\n",
+    "results_dumper(results)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}