Skip to content
This repository was archived by the owner on Apr 24, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions script/HFModelsInference/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
results*\*\**
cache

361 changes: 361 additions & 0 deletions script/HFModelsInference/RunHFModelsIntelNPU.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,361 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Simple app to download models from Hugging Face and use the intel_npu_acceleration_library for running Inference on Intel NPU\n",
"\n",
"## Pre-Requisites\n",
"\n",
"Make sure you have a clean conda environment to begin with. <br>Python version should be 3.10<br>\n",
"You can create a conda env with `conda create -n intel-npu python=3.10`. <br><br>\n",
"Install VC++ redist from https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist <br>\n",
"\n",
"## Note\n",
"If the notebook fails to execute on the first run, please restart the kernel and try again. Some dependent python packages may require kernel restart.\n",
"To set the build env inside conda env, use <br>\n",
"\"c:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\VsDevCmd.bat\" <br>\n",
"Then create a new conda environment with python=3.10. Activate it and build the library from source using: <br>\n",
"pip install \"intel-npu-acceleration-library @ git+https://github.com/intel/intel-npu-acceleration-library.git\"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import platform\n",
"import importlib\n",
"import time\n",
"from timeit import default_timer as timer\n",
"import subprocess\n",
"\n",
"# Fail fast on unsupported hardware before installing anything.\n",
"assert 'Intel' in platform.processor(), \"Only INTEL processors are supported\"\n",
"# %pip installs into the environment of the running kernel, whereas `!python -m pip`\n",
"# uses whichever `python` is first on PATH, which may be a different interpreter.\n",
"%pip install -r requirements.txt -q\n",
"\n",
"# Import NPU acceleration library\n",
"import intel_npu_acceleration_library\n",
"import intel_npu_acceleration_library.backend\n",
"\n",
"import os\n",
"import openvino\n",
"import openvino.torch\n",
"import numpy as np\n",
"from subprocess import Popen\n",
"from PIL import Image\n",
"import shutil\n",
"import evaluate\n",
"import toml\n",
"import json\n",
"import torchaudio\n",
"from huggingface_hub import login\n",
"import requests\n",
"import validators\n",
"\n",
"# Text-similarity metric used to score generated output against the reference answers\n",
"gleu = evaluate.load('google_bleu')\n",
"\n",
"# Remove any ./cache directory left over from an earlier run\n",
"if os.path.exists(\"./cache\"):\n",
"    shutil.rmtree(\"./cache\")\n",
"\n",
"# Load the test configuration (a TOML file) that also lists the models to run\n",
"with open('TestConfigNPU.toml', 'r') as f:\n",
"    testconfig = toml.load(f)\n",
"\n",
"# Number of inference iterations per model\n",
"numiter = testconfig[\"Configuration\"][\"num-iter\"]\n",
"\n",
"# When set, NPU models are run a second time without the NPU library for comparison\n",
"run_additional_cpu = testconfig[\"Configuration\"][\"run_additional_cpu\"]\n",
"\n",
"# Default questions and reference answers for the image (vqa), text (ttot) and audio (aud) cases\n",
"(input_text_vqa, ref_answer_vqa, input_text_ttot, ref_answer_ttot, ref_answer_aud) = (\n",
"    testconfig[\"Configuration\"][key]\n",
"    for key in (\"input_text_vqa\", \"ref_answer_vqa\", \"input_text_ttot\", \"ref_answer_ttot\", \"ref_answer_aud\")\n",
")\n",
"\n",
"def _download_to(url, destination):\n",
"    \"\"\"Stream `url` into the local file `destination`, raising on HTTP errors.\"\"\"\n",
"    with requests.get(url, stream=True, timeout=60) as response:\n",
"        # Fail loudly instead of saving an HTTP error page as the media file.\n",
"        response.raise_for_status()\n",
"        # Mode 'wb' truncates an existing file, so no prior os.remove is needed.\n",
"        with open(destination, 'wb') as f:\n",
"            # iter_content decodes gzip/deflate content encodings; copying response.raw does not.\n",
"            for chunk in response.iter_content(chunk_size=65536):\n",
"                f.write(chunk)\n",
"\n",
"#Input image file: a local path, or a URL that is downloaded to ./test.jpg\n",
"input_img_file = testconfig[\"Configuration\"][\"input_img_file\"]\n",
"assert \".jpg\" in input_img_file, \"Only jpg files are supported\"\n",
"if validators.url(input_img_file):\n",
"    _download_to(input_img_file, \"./test.jpg\")\n",
"    input_img_file = \"./test.jpg\"\n",
"\n",
"#Input audio file: a local path, or a URL that is downloaded to ./test.wav\n",
"input_aud_file = testconfig[\"Configuration\"][\"input_aud_file\"]\n",
"assert \".wav\" in input_aud_file, \"Only wav files are supported\"\n",
"if validators.url(input_aud_file):\n",
"    _download_to(input_aud_file, \"./test.wav\")\n",
"    input_aud_file = \"./test.wav\"\n",
"\n",
"# Behaviour flags from the configuration: wipe earlier results, and profile with PyTorch\n",
"clean_prev_results = testconfig[\"Configuration\"][\"clean_prev_results\"]\n",
"enable_pytorch_profiling = testconfig[\"Configuration\"][\"enable_pytorch_profiling\"]\n",
"\n",
"# The profiler names are only imported when profiling is switched on\n",
"if enable_pytorch_profiling:\n",
"    from torch.profiler import profile, record_function, ProfilerActivity\n",
"\n",
"# Log in to the Hugging Face Hub with the configured access token\n",
"hfaccesstoken = testconfig[\"Configuration\"][\"HFaccessToken\"]\n",
"assert hfaccesstoken, \"Hugging Face token is not set. Please set it in TestConfig TOML\"\n",
"login(token=hfaccesstoken)\n",
"\n",
"# NPU acceleration library (already imported at the top of the cell; repeated import is a no-op)\n",
"import intel_npu_acceleration_library\n",
"\n",
"#Get NPU driver version\n",
"# powershell.exe switches are documented with a single leading dash, and the whole\n",
"# pipeline is handed over as one explicit -Command string.\n",
"npu_driver_version = subprocess.run(\n",
"    [\n",
"        \"powershell.exe\",\n",
"        \"-NoProfile\",\n",
"        \"-ExecutionPolicy\", \"Bypass\",\n",
"        \"-Command\",\n",
"        \"Get-WmiObject Win32_PnPSignedDriver | select devicename, driverversion | Select-String -Pattern 'Intel(R) AI Boost' -CaseSensitive -SimpleMatch\"\n",
"    ],\n",
"    capture_output=True,\n",
"    text=True\n",
")\n",
"# Raw stdout of the query; empty when no matching driver line was printed.\n",
"npu_driver_version_string = npu_driver_version.stdout\n",
"# Book-keeping: timestamp of this run plus the dict that collects every result\n",
"run_time = time.strftime(\"%Y%m%d-%H%M%S\")\n",
"results = {\n",
"    \"NPU-Version\": npu_driver_version_string,\n",
"    \"runtimestamp\": run_time,\n",
"}\n",
"inf_time_list, pytorch_accuracy_list = [], []\n",
"\n",
"# Optionally wipe ./results, then make sure ./results and this run's sub-folder exist.\n",
"# After the loop results_folder names the per-run folder (with trailing slash).\n",
"results_folder = './results'\n",
"if clean_prev_results and os.path.exists(results_folder):\n",
"    shutil.rmtree(results_folder)\n",
"for results_folder in ('./results', f\"./results/results-{run_time}/\"):\n",
"    if not os.path.exists(results_folder):\n",
"        os.makedirs(results_folder)\n",
"\n",
"# Remove the ./cache directory and its contents if present\n",
"if os.path.exists('./cache'):\n",
"    shutil.rmtree(\"./cache\")\n",
"\n",
"#Results dumper\n",
"def results_dumper(results):\n",
"    \"\"\"Write `results` to a JSON file and a latency/accuracy summary CSV.\n",
"\n",
"    Both files go into the module-level `results_folder` and are named\n",
"    'results-' + run_time with a .json / .csv suffix, so repeated calls within\n",
"    one run overwrite the previous dump.\n",
"\n",
"    Args:\n",
"        results: dict of everything collected so far. Values that are dicts are\n",
"            treated as per-model entries and become CSV rows; any other value\n",
"            (such as the \"NPU-Version\" and \"runtimestamp\" strings) is written\n",
"            to the JSON file only.\n",
"    \"\"\"\n",
"    import csv  # local import: only this function needs it\n",
"\n",
"    output_stem = results_folder + 'results-' + str(run_time)\n",
"\n",
"    # Full dump, including metadata entries\n",
"    with open(output_stem + '.json', 'w') as f:\n",
"        json.dump(results, f, indent=4)\n",
"\n",
"    # Summary columns; the header text doubles as the key looked up in each model entry\n",
"    csv_column_headers = [\n",
"        \"XPU\",\n",
"        \"Model\",\n",
"        \"Pytorch First Inference Latency(ms)\",\n",
"        \"PytorchInf(ms)(Avg over \" + str(numiter - 1) + \" iterations)\",\n",
"        \"Pytorch Accuracy Score(%)(Avg over \" + str(numiter) + \" iterations)\",\n",
"    ]\n",
"    csv_rows = [csv_column_headers]\n",
"    for model_result in results.values():\n",
"        # A plain-string metadata value would be substring-matched against the\n",
"        # headers and show up as a meaningless all-\"N/A\" row, so skip non-dicts.\n",
"        if not isinstance(model_result, dict):\n",
"            continue\n",
"        csv_rows.append([model_result.get(header, \"N/A\") for header in csv_column_headers])\n",
"\n",
"    # csv.writer quotes fields that contain commas or newlines and accepts\n",
"    # non-string values; newline='' is what the csv module expects for its files.\n",
"    with open(output_stem + '.csv', 'w', newline='') as f:\n",
"        csv.writer(f).writerows(csv_rows)\n",
"\n",
"\n",
"#Main inference function\n",
"def run_inference(tokenizer_type, model_id, model_gen, inputs_types, useNPUAccelerationLibrary, input_q, ref_answer, import_library, attn_impl, target_dtype):\n",
"    \"\"\"Load one model, run `numiter` timed generations and record the outcome.\n",
"\n",
"    The entry is stored in the module-level `results` dict under\n",
"    model_id + \"-\" + XPU type, and the result files are rewritten through\n",
"    `results_dumper` before returning.\n",
"\n",
"    Args:\n",
"        tokenizer_type: name of the tokenizer/processor class looked up in `transformers`.\n",
"        model_id: identifier passed to `from_pretrained` for model and tokenizer.\n",
"        model_gen: name of the model class looked up in `import_library`.\n",
"        inputs_types: input kind; \"pixel_values\", \"image\" and \"audio\" select the\n",
"            matching preprocessing, anything else is handled as plain text.\n",
"        useNPUAccelerationLibrary: when true the model is compiled with\n",
"            intel_npu_acceleration_library and reported as \"NPU\", otherwise \"CPU\".\n",
"        input_q: prompt/question text (replaced by \"N/A\" for audio inputs).\n",
"        ref_answer: reference text the decoded output is scored against.\n",
"        import_library: name of the module that provides `model_gen`.\n",
"        attn_impl: attention implementation forwarded to `from_pretrained`, or \"none\".\n",
"        target_dtype: name of a `torch` attribute used as the compile dtype.\n",
"    \"\"\"\n",
"    torch_lib = importlib.import_module(\"torch\")\n",
"    target_dtype = getattr(torch_lib, target_dtype)\n",
"    transformerlib = importlib.import_module(\"transformers\")\n",
"    imported_lib = importlib.import_module(import_library)\n",
"    tokenattr = getattr(transformerlib, tokenizer_type)\n",
"    modelattr = getattr(imported_lib, model_gen)\n",
"\n",
"    # attn_implementation is only forwarded for transformers models that ask for one\n",
"    load_kwargs = {\"trust_remote_code\": True}\n",
"    if import_library == \"transformers\" and attn_impl != \"none\":\n",
"        load_kwargs[\"attn_implementation\"] = attn_impl\n",
"    model = modelattr.from_pretrained(model_id, **load_kwargs)\n",
"\n",
"    xpu_type = \"CPU\"\n",
"    if useNPUAccelerationLibrary:\n",
"        xpu_type = \"NPU\"\n",
"        model = intel_npu_acceleration_library.compile(model.eval(), dtype=target_dtype)\n",
"    tokenizer = tokenattr.from_pretrained(model_id, trust_remote_code=True)\n",
"\n",
"    def _generate(*args, **kwargs):\n",
"        \"\"\"Run model.generate, under the PyTorch profiler when profiling is enabled.\n",
"\n",
"        Returns (outputs, profiler); profiler is None when profiling is off.\n",
"        \"\"\"\n",
"        if not enable_pytorch_profiling:\n",
"            return model.generate(*args, **kwargs), None\n",
"        with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof:\n",
"            with record_function(\"model_inference\"):\n",
"                generated = model.generate(*args, **kwargs)\n",
"        return generated, prof\n",
"\n",
"    # Per-call accumulators: values from an earlier call that raised part-way\n",
"    # through can no longer leak into this model's averages.\n",
"    latencies_ms = []\n",
"    accuracy_scores = []\n",
"    first_latency = 0\n",
"    first_output_str = \"\"\n",
"    for i in range(1, numiter + 1):\n",
"        # The timed region covers input preprocessing as well as generation\n",
"        start_time = timer()\n",
"        if \"pixel_values\" in inputs_types:\n",
"            with Image.open(input_img_file) as img:\n",
"                image = img.convert(\"RGB\")\n",
"            inputs = tokenizer(images=image, text=input_q, return_tensors=\"pt\").pixel_values\n",
"            outputs, prof = _generate(inputs, max_new_tokens=50)\n",
"        elif \"image\" in inputs_types:\n",
"            with Image.open(input_img_file) as img:\n",
"                image = img.convert(\"RGB\")\n",
"            inputs = tokenizer(images=image, text=input_q, return_tensors=\"pt\")\n",
"            outputs, prof = _generate(**inputs)\n",
"        elif \"audio\" in inputs_types:\n",
"            input_q = \"N/A\"\n",
"            inputs, sampling_rate = torchaudio.load(input_aud_file)\n",
"            # Resample to the rate the tokenizer's feature extractor reports\n",
"            sampling_rate_target = tokenizer.feature_extractor.sampling_rate\n",
"            waveform = torchaudio.functional.resample(inputs, orig_freq=sampling_rate, new_freq=sampling_rate_target).squeeze().numpy()\n",
"            inputs = tokenizer(waveform, sampling_rate=sampling_rate_target, return_tensors=\"pt\")\n",
"            outputs, prof = _generate(inputs.input_features)\n",
"        else:\n",
"            inputs = tokenizer(input_q, return_tensors=\"pt\")\n",
"            outputs, prof = _generate(**inputs)\n",
"\n",
"        latency_metric = (timer() - start_time) * 1000\n",
"\n",
"        if prof is not None:\n",
"            # One summary table and one Chrome trace per iteration\n",
"            proftracefile = results_folder + model_id.replace(\"/\", \"-\") + \"-prof-trace-\" + str(i) + \"-\" + xpu_type\n",
"            with open(proftracefile + \".txt\", \"w\") as f:\n",
"                f.write(prof.key_averages(group_by_input_shape=True).table(sort_by=\"cpu_time_total\", row_limit=10))\n",
"            prof.export_chrome_trace(proftracefile + \"-trace.json\")\n",
"\n",
"        output_str = str(tokenizer.decode(outputs[0], skip_special_tokens=True))\n",
"        # Drop the echoed prompt. An empty input_q is always \"in\" the output and\n",
"        # would make str.split raise ValueError, hence the extra truthiness check.\n",
"        if input_q and input_q in output_str:\n",
"            output_str = output_str.split(input_q)[1]\n",
"        if len(output_str) > 0:\n",
"            gleu_score = gleu.compute(predictions=[output_str], references=[ref_answer])\n",
"        else:\n",
"            gleu_score = {'google_bleu': 0, 'status': 'Output string is empty'}\n",
"        accuracy_scores.append(round(float(gleu_score['google_bleu']), 2))\n",
"        if i == 1:\n",
"            # The first iteration is reported separately and kept out of the average\n",
"            first_output_str = output_str\n",
"            first_latency = latency_metric\n",
"        else:\n",
"            latencies_ms.append(latency_metric)\n",
"    # End of iterations loop\n",
"\n",
"    # With a single iteration there is nothing to average; np.average of an empty\n",
"    # list would return nan and emit a RuntimeWarning.\n",
"    inf_time = str(round(np.average(latencies_ms), 2)) if latencies_ms else \"N/A\"\n",
"    acc_avg = round(np.average(accuracy_scores), 4) if accuracy_scores else 0.0\n",
"\n",
"    results[model_id + \"-\" + xpu_type] = {\n",
"        \"XPU\": xpu_type,\n",
"        \"Model\": model_id,\n",
"        \"Precision\": str(target_dtype),\n",
"        \"ATTN Impl\": attn_impl,\n",
"        \"Pytorch First Inference Latency(ms)\": str(round(first_latency, 2)),\n",
"        \"Pytorch Raw inf (ms)\": \",\".join(str(round(inf, 2)) for inf in latencies_ms),\n",
"        \"PytorchInf(ms)(Avg over \" + str(numiter - 1) + \" iterations)\": inf_time,\n",
"        \"Pytorch Raw Accuracy (%)\": \",\".join(str(round(acc * 100, 2)) for acc in accuracy_scores),\n",
"        # Rounded so the percentage carries no binary floating-point noise\n",
"        \"Pytorch Accuracy Score(%)(Avg over \" + str(numiter) + \" iterations)\": str(round(acc_avg * 100, 2)),\n",
"        \"Input-Question\": input_q,\n",
"        \"First iteration Response\": first_output_str,\n",
"        \"Ref Answer\": ref_answer\n",
"    }\n",
"    # Dump intermediate results so a later failure does not lose this model's numbers\n",
"    results_dumper(results)\n",
"\n",
"\n",
"#Main Function\n",
"for model_blob in testconfig:\n",
"    entry = testconfig[model_blob]\n",
"    # Only tables that declare a \"tokenizer\" describe a model to run\n",
"    if \"tokenizer\" not in entry:\n",
"        continue\n",
"\n",
"    tokenizer_type = entry[\"tokenizer\"]\n",
"    model_id = entry[\"id\"]\n",
"    model_gen = entry[\"modelgen\"]\n",
"    inputs_types = entry.get(\"inputs\", \"text\")\n",
"    useNPUAccelerationLibrary = entry.get(\"useNPUAccelerationLibrary\", False)\n",
"    import_library = entry.get(\"import_library\", \"transformers\")\n",
"    attn_impl = entry.get(\"attn_impl\", \"none\")\n",
"    target_dtype = entry.get(\"dtype\", \"int8\")\n",
"\n",
"    # Pick question and reference for the input kind (audio wins over image, as\n",
"    # before). Per-model overrides stay in loop-local names so they never replace\n",
"    # the global defaults that later models fall back to.\n",
"    if \"audio\" in inputs_types:\n",
"        input_q = entry.get(\"input_q_aud\", input_text_ttot)\n",
"        ref_answer = entry.get(\"ref_answer_aud\", ref_answer_aud)\n",
"    elif \"image\" in inputs_types or \"pixel_values\" in inputs_types:\n",
"        input_q = entry.get(\"input_q_vqa\", input_text_vqa)\n",
"        ref_answer = entry.get(\"ref_answer_vqa\", ref_answer_vqa)\n",
"    else:\n",
"        input_q = entry.get(\"input_q\", input_text_ttot)\n",
"        ref_answer = entry.get(\"ref_answer\", ref_answer_ttot)\n",
"\n",
"    print(f\"Model inference started for {model_id}\")\n",
"    run_inference(tokenizer_type, model_id, model_gen, inputs_types, useNPUAccelerationLibrary, input_q, ref_answer, import_library, attn_impl, target_dtype)\n",
"    intel_npu_acceleration_library.backend.runtime._model_cache={} #Clear cache after every iteration\n",
"    # Logical `and`: bitwise & on truthy non-bool values can evaluate to 0 (True & 2 == 0)\n",
"    if useNPUAccelerationLibrary and run_additional_cpu:\n",
"        #Run inference on CPU also for comparison\n",
"        run_inference(tokenizer_type, model_id, model_gen, inputs_types, False, input_q, ref_answer, import_library, attn_impl, target_dtype)\n",
"    print(\"Model Inference complete\")\n",
"\n",
"#Clear cache\n",
"if os.path.exists(\"./cache\"):\n",
"    shutil.rmtree(\"./cache\")\n",
"\n",
"#Dump Final results to file\n",
"results_dumper(results)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Loading