11
11
12
12
from QEfficient .base .common import QEFFCommonLoader
13
13
from QEfficient .utils import check_and_assign_cache_dir
14
+ from QEfficient .utils .custom_yaml import generate_custom_io
14
15
from QEfficient .utils .logging_utils import logger
15
16
16
- from .custom_yaml import generate_custom_io
17
-
18
17
# Specifically for Docker images.
19
18
ROOT_DIR = os .path .dirname (os .path .abspath ("" ))
20
19
21
20
22
- def get_onnx_model_path (
21
+ def get_onnx_path_and_setup_customIO (
23
22
model_name : str ,
24
23
cache_dir : Optional [str ] = None ,
25
24
hf_token : Optional [str ] = None ,
26
25
full_batch_size : Optional [int ] = None ,
27
26
local_model_dir : Optional [str ] = None ,
27
+ mxint8_kv_cache : Optional [int ] = False ,
28
28
):
29
29
"""
30
- exports the model to onnx if pre-exported file is not found and returns onnx_model_path
30
+ exports the model to onnx if pre-exported file is not found, returns onnx_model_path, and generates the custom_io file.
31
31
32
32
``Mandatory`` Args:
33
33
:model_name (str): Hugging Face Model Card name, Example: ``gpt2``.
@@ -47,9 +47,11 @@ def get_onnx_model_path(
47
47
full_batch_size = full_batch_size ,
48
48
local_model_dir = local_model_dir ,
49
49
)
50
- generate_custom_io (qeff_model , cache_dir = "." , mxint8_kv_cache = False )
51
50
onnx_model_path = qeff_model .export ()
52
51
logger .info (f"Generated onnx_path: { onnx_model_path } " )
52
+
53
+ # Generate the Custom IO file used during compilation.
54
+ generate_custom_io (qeff_model , mxint8_kv_cache = mxint8_kv_cache )
53
55
return onnx_model_path
54
56
55
57
@@ -59,6 +61,7 @@ def main(
59
61
hf_token : Optional [str ] = None ,
60
62
local_model_dir : Optional [str ] = None ,
61
63
full_batch_size : Optional [int ] = None ,
64
+ mxint8_kv_cache : Optional [bool ] = False ,
62
65
) -> None :
63
66
"""
64
67
Helper function used by export CLI app for exporting to ONNX Model.
@@ -71,19 +74,20 @@ def main(
71
74
:hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.``
72
75
:local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
73
76
:full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
74
-
77
+ :mxint8_kv_cache (bool): Whether to compress the Present/Past KV cache to MXINT8 via a CustomIO config. ``Defaults to False.``
75
78
.. code-block:: bash
76
79
77
80
python -m QEfficient.cloud.export OPTIONS
78
81
79
82
"""
80
83
cache_dir = check_and_assign_cache_dir (local_model_dir , cache_dir )
81
- get_onnx_model_path (
84
+ get_onnx_path_and_setup_customIO (
82
85
model_name = model_name ,
83
86
cache_dir = cache_dir ,
84
87
hf_token = hf_token ,
85
88
full_batch_size = full_batch_size ,
86
89
local_model_dir = local_model_dir ,
90
+ mxint8_kv_cache = mxint8_kv_cache ,
87
91
)
88
92
89
93
@@ -109,5 +113,11 @@ def main(
109
113
default = None ,
110
114
help = "Set full batch size to enable continuous batching mode, default is None" ,
111
115
)
116
+ parser .add_argument (
117
+ "--mxint8_kv_cache" ,
118
+ "--mxint8-kv-cache" ,
119
+ required = False ,
120
+ help = "Compress Present/Past KV to MXINT8 using CustomIO config, default is False" ,
121
+ )
112
122
args = parser .parse_args ()
113
123
main (** args .__dict__ )
0 commit comments