11
11
12
12
from QEfficient .base .common import QEFFCommonLoader
13
13
from QEfficient .utils import check_and_assign_cache_dir
14
+ from QEfficient .utils .custom_yaml import generate_custom_io
14
15
from QEfficient .utils .logging_utils import logger
15
16
16
- from .custom_yaml import generate_custom_io
17
-
18
17
# Specifically for Docker images.
19
18
ROOT_DIR = os .path .dirname (os .path .abspath ("" ))
20
19
21
20
22
- def get_onnx_model_path (
21
+ def get_onnx_path_and_setup_customIO (
23
22
model_name : str ,
24
23
cache_dir : Optional [str ] = None ,
25
24
hf_token : Optional [str ] = None ,
26
25
full_batch_size : Optional [int ] = None ,
27
26
local_model_dir : Optional [str ] = None ,
27
+ mxint8_kv_cache : Optional [int ] = False ,
28
28
):
29
29
"""
30
- exports the model to onnx if pre-exported file is not found and returns onnx_model_path
30
+ exports the model to onnx if pre-exported file is not found, returns onnx_model_path, and generates the custom_io file.
31
31
32
32
``Mandatory`` Args:
33
33
:model_name (str): Hugging Face Model Card name, Example: ``gpt2``.
@@ -47,9 +47,11 @@ def get_onnx_model_path(
47
47
full_batch_size = full_batch_size ,
48
48
local_model_dir = local_model_dir ,
49
49
)
50
- generate_custom_io (qeff_model , cache_dir = "." , mxint8_kv_cache = False )
51
50
onnx_model_path = qeff_model .export ()
52
51
logger .info (f"Generated onnx_path: { onnx_model_path } " )
52
+
53
+ # Generate the Custom IO file used during compilation.
54
+ generate_custom_io (qeff_model , mxint8_kv_cache = mxint8_kv_cache )
53
55
return onnx_model_path
54
56
55
57
@@ -59,6 +61,7 @@ def main(
59
61
hf_token : Optional [str ] = None ,
60
62
local_model_dir : Optional [str ] = None ,
61
63
full_batch_size : Optional [int ] = None ,
64
+ mxint8_kv_cache : Optional [bool ] = False ,
62
65
) -> None :
63
66
"""
64
67
Helper function used by export CLI app for exporting to ONNX Model.
@@ -71,19 +74,20 @@ def main(
71
74
:hf_token (str): HuggingFace login token to access private repos. ``Defaults to None.``
72
75
:local_model_dir (str): Path to custom model weights and config files. ``Defaults to None.``
73
76
:full_batch_size (int): Set full batch size to enable continuous batching mode. ``Defaults to None.``
74
-
77
+ :mxint8_kv_cache (bool): Whether to compress the Present/Past KV cache to MXINT8 via a CustomIO config. ``Defaults to False.``
75
78
.. code-block:: bash
76
79
77
80
python -m QEfficient.cloud.export OPTIONS
78
81
79
82
"""
80
83
cache_dir = check_and_assign_cache_dir (local_model_dir , cache_dir )
81
- get_onnx_model_path (
84
+ get_onnx_path_and_setup_customIO (
82
85
model_name = model_name ,
83
86
cache_dir = cache_dir ,
84
87
hf_token = hf_token ,
85
88
full_batch_size = full_batch_size ,
86
89
local_model_dir = local_model_dir ,
90
+ mxint8_kv_cache = mxint8_kv_cache ,
87
91
)
88
92
89
93
@@ -109,5 +113,11 @@ def main(
109
113
default = None ,
110
114
help = "Set full batch size to enable continuous batching mode, default is None" ,
111
115
)
116
+ parser .add_argument (
117
+ "--mxint8_kv_cache" ,
118
+ "--mxint8-kv-cache" ,
119
+ required = False ,
120
+ help = "Compress Present/Past KV to MXINT8 using CustomIO config, default is False" ,
121
+ )
112
122
args = parser .parse_args ()
113
123
main (** args .__dict__ )
0 commit comments