Commit e66652a

Authored by WeiweiZhang1, pre-commit-ci[bot], and wenhuach21
rename llmcompressor to llm_compressor for align with other formats (#780)
* rename llmcompressor to llm_compressor for align with other formats
* [pre-commit.ci] auto fixes from pre-commit.com hooks (for more information, see https://pre-commit.ci)
* add log, refine doc
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* add act args for export config
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* fix line too long
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* refine packing device, refine nvfp logging
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* fixtypo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* fix comments
* fix typo
* [pre-commit.ci] auto fixes from pre-commit.com hooks
* fix import & log
* fix doctypo
* [pre-commit.ci] auto fixes from pre-commit.com hooks

Signed-off-by: Zhang, Weiwei1 <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Wenhua Cheng <[email protected]>
1 parent 7e014ca · commit e66652a

File tree

15 files changed, +91 −91 lines

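In user-facing terms, the change is the export format string: what was previously passed as "llmcompressor" is now "llm_compressor", matching the naming of the other export formats. A minimal usage sketch (model name and constructor arguments are illustrative and may differ from your installed auto-round version):

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from auto_round import AutoRound

    # Illustrative model; the llm_compressor exporter only accepts MXFP/NVFP/FP8
    # weight configurations (see the checks in autoround.py below).
    model_name = "facebook/opt-125m"
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    autoround = AutoRound(model, tokenizer, data_type="nv_fp4", bits=4, group_size=16)
    autoround.quantize()
    # "llm_compressor" replaces the old "llmcompressor" format name; for NVFP/MXFP
    # data types the exporter expands it to "llm_compressor:<data_type>".
    autoround.save_quantized("./opt-125m-nvfp4", format="llm_compressor")
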
auto_round/autoround.py

Lines changed: 21 additions & 13 deletions
@@ -691,7 +691,7 @@ def _check_configs(self) -> None:
         if self.gradient_accumulate_steps <= 0:
             raise ValueError("`gradient_accumulate_steps` must be positive")

-        if self.act_bits <= 8:
+        if self.act_bits <= 8 and (not is_nv_fp(self.act_data_type) or "static_gs" not in self.act_data_type):
             logger.warning(
                 "activation quantization is an experimental feature with limited support and a complex API. "
                 "And please save the quantized model to fake format as real deployment is not supported currently"
@@ -843,19 +843,21 @@ def _parse_format_to_list(self, format: str) -> list:
                     "for the current quantization configuration, "
                     "please change to `fake` format for research purpose"
                 )
-
                 formats[index] = format
-            elif format == "llmcompressor":
+            elif format == "llm_compressor":
                 from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported

                 if check_compressed_tensors_supported() and (is_nv_fp(self.data_type) or is_mx_fp(self.data_type)):
-                    format = format.replace("llmcompressor", f"llmcompressor:{self.data_type}")
+                    format = format.replace("llm_compressor", f"llm_compressor:{self.data_type}")
                     formats[index] = format
                 elif not is_wfp8afp8(self):
                     logger.error(
-                        "Currently, the llmcompressor format only supports MXFP/NVFP/FP8. "
+                        "Currently, the llm_compressor format only supports MXFP/NVFP/FP8. "
                         "Please change format to fake or auto_round etc."
                     )
+            else:
+                if (is_nv_fp(self.data_type) or is_mx_fp(self.data_type)) and format != "fake":
+                    logger.warning(f"nv_fp and mx_fp dtypes are not supported for export format: {format}")

         # Remove duplicates from formats list
         def remove_duplicates(lst):
@@ -887,8 +889,13 @@ def _check_supported_format(self, format: str) -> bool:
         # Only support to export afp8/nv_fp
         if self.act_bits <= 8:
             if not is_standard_fp(self.act_data_type) or self.act_dynamic:
-                if format == "llmcompressor":
-                    if is_nv_fp(self.act_data_type):
+                if "llm_compressor" in format:
+                    if is_nv_fp(self.act_data_type) and "static_gs" in self.act_data_type:
+                        logger.warning(
+                            f"AutoRound supports exporting to format '{format}', "
+                            "but loading quantized models in this format is not yet supported. "
+                            "It is currently recommended to export to the 'llm_compressor' format."
+                        )
                         return format
                 bits, group_size, sym, act_bits = 8, -1, True, 8
                 assert (
@@ -899,10 +906,11 @@ def _check_supported_format(self, format: str) -> bool:
                     and self.act_dynamic
                 ), (
                     f"Currently only support to export llmcompressor format for dynamic quantized"
-                    f" W{self.bits}A{self.act_bits} model, but got bits={self.bits},"
-                    f" group_size={self.group_size}, sym={self.sym}, act_bits={self.act_bits}"
+                    f" W{bits}Afp{act_bits} model, but got bits={self.bits}, data_type={self.data_type}"
+                    f" group_size={self.group_size}, sym={self.sym}"
+                    f", act_bits={self.act_bits}, act_data_type={self.act_data_type}"
                 )
-            elif format != "fake" and not is_nv_fp(format):
+            elif format != "fake" and (not is_nv_fp(format) or "static_gs" not in self.act_data_type):
                 logger.warning(
                     "Currently only support to export auto_round format quantized model"
                     " with fp8 or nv_fp4 dtype activation for activation quantization."
@@ -1652,7 +1660,7 @@ def quantize(self) -> tuple[torch.nn.Module, dict[str, Any]]:
                 or "gptq" in formats[0]
                 or "auto_round" in formats[0]
                 or "gguf" in formats[0]
-                or "llmcompressor" in formats[0]
+                or "llm_compressor" in formats[0]
             )
             and self.inplace
         ):
@@ -3017,8 +3025,8 @@ def save_quantized(
                 "Support for exporting activation quantization is limited. "
                 "Please ensure that your configuration is supported."
             )
-        if format == "llmcompressor" and (is_nv_fp(self.data_type) or is_mx_fp(self.data_type)):
-            format = format.replace("llmcompressor", f"llmcompressor:{self.data_type}")
+        if format == "llm_compressor" and (is_nv_fp(self.data_type) or is_mx_fp(self.data_type)):
+            format = format.replace("llm_compressor", f"llm_compressor:{self.data_type}")

         from auto_round.export import EXPORT_FORMAT

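The new llm_compressor branch above expands the bare format name into a data-type-qualified one when the weights use an NVFP or MXFP data type. A standalone illustration of just that string expansion (the data_type value is an example):

    data_type = "nv_fp4"  # example weight data type
    fmt = "llm_compressor"
    fmt = fmt.replace("llm_compressor", f"llm_compressor:{data_type}")
    print(fmt)  # -> "llm_compressor:nv_fp4"
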
auto_round/export/__init__.py

Lines changed: 2 additions & 2 deletions
@@ -78,14 +78,14 @@ def _packing_layer_with_autoawq(*args, **kwargs):
     return pack_layer(*args, **kwargs)


-@register_format("llmcompressor")
+@register_format("llm_compressor")
 def _save_quantized_as_llmcompressor(*args, **kwargs):
     from auto_round.export.export_to_llmcompressor.export import save_quantized_as_llmcompressor

     return save_quantized_as_llmcompressor(*args, **kwargs)


-@register_layer_packing("llmcompressor")
+@register_layer_packing("llm_compressor")
 def _packing_layer_with_llmcompressor(*args, **kwargs):
     from auto_round.export.export_to_llmcompressor.export import pack_layer

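Both decorators register the new "llm_compressor" key so that lookups by format name resolve to the correct exporter and packing function. The real register_format / register_layer_packing implementations are not part of this diff; the sketch below only shows the general shape of such a name-keyed registry and is hypothetical:

    # Hypothetical sketch; the actual decorators in auto_round.export may differ.
    _EXPORTERS: dict = {}

    def register_format(name):
        def decorator(func):
            _EXPORTERS[name] = func  # map the format name to its handler
            return func
        return decorator

    @register_format("llm_compressor")
    def _save_quantized_as_llmcompressor(*args, **kwargs):
        ...
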
auto_round/export/export_to_autoround/export.py

Lines changed: 2 additions & 8 deletions
@@ -25,6 +25,7 @@
 import transformers
 from tqdm import tqdm

+from auto_round.export.export_to_autoround.utils import REQUIRED_CONFIG_KEYS, check_neq_config
 from auto_round.utils import (
     SUPPORTED_FORMATS,
     SUPPORTED_LAYER_TYPES,
@@ -40,8 +41,6 @@
     set_module,
 )

-from .utils import check_neq_config
-

 def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_bits=16):
     """
@@ -313,12 +312,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
                 block_name_to_quantize is not None and check_start_with_block_name(layer_name, block_name_to_quantize)
             ):
                 neq_keys = check_neq_config(
-                    layer_config[layer_name],
-                    data_type=quantization_config["data_type"],
-                    bits=quantization_config["bits"],
-                    act_bits=quantization_config["act_bits"],
-                    group_size=quantization_config["group_size"],
-                    sym=quantization_config["sym"],
+                    layer_config[layer_name], **{k: quantization_config[k] for k in REQUIRED_CONFIG_KEYS}
                 )
                 if len(neq_keys) > 0:
                     extra_config[layer_name] = {}

auto_round/export/export_to_autoround/export_to_fp8.py

Lines changed: 5 additions & 18 deletions
@@ -23,8 +23,10 @@
 from tqdm import tqdm

 from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad
+from auto_round.export.export_to_autoround.utils import REQUIRED_CONFIG_KEYS, check_neq_config
 from auto_round.utils import (
     SUPPORTED_LAYER_TYPES,
+    _get_device,
     check_start_with_block_name,
     check_to_quantized,
     filter_quantization_config,
@@ -33,8 +35,6 @@
     set_module,
 )

-from .utils import check_neq_config
-

 class FP8WOQLinear(torch.nn.Module):

@@ -86,11 +86,7 @@ def pack_layer(layer_name, model, data_type, packing_device=None):
         None: The function modifies the model in place.
     """
     if packing_device is None:
-        packing_device = "cpu"
-        if torch.cuda.is_available():
-            packing_device = "cuda"
-        elif torch.xpu.is_available():
-            packing_device = "xpu"
+        packing_device = _get_device()
     layer = get_module(model, layer_name)
     if hasattr(layer, "orig_layer"):
         layer = layer.orig_layer
@@ -187,12 +183,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round",
                 block_name_to_quantize is not None and check_start_with_block_name(layer_name, block_name_to_quantize)
             ):
                 neq_keys = check_neq_config(
-                    layer_config[layer_name],
-                    data_type=quantization_config["data_type"],
-                    bits=quantization_config["bits"],
-                    act_bits=quantization_config["act_bits"],
-                    group_size=quantization_config["group_size"],
-                    sym=quantization_config["sym"],
+                    layer_config[layer_name], **{k: quantization_config[k] for k in REQUIRED_CONFIG_KEYS}
                 )
                 if len(neq_keys) > 0:
                     extra_config[layer_name] = {}
@@ -205,11 +196,7 @@
     max_workers = 1
     if not torch.cuda.is_available() and not torch.xpu.is_available():
         max_workers = 2  ## 2 with cuda packing will cause hang occasionally
-    packing_device = "cpu"
-    if torch.cuda.is_available():
-        packing_device = "cuda"
-    elif torch.xpu.is_available():
-        packing_device = "xpu"
+    packing_device = _get_device()
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         with tqdm(total=len(names), leave=True) as pbar:

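The repeated cpu/cuda/xpu selection removed above is replaced by the shared _get_device helper imported from auto_round.utils. Its implementation is not shown in this diff; based on the inline logic it replaces, it presumably behaves roughly like this sketch:

    import torch

    def _get_device() -> str:
        # Reconstruction of the removed inline logic; the real helper in
        # auto_round.utils may differ (e.g. return "cuda:0" or a torch.device).
        if torch.cuda.is_available():
            return "cuda"
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            return "xpu"
        return "cpu"
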
auto_round/export/export_to_autoround/export_to_nvfp_mxfp.py

Lines changed: 2 additions & 7 deletions
@@ -24,6 +24,7 @@
 import transformers
 from tqdm import tqdm

+from auto_round.export.export_to_autoround.utils import REQUIRED_CONFIG_KEYS, check_neq_config
 from auto_round.utils import (
     SUPPORTED_LAYER_TYPES,
     check_start_with_block_name,
@@ -39,7 +40,6 @@
 from auto_round.wrapper import WrapperWALayer

 from .qlinear_fp import QuantLinear
-from .utils import check_neq_config

 __all__ = [
     "pack_layer",
@@ -203,12 +203,7 @@ def save_quantized_as_fp(output_dir, inplace=True, **kwargs):
                 block_name_to_quantize is not None and check_start_with_block_name(layer_name, block_name_to_quantize)
             ):
                 neq_keys = check_neq_config(
-                    layer_config[layer_name],
-                    data_type=quantization_config["data_type"],
-                    bits=quantization_config["bits"],
-                    act_bits=quantization_config["act_bits"],
-                    group_size=quantization_config["group_size"],
-                    sym=quantization_config["sym"],
+                    layer_config[layer_name], **{k: quantization_config[k] for k in REQUIRED_CONFIG_KEYS}
                 )
                 if len(neq_keys) > 0:
                     extra_config[layer_name] = {}

auto_round/export/export_to_autoround/qlinear_fp.py

Lines changed: 8 additions & 11 deletions
@@ -38,7 +38,7 @@
 from auto_round.data_type.mxfp import FP32_EXPONENT_BIAS, FP32_MIN_NORMAL
 from auto_round.data_type.nvfp import cast_to_fp4, get_reciprocal
 from auto_round.data_type.utils import reshape_pad_tensor_by_group_size, revert_tensor_by_pad
-from auto_round.utils import is_mx_fp, is_nv_fp
+from auto_round.utils import _get_device, is_mx_fp, is_nv_fp

 # from auto_round.utils import get_weight_compress_dtype
 logger = getLogger(__name__)
@@ -141,15 +141,11 @@ def post_init(self):
         pass

     def pack(self, linear, scales, zeros=None, g_idx=None, global_scale=None, input_global_scale=None):
-        if linear.bias is not None:
-            self.bias = linear.bias.clone().half()
-        device = "cpu"
-        if torch.cuda.is_available():
-            device = "cuda:0"
-        elif torch.xpu.is_available():
-            device = "xpu:0"
-
-        W = linear.weight.data.to(device).clone()
+        device = _get_device()
+        if getattr(linear, "bias", None) is not None:
+            self.bias = linear.bias.detach().to(torch.float16)
+
+        W = linear.weight.data.detach().to(device)
         if isinstance(linear, nn.Conv2d):
             W = W.flatten(1)
         if isinstance(linear, transformers.pytorch_utils.Conv1D):
@@ -163,7 +159,8 @@ def pack(self, linear, scales, zeros=None, g_idx=None, global_scale=None, input_
             scaled_tensor = tensor.to(global_scale.dtype) * get_reciprocal(
                 scales.reshape(tensor.shape[0], -1) * get_reciprocal(global_scale)
             )
-            scaled_tensor = cast_to_fp4(torch.clamp(scaled_tensor, -6.0, 6.0))
+            scaled_tensor.clamp_(-6.0, 6.0)
+            scaled_tensor = cast_to_fp4(scaled_tensor)
         else:
             scaled_tensor = tensor / (2 ** scales.reshape(tensor.shape[0], -1))
         scaled_tensor = revert_tensor_by_pad(scaled_tensor, orig_shape=orig_shape, pad_len=pad_len)

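The reworked pack() scales each weight group by its per-group scale folded with the reciprocal of the NVFP4 global scale, clamps in place to the FP4 (E2M1) representable range of ±6, and only then casts. A rough standalone illustration of those steps with plain torch ops (get_reciprocal and cast_to_fp4 are auto_round helpers and are only approximated here):

    import torch

    def safe_reciprocal(x, eps=1e-12):
        # stand-in for auto_round's get_reciprocal: guarded 1/x
        return torch.where(x.abs() > eps, 1.0 / x, torch.zeros_like(x))

    weight = torch.randn(4, 16)          # toy weight, one row per output channel
    scales = torch.rand(4, 1) + 0.5      # toy per-group scales
    global_scale = torch.tensor(2.0)     # toy NVFP4 global scale

    scaled = weight * safe_reciprocal(scales * safe_reciprocal(global_scale))
    scaled.clamp_(-6.0, 6.0)             # FP4 E2M1 max magnitude is 6
    # cast_to_fp4(scaled) would follow in the real pack() path
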
auto_round/export/export_to_autoround/qlinear_triton_act.py

Lines changed: 3 additions & 3 deletions
@@ -41,6 +41,8 @@
 import torch.nn as nn
 import transformers

+from auto_round.utils import _get_device
+
 logger = getLogger(__name__)


@@ -117,16 +119,14 @@ def post_init(self):
         pass

     def pack(self, linear, scales, zeros, act_scales, w_bf16_to_fp8_scale, g_idx=None):
+        device = _get_device()
         scales_t = scales.t().contiguous()

         self.act_scales.data.copy_(act_scales.squeeze().clone())
         self.w_bf16_to_fp8_scale.data.copy_(w_bf16_to_fp8_scale.squeeze().clone())
         if linear.bias is not None:
             self.bias = linear.bias.clone().half()
         self.scales = scales_t.clone().half()
-        device = "cpu"
-        if torch.cuda.is_available():
-            device = "cuda:0"

         W = linear.weight.data.to(device).clone()
         if isinstance(linear, nn.Conv2d):

auto_round/export/export_to_autoround/utils.py

Lines changed: 28 additions & 18 deletions
@@ -12,26 +12,36 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+REQUIRED_CONFIG_KEYS = (
+    "data_type",
+    "bits",
+    "group_size",
+    "sym",
+    "act_bits",
+    "act_data_type",
+    "act_group_size",
+    "act_sym",
+    "act_dynamic",
+)

-def check_neq_config(config, data_type, bits, act_bits, group_size, sym):
-    """
-    Checks if the provided configuration parameters are not equal to the values in the config dictionary.

-    Args:
-        config (dict): A dictionary containing the configuration parameters.
-        data_type (str): The expected data type.
-        bits (int): The expected number of bits.
-        group_size (int): The expected group size.
-        sym (bool): The expected symmetry flag.
+def check_neq_config(config: dict, **expected) -> dict[str, tuple]:
+    """
+    Compare a config dict against expected values.
+    Ensures all required keys are present in both config and expected.

     Returns:
-        list: A list of strings indicating which configuration parameters do not match.
+        dict[str, tuple]: {key: (actual, expected)} for mismatched values.
     """
-    expected_config = {
-        "data_type": data_type,
-        "bits": bits,
-        "group_size": group_size,
-        "sym": sym,
-        "act_bits": act_bits,
-    }
-    return [key for key, expected_value in expected_config.items() if config.get(key) != expected_value]
+    # 1. Check missing from expected
+    missing_expected = [k for k in REQUIRED_CONFIG_KEYS if k not in expected]
+    if missing_expected:
+        raise ValueError(f"Missing expected values for keys: {missing_expected}")
+
+    # 2. Check missing from layer config
+    missing_config = [k for k in REQUIRED_CONFIG_KEYS if k not in config]
+    if missing_config:
+        raise ValueError(f"Missing config values for keys: {missing_config}")
+
+    # 3. Collect mismatches
+    return {key: (config[key], expected[key]) for key in REQUIRED_CONFIG_KEYS if config[key] != expected[key]}

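With the new signature, callers pass the expected values as keyword arguments, typically by expanding the quantization config over REQUIRED_CONFIG_KEYS as the export modules above now do. A small usage sketch with made-up values:

    quantization_config = {
        "data_type": "int", "bits": 4, "group_size": 128, "sym": True,
        "act_bits": 16, "act_data_type": "int", "act_group_size": 128,
        "act_sym": True, "act_dynamic": True,
    }
    layer_cfg = dict(quantization_config, bits=8)  # this layer deviates only in bits

    neq_keys = check_neq_config(layer_cfg, **{k: quantization_config[k] for k in REQUIRED_CONFIG_KEYS})
    print(neq_keys)  # -> {'bits': (8, 4)}
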
auto_round/export/export_to_llmcompressor/export.py

Lines changed: 2 additions & 2 deletions
@@ -60,8 +60,8 @@ def pack_layer(layer_name, model, backend):

         return pack_layer(layer_name, model, backend)

-    ## passed as no other llmcompressor format is supported yet
-    logger.warning("No other llmcompressor packing format(except NVFP&MXFP) is supported yet, skip packing")
+    ## passed as no other llm_compressor format is supported yet
+    logger.warning("No other llm_compressor packing format(except NVFP&MXFP) is supported yet, skip packing")
     return


auto_round/script/llm.py

Lines changed: 1 addition & 1 deletion
@@ -511,7 +511,7 @@ def tune(args):
             "auto_round" not in format
             and "fake" not in format
             and "awq" not in format
-            and "llmcompressor" not in format
+            and "llm_compressor" not in format
         ):
             # TODO gptq could support some mixed precision config
             logger.warning(f"mixed precision exporting does not support {format} currently")

0 commit comments
